In [1]:
import operator, collections
import pandas as pd
import numpy as np
from functools import reduce
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from itertools import product

import warnings
warnings.filterwarnings("ignore")

In [9]:
# Seed reused wherever this notebook needs a random_state, so runs are repeatable.
seed = 42

# Both CSV files are read relative to the current working directory.
train, test = (pd.read_csv(path) for path in ('train.csv', 'test.csv'))

In [11]:
# Shapes of the target == 0 and target == 1 subsets of the training frame, in that order.
tuple(train.loc[train['target'] == label].shape for label in (0, 1))

((1000, 23), (1000, 23))

In [12]:
# Shapes of the target == 0 and target == 1 subsets of the test frame, in that order.
tuple(test.loc[test['target'] == label].shape for label in (0, 1))

((286, 23), (214, 23))

In [15]:
def opp_check(item, to_check, opp_sample, min_cardinality):
    """Count rows of ``opp_sample`` that agree with ``item`` on its overlap with ``to_check``.

    The overlap is the set of positions where ``item - to_check`` equals zero,
    i.e. where the two rows carry the same value.

    Parameters
    ----------
    item : pandas.Series
        Row being classified.
    to_check : pandas.Series
        Training row compared against ``item``.
    opp_sample : pandas.DataFrame
        Rows of the opposite class; its columns are selected with the
        overlap mask, so they are expected to line up with ``item``.
    min_cardinality : float
        Minimum share of positions that must overlap for the comparison
        to be considered at all.

    Returns
    -------
    None
        When the share of overlapping positions is below ``min_cardinality``.
    int
        Otherwise, the number of rows in ``opp_sample`` whose values equal
        those of ``item`` on every overlapping column.
    """
    agrees = (item - to_check) == 0
    overlap_share = agrees.sum() / agrees.count()
    if overlap_share < min_cardinality:
        return None

    # Restrict the opposite class to the overlapping columns and compare
    # each of its rows against the item's values on those columns.
    overlap_columns = opp_sample.columns[agrees]
    residual = opp_sample[overlap_columns] - item[agrees]
    row_matches = (residual == 0).all(axis = 1)
    return row_matches.sum()

def _lazyfca_count_votes(item, own_class, opp_class, max_opposition, min_cardinality):
    """Count the rows of ``own_class`` that cast a vote for ``item``.

    A row votes when ``opp_check`` does not reject it (overlap with ``item``
    is at least ``min_cardinality``) and the share of ``opp_class`` rows that
    also match that overlap is at most ``max_opposition``.
    """
    opp_size = len(opp_class)
    votes = 0
    for j in range(len(own_class)):
        matching_opponents = opp_check(item, own_class.iloc[j], opp_class, min_cardinality)
        if matching_opponents is None:
            continue
        # With no opposing rows there is nothing to contradict the overlap.
        opposition = matching_opponents / opp_size if opp_size else 0.0
        if opposition <= max_opposition:
            votes += 1
    return votes


def lazyfca(train, test, max_opposition, min_cardinality, return_report = False):
    """Classify ``test`` rows by voting among training rows and score the result.

    For every test row, each training row of a class votes for that class when
    its overlap with the test row is large enough (``min_cardinality``) and is
    matched by at most a ``max_opposition`` share of the other class. The
    class with the larger share of voting rows wins; ties go to class 1.

    Neither ``train`` nor ``test`` is modified: predictions are kept in a local
    list rather than written into a ``pred`` column of the caller's frame.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a binary ``target`` column (values 0 and 1); all other
        columns are used as features.
    max_opposition : float
        Largest tolerated share of opposite-class rows matching an overlap.
    min_cardinality : float
        Smallest accepted share of features in an overlap.
    return_report : bool, default False
        When false, return ``(accuracy, precision, recall)``; otherwise
        return the text produced by ``classification_report``.
    """
    c_plus = train[train['target'] == 1].drop(columns = ['target'])
    c_minus = train[train['target'] == 0].drop(columns = ['target'])
    test_features = test.drop(columns = ['target'])

    # Class sizes are row counts; len() avoids positional Series indexing.
    count_plus = len(c_plus)
    count_minus = len(c_minus)

    pred = []
    for i in range(len(test_features)):
        item = test_features.iloc[i]
        plus_votes_count = _lazyfca_count_votes(item, c_plus, c_minus, max_opposition, min_cardinality)
        minus_votes_count = _lazyfca_count_votes(item, c_minus, c_plus, max_opposition, min_cardinality)
        # An empty class contributes a zero share instead of dividing by zero.
        plus_votes_share = plus_votes_count / count_plus if count_plus else 0.0
        minus_votes_share = minus_votes_count / count_minus if count_minus else 0.0
        pred.append(1 if plus_votes_share >= minus_votes_share else 0)

    if not return_report:
        return accuracy_score(test['target'], pred), precision_score(test['target'], pred), recall_score(test['target'], pred)
    return classification_report(test['target'], pred)

In [16]:
def cross_validation_acc(data, k, model_score, params, random_state = None):
    """Average accuracy, precision and recall of ``model_score`` over ``k`` folds.

    The rows of ``data`` are shuffled once and cut into ``k`` consecutive
    folds (sizes differ by at most one row). Each fold is held out in turn
    while the remaining folds are concatenated into the training frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Full labelled data set.
    k : int
        Number of folds; must be at least 2.
    model_score : callable
        Called as ``model_score(train, held_out, **params)`` and expected to
        return an ``(accuracy, precision, recall)`` triple.
    params : dict
        Keyword arguments forwarded to ``model_score``.
    random_state : int, optional
        Seed for the shuffle. Defaults to the notebook-level ``seed``.

    Returns
    -------
    tuple
        ``(mean_accuracy, mean_precision, mean_recall)``.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 2 (no training folds would remain).
    """
    if k < 2:
        raise ValueError('cross_validation_acc needs k >= 2, got %r' % (k,))
    if random_state is None:
        random_state = seed

    shuffled = data.sample(frac = 1, random_state = random_state)
    # Split row positions rather than the frame itself, so the folds are
    # ordinary positional slices of the shuffled frame.
    fold_positions = np.array_split(np.arange(len(shuffled)), k)
    folds = [shuffled.iloc[positions] for positions in fold_positions]

    total_acc, total_pre, total_rec = 0, 0, 0
    for i, held_out in enumerate(folds):
        train = pd.concat(folds[:i] + folds[i+1:])
        # Hand over a copy: a scorer that adds columns to its test frame must
        # not alter the fold that later iterations reuse for training.
        a, p, r = model_score(train, held_out.copy(), **params)
        total_acc += a
        total_pre += p
        total_rec += r
    return total_acc / k, total_pre / k, total_rec / k

def grid_search(model, param_grid, data, num_folds, print_all_results = False):
    """Exhaustively cross-validate ``model`` over ``param_grid`` and print the winner.

    Parameters
    ----------
    model : callable
        Scoring function passed on to ``cross_validation_acc``.
    param_grid : dict
        Maps each parameter name to the list of values to try; every
        combination of values is evaluated.
    data : pandas.DataFrame
        Data set handed to ``cross_validation_acc``.
    num_folds : int
        Number of cross-validation folds.
    print_all_results : bool, default False
        When true, print parameters, accuracy, precision and recall for
        every combination, not only the best one.

    The best mean accuracy and the parameters that produced it are printed;
    nothing is returned. The first combination reaching the best accuracy is
    the one reported.
    """
    best_params = {}
    max_acc = 0
    found_any = False
    params_list = list(param_grid.keys())
    for combination in product(*(param_grid[name] for name in params_list)):
        # A fresh dict per combination, so a stored best result is never
        # overwritten by later iterations.
        params = dict(zip(params_list, combination))
        acc, pre, rec = cross_validation_acc(data, num_folds, model, params)
        # Always record the first combination, so a grid whose accuracies are
        # all zero still reports real parameters instead of an empty dict.
        if not found_any or acc > max_acc:
            best_params, max_acc, found_any = params, acc, True
        if print_all_results:
            print(params, acc, pre, rec)
    print('Best average accuracy during cross-validation:', max_acc)
    print('Best hyperparameters:', best_params)

In [24]:
def logit_application(train, test, C, return_report = False):
    """Fit logistic regression on ``train`` and score its predictions on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``target`` column; all other columns are features.
    C : float
        Passed to ``LogisticRegression`` as ``C``.
    return_report : bool, default False
        When false, return ``(accuracy, precision, recall)``; otherwise
        return the text produced by ``classification_report``.
    """
    logit = LogisticRegression(C = C, random_state = seed)
    logit.fit(train.drop(columns = ['target']), train['target'])
    pred = logit.predict(test.drop(columns = ['target']))

    if not return_report:
        # Score the predictions just computed; `test` has no 'pred' column of its own.
        return accuracy_score(test['target'], pred), precision_score(test['target'], pred), recall_score(test['target'], pred)
    return classification_report(test['target'], pred)

def rf_application(train, test, n_estimators, max_depth, return_report = False):
    """Fit a random forest on ``train`` and score its predictions on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``target`` column; all other columns are features.
    n_estimators, max_depth
        Passed to ``RandomForestClassifier`` under the same names.
    return_report : bool, default False
        When false, return ``(accuracy, precision, recall)``; otherwise
        return the text produced by ``classification_report``.
    """
    rf = RandomForestClassifier(n_estimators = n_estimators, max_depth = max_depth, random_state = seed)
    rf.fit(train.drop(columns = ['target']), train['target'])
    pred = rf.predict(test.drop(columns = ['target']))

    if not return_report:
        # Score the predictions just computed; `test` has no 'pred' column of its own.
        return accuracy_score(test['target'], pred), precision_score(test['target'], pred), recall_score(test['target'], pred)
    return classification_report(test['target'], pred)

def gbc_application(train, test, n_estimators, max_depth, return_report = False):
    """Fit gradient boosting on ``train`` and score its predictions on ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames with a ``target`` column; all other columns are features.
    n_estimators, max_depth
        Passed to ``GradientBoostingClassifier`` under the same names.
    return_report : bool, default False
        When false, return ``(accuracy, precision, recall)``; otherwise
        return the text produced by ``classification_report``.
    """
    gbc = GradientBoostingClassifier(n_estimators = n_estimators, max_depth = max_depth, random_state = seed)
    gbc.fit(train.drop(columns = ['target']), train['target'])
    pred = gbc.predict(test.drop(columns = ['target']))

    if not return_report:
        # Score the predictions just computed; `test` has no 'pred' column of its own.
        return accuracy_score(test['target'], pred), precision_score(test['target'], pred), recall_score(test['target'], pred)
    return classification_report(test['target'], pred)

In [23]:
# Tune logistic regression's C with 5-fold cross-validation on the training set.
grid_search(
    model = logit_application,
    param_grid = {'C': [0.1, 1, 10]},
    data = train,
    num_folds = 5,
)

Best average accuracy during cross-validation: 0.8869999999999999
Best hyperparameters: {'C': 1}


In [31]:
# Tune the random forest's size and depth with 5-fold cross-validation on the training set.
grid_search(
    model = rf_application,
    param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [3, 6, None]},
    data = train,
    num_folds = 5,
)

Best average accuracy during cross-validation: 0.8885
Best hyperparameters: {'n_estimators': 100, 'max_depth': 6}


In [32]:
# Tune gradient boosting's size and depth with 5-fold cross-validation on the training set.
grid_search(
    model = gbc_application,
    param_grid = {'n_estimators': [10, 50, 100], 'max_depth': [3, 6, None]},
    data = train,
    num_folds = 5,
)

Best average accuracy during cross-validation: 0.889
Best hyperparameters: {'n_estimators': 100, 'max_depth': 3}


In [20]:
# Tune lazyfca's two thresholds with 5-fold cross-validation, printing every combination's scores.
grid_search(
    model = lazyfca,
    param_grid = {'min_cardinality': [0.8, 0.9], 'max_opposition': [0, 0.01]},
    data = train,
    num_folds = 5,
    print_all_results = True,
)

{'min_cardinality': 0.8, 'max_opposition': 0} 0.8755000000000001 0.8751587181223197 0.8769460976155077
{'min_cardinality': 0.8, 'max_opposition': 0.01} 0.8779999999999999 0.8755166484707473 0.8820173473908849
{'min_cardinality': 0.9, 'max_opposition': 0} 0.7264999999999999 0.665135979660483 0.9321700373816799
{'min_cardinality': 0.9, 'max_opposition': 0.01} 0.7270000000000001 0.6666872657606395 0.929169291001623
Best average accuracy during cross-validation: 0.8779999999999999
Best hyperparameters: {'min_cardinality': 0.8, 'max_opposition': 0.01}


In [10]:
# Test-set evaluation. lazyfca takes `return_report`, not `accuracy_only`;
# `return_report = True` requests the classification report.
print(lazyfca(train, test, min_cardinality = 0.8, max_opposition = 0.01, return_report = True))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90       286
           1       0.85      0.90      0.87       214

    accuracy                           0.89       500
   macro avg       0.88      0.89      0.88       500
weighted avg       0.89      0.89      0.89       500



In [25]:
# Test-set evaluation. logit_application takes `return_report`, not `accuracy_only`;
# `return_report = True` requests the classification report.
print(logit_application(train, test, C = 1, return_report = True))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       286
           1       0.88      0.90      0.89       214

    accuracy                           0.91       500
   macro avg       0.90      0.91      0.90       500
weighted avg       0.91      0.91      0.91       500



In [26]:
# Test-set evaluation. rf_application takes `return_report`, not `accuracy_only`;
# `return_report = True` requests the classification report.
print(rf_application(train, test, n_estimators = 100, max_depth = 6, return_report = True))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91       286
           1       0.88      0.89      0.89       214

    accuracy                           0.90       500
   macro avg       0.90      0.90      0.90       500
weighted avg       0.90      0.90      0.90       500



In [27]:
# Test-set evaluation. gbc_application takes `return_report`, not `accuracy_only`;
# `return_report = True` requests the classification report.
print(gbc_application(train, test, n_estimators = 100, max_depth = 3, return_report = True))

              precision    recall  f1-score   support

           0       0.93      0.90      0.92       286
           1       0.87      0.91      0.89       214

    accuracy                           0.91       500
   macro avg       0.90      0.91      0.90       500
weighted avg       0.91      0.91      0.91       500

