In [1]:
import numpy as np
import pandas as pd

In [142]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

Подготовка данных

In [49]:
def prepare_data(filename):
    """Load a CSV file and return one-hot encoded features with +/-1 labels.

    The file must provide the feature columns ``V1`` .. ``V9`` and the label
    column ``V10`` holding the strings ``'positive'`` / ``'negative'``.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file, passed to ``pandas.read_csv``.

    Returns
    -------
    X : numpy.ndarray of bool
        Dense one-hot encoding of the columns ``V1`` .. ``V9``.
    y : numpy.ndarray of int64
        ``1`` for ``'positive'`` rows and ``-1`` for ``'negative'`` rows.

    Raises
    ------
    ValueError
        If ``V10`` contains anything other than ``'positive'``/``'negative'``.
    """
    X_df = pd.read_csv(filename)

    # Map the labels explicitly. The previous chained ``replace`` relied on
    # pandas silently downcasting an object column to integers, which is
    # deprecated, and it let unexpected labels through unchanged.
    labels = X_df['V10'].map({'positive': 1, 'negative': -1})
    unmapped = labels.isna()
    if unmapped.any():
        unknown = sorted(set(X_df.loc[unmapped, 'V10'].astype(str)))
        raise ValueError(
            "Unexpected label(s) in column 'V10' of {!r}: {}".format(filename, unknown))
    y = labels.astype('int64').values

    # NOTE(review): a fresh encoder is fitted on every call, so the column
    # layout depends on the categories present in this particular file.
    encoder = OneHotEncoder(dtype='bool')
    X = encoder.fit_transform(X_df[['V{}'.format(i) for i in range(1, 10)]]).toarray()
    return X, y

In [4]:
def context_split(X_train, y_train):
    """Split the training objects into a positive and a negative context.

    Returns the pair ``(positive, negative)``: the rows of ``X_train`` whose
    label in ``y_train`` equals ``1`` and the rows whose label equals ``-1``.
    """
    is_positive = y_train == 1
    is_negative = y_train == -1
    return X_train[is_positive], X_train[is_negative]

Признаки для объекта, используемые классификатором

In [5]:
def avg_intersect(sample, context):
    """Average number of attributes that ``sample`` shares with a context object.

    Parameters
    ----------
    sample : array of bool
        Attribute vector of the object being described.
    context : array of bool, one row per context object
        Objects the sample is intersected with.

    Returns
    -------
    float
        Total number of shared attributes divided by the number of context
        objects, or ``0.0`` when the context is empty.
    """
    n_objects = len(context)
    if n_objects == 0:
        # Without this guard the result is 0/0, i.e. NaN plus a RuntimeWarning,
        # and the NaN would be handed on as a feature value.
        return 0.0
    return np.sum(context & sample) / n_objects

In [6]:
def avg_clojure(sample, context, context_clojure):
    """Average support of the intersections of ``sample`` with ``context``.

    For every object of ``context`` the intersection with ``sample`` is taken
    and the objects of ``context_clojure`` containing that whole intersection
    are counted; the counts are then averaged over the objects of ``context``.

    Parameters
    ----------
    sample : array of bool
        Attribute vector of the object being described.
    context : array of bool, one row per object
        Objects the sample is intersected with.
    context_clojure : array of bool, one row per object
        Objects in which each intersection is looked up.

    Returns
    -------
    float
        Mean count, or ``0.0`` when ``context`` is empty.
    """
    if len(context) == 0:
        # The mean over zero intersections is NaN (with a RuntimeWarning);
        # return a finite neutral value instead.
        return 0.0

    counts = []
    for inters in context & sample:
        # An object contains ``inters`` exactly when intersecting with it
        # loses none of the attributes of ``inters``.
        contains = np.sum(context_clojure & inters, axis=1) == np.sum(inters)
        counts.append(contains.sum())
    return np.mean(counts)

Подсчёт признаков

In [47]:
def get_features(dataset, X_train, y_train):
    """Describe every object of ``dataset`` by two intersection features.

    ``X_train``/``y_train`` are split into the positive and the negative
    context; for a non-empty ``dataset`` the result has one row per object
    holding its average intersection size with the positive context followed
    by the one with the negative context.
    """
    positives, negatives = context_split(X_train, y_train)
    rows = []
    for sample in dataset:
        rows.append([avg_intersect(sample, positives),
                     avg_intersect(sample, negatives)])
    return np.array(rows)

def get_features_bad(dataset, X_train, y_train):
    """Describe every object of ``dataset`` by six context features.

    Per object the row holds, in this order: the four ``avg_clojure`` values
    for the (context, lookup context) pairs (+, +), (+, -), (-, +), (-, -),
    followed by the average intersection size with the positive and with the
    negative context.
    """
    positives, negatives = context_split(X_train, y_train)
    closure_pairs = [(positives, positives), (positives, negatives),
                     (negatives, positives), (negatives, negatives)]
    rows = []
    for sample in dataset:
        row = [avg_clojure(sample, context, lookup) for context, lookup in closure_pairs]
        row.append(avg_intersect(sample, positives))
        row.append(avg_intersect(sample, negatives))
        rows.append(row)
    return np.array(rows)

Для улучшения качества заменим пороговый критерий по значениям признаков решающими деревьями над теми же признаками. Тогда:

Классификатор — ансамбль из k деревьев. Обучающая выборка разбивается на k частей; каждое дерево обучается на одной из частей, а остальные k−1 частей используются как контекст. Признаки на входе дерева — «ленивые» признаки объектов обучающей части, вычисленные относительно этого контекста.

Предсказание - знак среднего предсказания ансамбля деревьев на признаках из запрашиваемого примера и всей обучающей выборки в качестве контекстов.

3 параметра:
    - n_train_splits - кол-во разделений при обучении (==кол-во деревьев в ансамбле)
    - max_depth - максимальная глубина дерева
    - features - источники признаков

In [137]:
class MyClassifier:
    """Ensemble of decision trees over context-based ("lazy") features.

    ``fit`` splits the training data into ``n_train_splits`` folds. For every
    split one tree is trained on the held-out fold, whose features are computed
    against the remaining folds used as context. ``predict`` computes the
    features of the queried objects against the whole training set and returns
    the sign of the mean tree prediction (a mean of exactly 0 yields ``-1``).

    Parameters
    ----------
    n_train_splits : int, default 5
        Number of folds, which is also the number of trees in the ensemble.
    max_depth : int, default 4
        Maximum depth of every tree.
    features : callable, default ``get_features``
        Called as ``features(dataset, X_context, y_context)``; must return the
        feature matrix of ``dataset``.
    random_state : int, RandomState instance or None, default None
        Passed to the fold shuffling and to every tree. ``None`` keeps the
        original unseeded behaviour.
    """

    def __init__(self, n_train_splits = 5, max_depth = 4, features=get_features,
                 random_state=None):
        self.trees=[]
        self.n_train_splits = n_train_splits
        self.max_depth = max_depth
        self.features=features
        self.random_state = random_state
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Train one tree per fold and remember ``X``/``y`` as prediction context.

        Returns ``self``.
        """
        # Start from an empty ensemble: without the reset a second call to
        # ``fit`` would keep the trees of the previous call as well.
        self.trees = []
        k_fold = KFold(n_splits=self.n_train_splits, shuffle=True,
                       random_state=self.random_state)
        for context_index, learn_index in k_fold.split(X):
            X_context, y_context = X[context_index], y[context_index]
            # The tree only sees the held-out fold, described relative to the
            # remaining folds.
            X_to_learn = self.features(X[learn_index], X_context, y_context)
            tree = DecisionTreeClassifier(max_depth=self.max_depth,
                                          random_state=self.random_state)
            tree.fit(X_to_learn, y[learn_index])
            self.trees.append(tree)
        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """Return ``1``/``-1`` per row of ``X`` by averaging the tree votes.

        Raises
        ------
        ValueError
            If the classifier has not been fitted.
        """
        if not self.trees:
            raise ValueError('MyClassifier is not fitted yet; call fit() before predict().')
        X = self.features(X, self.X_train, self.y_train)
        tree_predictions = np.array([tree.predict(X) for tree in self.trees])
        return (tree_predictions.mean(axis=0) > 0) * 2 - 1

    def get_params(self, deep=True, **kwargs):
        """Return all constructor parameters as a dict.

        ``features`` and ``random_state`` are included so that a copy built
        from these parameters does not silently fall back to the defaults.
        ``deep`` is accepted for compatibility with scikit-learn and ignored.
        """
        return {'n_train_splits': self.n_train_splits,
                'max_depth': self.max_depth,
                'features': self.features,
                'random_state': self.random_state}

    def set_params(self, **kwargs):
        """Update any subset of the constructor parameters and return ``self``.

        Raises
        ------
        ValueError
            If an unknown parameter name is given.
        """
        valid_names = self.get_params()
        for name, value in kwargs.items():
            if name not in valid_names:
                raise ValueError('Invalid parameter {!r} for MyClassifier; valid parameters: {}'
                                 .format(name, sorted(valid_names)))
            setattr(self, name, value)
        return self

Рассмотрим в качестве признаков средние $|g'\cap g_i^+|$, $|g'\cap g_i^-|$,
        $|(g'\cap g_i^+)^+|$, $|(g'\cap g_i^+)^-|$, $|(g'\cap g_i^-)^-|$, $|(g'\cap g_i^-)^+|$:

In [55]:
# Evaluate the six-feature variant on each of the ten train/test file pairs.
n_split, max_depth = 5, 3
for i in range(1, 11):
    file_train, file_test = 'train{}.csv'.format(i), 'test{}.csv'.format(i)
    X_train, y_train = prepare_data(file_train)
    X_test, y_test = prepare_data(file_test)
    classifier = MyClassifier(n_split, max_depth, features=get_features_bad)
    pred = classifier.fit(X_train, y_train).predict(X_test)
    # Same metric order as in the printed line: accuracy, precision, recall.
    scores = [metric(y_test, pred)
              for metric in (accuracy_score, precision_score, recall_score)]
    print('{} : acc. {}, prec. {}, rec. {}'.format(i, *scores))

1 : acc. 0.6129032258064516, prec. 0.8048780487804879, rec. 0.5409836065573771
2 : acc. 0.6436781609195402, prec. 0.717391304347826, rec. 0.6470588235294118
3 : acc. 0.64, prec. 0.8717948717948718, rec. 0.5230769230769231
4 : acc. 0.5056179775280899, prec. 0.7272727272727273, rec. 0.4067796610169492
5 : acc. 0.6741573033707865, prec. 0.8666666666666667, rec. 0.6290322580645161
6 : acc. 0.611764705882353, prec. 0.8108108108108109, rec. 0.5357142857142857
7 : acc. 0.6578947368421053, prec. 0.7924528301886793, rec. 0.6
8 : acc. 0.6915887850467289, prec. 0.8125, rec. 0.7123287671232876
9 : acc. 0.7475728155339806, prec. 0.8666666666666667, rec. 0.7428571428571429
10 : acc. 0.5934065934065934, prec. 0.7391304347826086, rec. 0.576271186440678


Результат не впечатляет. Попробуем ограничиться средними $|g'\cap g_i^+|$, $|g'\cap g_i^-|$:

In [56]:
# Evaluate the two-feature variant on each of the ten train/test file pairs.
n_split, max_depth = 5, 3
for i in range(1, 11):
    file_train, file_test = 'train{}.csv'.format(i), 'test{}.csv'.format(i)
    X_train, y_train = prepare_data(file_train)
    X_test, y_test = prepare_data(file_test)
    classifier = MyClassifier(n_split, max_depth, features=get_features)
    pred = classifier.fit(X_train, y_train).predict(X_test)
    # Same metric order as in the printed line: accuracy, precision, recall.
    scores = [metric(y_test, pred)
              for metric in (accuracy_score, precision_score, recall_score)]
    print('{} : acc. {}, prec. {}, rec. {}'.format(i, *scores))

1 : acc. 0.7634408602150538, prec. 0.7746478873239436, rec. 0.9016393442622951
2 : acc. 0.7471264367816092, prec. 0.6986301369863014, rec. 1.0
3 : acc. 0.79, prec. 0.8055555555555556, rec. 0.8923076923076924
4 : acc. 0.8314606741573034, prec. 0.8142857142857143, rec. 0.9661016949152542
5 : acc. 0.7752808988764045, prec. 0.8, rec. 0.9032258064516129
6 : acc. 0.8235294117647058, prec. 0.8360655737704918, rec. 0.9107142857142857
7 : acc. 0.7631578947368421, prec. 0.7263157894736842, rec. 0.9857142857142858
8 : acc. 0.7757009345794392, prec. 0.7951807228915663, rec. 0.9041095890410958
9 : acc. 0.8446601941747572, prec. 0.8292682926829268, rec. 0.9714285714285714
10 : acc. 0.7472527472527473, prec. 0.725, rec. 0.9830508474576272


Уже лучше. Подберём параметры по сетке, оптимизируя долю правильных ответов (accuracy):

In [139]:
# Grid of candidate values: 3..14 folds/trees combined with depths 3..14.
parameters = {'n_train_splits':np.arange(3,15), 'max_depth':np.arange(3, 15)}
classifier = MyClassifier(2, 2, features=get_features)
gs = GridSearchCV(classifier, param_grid=parameters, scoring='accuracy', cv=5, n_jobs=-1)
params = []  # NOTE(review): not read by any cell visible here; kept in case a later cell uses it

# Collect all training files first and stack them once. This avoids
# hard-coding the number of one-hot columns, avoids seeding the labels with a
# bool-typed array, and avoids re-copying the pooled arrays on every iteration.
X_parts, y_parts = [], []
for i in range(1, 11):
    file_train = 'train{}.csv'.format(i)
    X_train, y_train = prepare_data(file_train)
    X_parts.append(X_train)
    y_parts.append(y_train)
X_train_all = np.vstack(X_parts)
y_train_all = np.hstack(y_parts)

# NOTE(review): the pooled set is the concatenation of all ten train files. If
# those files overlap (not visible from here), identical rows end up on both
# sides of the inner cross-validation splits - confirm before trusting the
# selected parameters.
best_params = gs.fit(X_train_all, y_train_all)

In [140]:
# `best_params` holds the object returned by `gs.fit(...)`; its `best_params_`
# attribute is the winning parameter combination, which is reused below to
# configure the final classifier.
best_params_val = best_params.best_params_
print(best_params_val)

{'max_depth': 11, 'n_train_splits': 7}


Итоговые значения метрик на тесте (при обучении на соответствующем train):

In [161]:
# Train the tuned classifier on every train file, score it on the matching
# test file and report the mean of each metric over the ten pairs.
fold_metrics = []
for i in range(1, 11):
    file_train = 'train{}.csv'.format(i)
    file_test = 'test{}.csv'.format(i)
    X_train, y_train = prepare_data(file_train)
    X_test, y_test = prepare_data(file_test)
    # Build the classifier directly from the tuned parameters instead of from
    # `n_split`/`max_depth` left over from earlier cells and then overwriting
    # them with set_params.
    classifier = MyClassifier(features=get_features, **best_params_val)
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    fold_metrics.append([accuracy_score(y_test, pred), precision_score(y_test, pred),
                         recall_score(y_test, pred), f1_score(y_test, pred)])
# Rows are file pairs; columns are accuracy, precision, recall, f1.
metrics_avg = np.array(fold_metrics).mean(axis=0)
print('Average: acc. {}, prec. {}, rec. {}, f1. {}'.format(*metrics_avg))

Average: acc. 0.9410423092812241, prec. 0.9394644632821766, rec. 0.9724699566254443, f1. 0.9552805361560927
