In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

from sklearn.metrics import accuracy_score
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import StratifiedKFold

In [2]:
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [3]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    
    X_meta_test = np.zeros(len(X_test))
    i=0
    
    for train_fold_index, predict_fold_index in cv.split(X_train):
        i+=1
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]

        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)

        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        
        X_meta_test += folded_clf.predict_proba(X_test)[:, 1]

    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test /= i

    return X_meta_train, X_meta_test

In [4]:
def generate_metafeatures(classifiers, X_train, X_test, y_train, cv):
    """
    Generates metafeatures using a list of classifiers.
    
    :arg classifiers: list of scikit-learn classifiers
    :args X_train, y_train: training set
    :arg X_test: testing set
    :arg cv: cross-validation folding
    """
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.vstack([
        features_train for features_train, features_test in features
    ]).T

    stacked_features_test = np.vstack([
        features_test for features_train, features_test in features
    ]).T
    
    return stacked_features_train, stacked_features_test

In [5]:
def compute_meta_feature_mean(clf, X_train, X_test, y_train, cv):
    """
    Эта функция подсчитывает признаки для мета-классификатора. 
    Они являются вероятностями классов при решении задачи многоклассовой классификации.
    :arg clf: классификатор
    :args X_train, y_train: обучающая выборка
    :arg X_test: признаки тестовой выборки
    :arg cv: класс, генерирующий фолды (KFold)
    :returns X_meta_train, X_meta_test: новые признаки для обучающей и тестовой выборок
    """
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(X_train), n_classes), dtype=np.float32)
    X_meta_tests_array = []
    splits = 0
    for train_fold_index, predict_fold_index in cv.split(X_train):
        n_classes = len(np.unique(y_test))
        X_meta_test = np.zeros((len(X_test), n_classes), dtype=np.float32)
        splits += 1
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
        X_meta_tests_array.append(folded_clf.predict_proba(X_test))
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    X_meta_test = sum(X_meta_tests_array) / splits
    return X_meta_train, X_meta_test

In [6]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [7]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [8]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [9]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000, multi_class='ovr', random_state=42),
    LogisticRegression(C=0.001, penalty='l2', solver='saga', max_iter=2000, multi_class='multinomial', random_state=42),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    GradientBoostingClassifier(n_estimators=200, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [03:03<00:00, 45.85s/it]


In [10]:
total_features_train = np.hstack([X_train, stacked_features_train])
total_features_test = np.hstack([X_test, stacked_features_test])

In [11]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9777777777777777

In [12]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=200, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:13<00:00,  6.92s/it]


In [13]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9805555555555555

In [14]:
stacked_features_train, stacked_features_test = generate_meta_features([
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.18s/it]


In [15]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9888888888888889

In [16]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000, multi_class='ovr', random_state=42),
    KNeighborsClassifier(),
    ExtraTreesClassifier(n_estimators=300, random_state=42),
    AdaBoostClassifier(random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:49<00:00, 12.36s/it]


In [17]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9861111111111112

In [18]:
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [19]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train, y_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [20]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:16<00:00,  8.17s/it]


In [21]:
#for i,y in cv.split(X_train, y_train): print(y)

In [22]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9833333333333333

In [23]:
cv = StratifiedKFold(n_splits=20, shuffle=True, random_state=42)

In [24]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:31<00:00, 15.81s/it]


In [25]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', solver='lbfgs', multi_class='auto', random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9833333333333333

In [26]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [27]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  4.29s/it]


In [28]:
np.random.seed(42)
clf = RandomForestClassifier(random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9805555555555555

In [29]:
np.random.seed(42)
clf = KNeighborsClassifier()
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9833333333333333

In [30]:
np.random.seed(42)
clf = GradientBoostingClassifier(random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9833333333333333

In [31]:
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

In [32]:
stacked_features_train, stacked_features_test = generate_meta_features([
    RandomForestClassifier(n_estimators=300, criterion='gini', n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.57s/it]


In [33]:
np.random.seed(42)
clf = ExtraTreesClassifier(n_estimators=100, random_state=42)
clf.fit(stacked_features_train, y_train)
accuracy_score(clf.predict(stacked_features_test), y_test)

0.9861111111111112

In [86]:
def model_train(model, X_train, X_test, y_train):
    np.random.seed(42)
    model.fit(X_train, y_train)
    return np.array(model.predict(X_test))

model_list = [
    RandomForestClassifier(n_estimators=300, criterion='gini', n_jobs=-1, max_depth=24, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42),
    LogisticRegression(random_state=42)]

predictions = []

for model in model_list:
    predictions.append(model_train(model, X_train, X_test, y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [92]:
np.array(predictions).argmax(axis=1)

array([1, 1, 1], dtype=int64)

In [105]:
np.array(predictions).sum(axis=0).argmax(axis=1)

AxisError: axis 1 is out of bounds for array of dimension 1