In [37]:
import pandas as pd
import numpy as np

from sklearn.ensemble import (AdaBoostClassifier, GradientBoostingClassifier,
                              RandomForestClassifier, ExtraTreesClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.datasets import load_digits

from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import RandomizedSearchCV
from scipy.stats.distributions import randint

from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import StratifiedKFold

In [15]:
dataset = load_digits()
X, y = dataset['data'], dataset['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [16]:
def compute_meta_feature(clf, X_train, X_test, y_train, cv):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    splits = cv.split(X_train)
    for train_fold_index, predict_fold_index in splits:
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [17]:
def generate_meta_features(classifiers, X_train, X_test, y_train, cv):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, cv)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [18]:
cv = KFold(n_splits=10, shuffle=True, random_state=42)

def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [21]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', solver='saga', max_iter=2000),
    LogisticRegression(C=0.001, penalty='l2', solver='saga',  max_iter=2000),  
    RandomForestClassifier(n_estimators=300, n_jobs=-1),
    GradientBoostingClassifier(n_estimators=200)
], X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A
 25%|█████████████████████                                                               | 1/4 [00:30<01:30, 30.21s/it][A
 50%|██████████████████████████████████████████                                          | 2/4 [00:31<00:26, 13.16s/it][A
 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:37<00:10, 10.09s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [02:45<00:00, 41.34s/it][A


In [25]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs')
clf.fit(stacked_features_train, y_train)
#accuracy_score(clf.predict(stacked_features_test), cover_y_test)

LogisticRegression(penalty='none')

In [26]:
compute_metric(clf, stacked_features_train, y_train,  stacked_features_test)

0.987027

In [33]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [34]:
stacked_features_train, stacked_features_test = generate_meta_features([
     
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42 ),
    RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/4 [18:27<?, ?it/s][A

 50%|██████████████████████████████████████████                                          | 1/2 [00:06<00:06,  6.37s/it][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:10<00:00,  5.39s/it][A


In [36]:
np.random.seed(42)
clf = LogisticRegression(penalty='none', multi_class='auto', solver='lbfgs', random_state=42)
clf.fit(stacked_features_train, y_train)
compute_metric(clf, stacked_features_train, y_train,  stacked_features_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.979952

In [40]:
#6.6.4
stacked_features_train, stacked_features_test = generate_meta_features([
     
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42)
], X_train, X_test, y_train, cv)


  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A
 50%|██████████████████████████████████████████                                          | 1/2 [00:00<00:00,  8.95it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:07<00:00,  3.69s/it][A


In [41]:
np.random.seed(42)
clf664 = LogisticRegression( multi_class='auto', solver='lbfgs', random_state=42)
clf664.fit(stacked_features_train, y_train)
compute_metric(clf664, stacked_features_train, y_train,  stacked_features_test)

0.98502

Задание 6.6.5

In [44]:
stacked_features_train, stacked_features_test = generate_meta_features([
    LogisticRegression(C=0.001, penalty='l1', multi_class='ovr', solver='saga', max_iter=2000), 
    KNeighborsClassifier(),
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    AdaBoostClassifier()
], X_train, X_test, y_train, cv)



  0%|                                                                                            | 0/4 [00:00<?, ?it/s][A[A

 25%|█████████████████████                                                               | 1/4 [00:00<00:00,  3.28it/s][A[A

 75%|███████████████████████████████████████████████████████████████                     | 3/4 [00:07<00:02,  2.70s/it][A[A

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:09<00:00,  2.40s/it][A[A


In [45]:
np.random.seed(42)
clf665 = LogisticRegression( multi_class='auto', solver='lbfgs', random_state=42)
clf665.fit(stacked_features_train, y_train)
compute_metric(clf665, stacked_features_train, y_train,  stacked_features_test)

0.98502

# StratifiedKFold

In [57]:
from sklearn.model_selection import StratifiedKFold

In [58]:
def compute_meta_feature(clf, X_train, X_test, y_train, skf):
    
    n_classes = len(np.unique(y_train))
    X_meta_train = np.zeros((len(y_train), n_classes), dtype=np.float32)

    #splits = skf.get_n_splits(X_train)
    for train_fold_index, predict_fold_index in skf.split(X_train, y_train):
        X_fold_train, X_fold_predict = X_train[train_fold_index], X_train[predict_fold_index]
        y_fold_train = y_train[train_fold_index]
        
        folded_clf = clone(clf)
        folded_clf.fit(X_fold_train, y_fold_train)
        
        X_meta_train[predict_fold_index] = folded_clf.predict_proba(X_fold_predict)
    
    meta_clf = clone(clf)
    meta_clf.fit(X_train, y_train)
    
    X_meta_test = meta_clf.predict_proba(X_test)
    
    return X_meta_train, X_meta_test

In [59]:
def generate_meta_features(classifiers, X_train, X_test, y_train, skf):
   
    features = [
        compute_meta_feature(clf, X_train, X_test, y_train, skf)
        for clf in tqdm(classifiers)
    ]
    
    stacked_features_train = np.hstack([
        features_train for features_train, features_test in features
    ])

    stacked_features_test = np.hstack([
        features_test for features_train, features_test in features
    ])
    
    return stacked_features_train, stacked_features_test

In [69]:
#cv = KFold(n_splits=10, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=20)
def compute_metric(clf, X_train=X_train, y_train=y_train, X_test=X_test):
    clf.fit(X_train, y_train)
    y_test_pred = clf.predict(X_test)
    return np.round(f1_score(y_test, y_test_pred, average='macro'), 6)

In [70]:
stacked_features_train, stacked_features_test = generate_meta_features([
   
    RandomForestClassifier(n_estimators=300, n_jobs=-1, random_state=42),
    ExtraTreesClassifier(n_estimators=300, random_state=42)
], X_train, X_test, y_train, cv)



  0%|                                                                                            | 0/2 [00:00<?, ?it/s][A[A

 50%|██████████████████████████████████████████                                          | 1/2 [00:17<00:17, 17.04s/it][A[A

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:34<00:00, 17.18s/it][A[A


In [71]:
np.random.seed(42)
clf666 = LogisticRegression( multi_class='auto', solver='lbfgs', random_state=42)
clf666.fit(stacked_features_train, y_train)
compute_metric(clf666, stacked_features_train, y_train,  stacked_features_test)

0.981296

In [72]:
np.random.seed(42)
clf668 = RandomForestClassifier(random_state=42)
clf668.fit(stacked_features_train, y_train)
compute_metric(clf668, stacked_features_train, y_train,  stacked_features_test)

0.975623

In [74]:
np.random.seed(42)
clf669 =  KNeighborsClassifier()
clf669.fit(stacked_features_train, y_train)
compute_metric(clf669, stacked_features_train, y_train,  stacked_features_test)

0.98762

In [75]:
np.random.seed(42)
clf6610 =  GradientBoostingClassifier()
clf6610.fit(stacked_features_train, y_train)
compute_metric(clf6610, stacked_features_train, y_train,  stacked_features_test)

0.977996