In [12]:
# Heterogeneous pooling
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline
from tqdm import tqdm


In [18]:
# all the data will be stored in this dataframe, with the method name, mean accuracy, standard deviation, lower and upper bound
df = pd.DataFrame(columns=['method', 'mean', 'std', 'lower', 'upper'])

df_per_fold = pd.DataFrame(columns=['method', 'fold', 'mean','std', 'lower', 'upper'])



In [4]:
X = pd.read_csv('X.csv')
y = pd.read_csv('y.csv')


# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)



In [5]:
# Zero Rule Baseline
from sklearn.dummy import DummyClassifier

clf = DummyClassifier(strategy='most_frequent')

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)

df = df.append({'method': 'ZR', 'mean': scores.mean(), 'std': scores.std(), 'lower': scores.mean() - scores.std(), 'upper': scores.mean() + scores.std()}, ignore_index=True)


  df = df.append({'method': 'ZR', 'mean': scores.mean(), 'std': scores.std(), 'lower': scores.mean() - scores.std(), 'upper': scores.mean() + scores.std()}, ignore_index=True)


In [16]:
def train_model(model,params_grid,name): 
    cv_inner = RepeatedStratifiedKFold(n_splits=4, n_repeats=3, random_state=1)
    cv_outer = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

    pipe = Pipeline(steps=[('s', StandardScaler()), ('m', model)])

    params = params_grid
    counter = 0
    scores = []
    best_acc = 0
    for train_ix, test_ix in tqdm(cv_outer.split(X_train, y_train)):
        # split data
        X_train_inner, X_test_inner = X_train.iloc[train_ix, :], X_train.iloc[test_ix, :]
        y_train_inner, y_test_inner = y_train.iloc[train_ix], y_train.iloc[test_ix]

        # define search
        search = GridSearchCV(pipe, param_grid=params, scoring='accuracy', cv=cv_inner, n_jobs=-1)

        rkf = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

        # execute search
        result = cross_val_score(search, X_train_inner, y_train_inner.values.ravel(), cv=rkf, n_jobs=-1)

        scores.append(result.mean())
        df_per_fold = df_per_fold.append({'method': name, 'fold': counter, 'mean': result.mean(), 'std': result.std(), 'lower': result.mean() - result.std(), 'upper': result.mean() + result.std()}, ignore_index=True)
        counter += 1
        # check the best model
        if result.mean() > best_acc:
            best_acc = result.mean()
            best_model = search

    df = pd.concat([df, pd.DataFrame({'method': [name], 'mean': [np.mean(scores)], 'std': [np.std(scores)], 'lower': [np.mean(scores) - np.std(scores)], 'upper': [np.mean(scores) + np.std(scores)]})], ignore_index=True)
    return df

In [17]:
from sklearn.ensemble import BaggingClassifier


bg = BaggingClassifier(n_estimators=3)

name = 'BG'

params_grid = {
    'm__n_estimators': [3,9,15,12]
    } 

df = train_model(bg,params_grid,name)


30it [01:14,  2.47s/it]


UnboundLocalError: local variable 'df' referenced before assignment

In [None]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier(n_estimators=3)

name = 'ADA'

params_grid = {
    'm__n_estimators': [3,9,15,12]
    }

df = train_model(ada,params_grid,name)

In [13]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier()

name = 'RF'

params_grid = {
    'm__n_estimators': [3,9,15,12]
} 


df = train_model(rf,params_grid,name)

30it [01:12,  2.42s/it]
