In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
# skopt
from skopt import BayesSearchCV

#helpers
import helpers

In [53]:
results = pd.DataFrame({'hiperparametry': [], 'wynik_zbior_1': [], 'wynik_zbior_2': [], 'wynik_zbior_3': [], 'wynik_zbior_4': [], 'model': []})

In [54]:
import warnings
warnings.filterwarnings('ignore')

In [55]:
RANDOM_STATE = 123

In [56]:
# Load datasets
import warnings
warnings.filterwarnings('ignore')

Xs = []
ys = []

df_1 = fetch_openml(data_id=31)
df_2 = fetch_openml(data_id=1489)
df_3 = fetch_openml(data_id=1558)
df_4 = fetch_openml(data_id=183)

X_1 = df_1.data
y_1 = df_1.target
Xs.append(X_1)
ys.append(y_1)

X_2 = df_2.data
y_2 = df_2.target
Xs.append(X_2)
ys.append(y_2)

X_3 = df_3.data
y_3 = df_3.target
Xs.append(X_3)
ys.append(y_3)

X_4 = df_4.data
y_4 = df_4.target
Xs.append(X_4)
ys.append(y_4)

warnings.filterwarnings('default')

In [57]:
# split datasets into train and test
# X_train_arr = []
# X_test_arr = []
# y_train_arr = []
# y_test_arr = []
cols_num = []
cols_cat = []

for i in range(len(Xs)):
#     X_train, X_test, y_train, y_test = train_test_split(Xs[i], ys[i], test_size=0.3, random_state=RANDOM_STATE)

#     X_train_arr.append(X_train)
#     X_test_arr.append(X_test)
#     y_train_arr.append(y_train)
#     y_test_arr.append(y_test)

     numeric_features = Xs[i].select_dtypes(exclude=['category']).columns
     categorical_features = Xs[i].select_dtypes(include=['category']).columns

     cols_num.append(numeric_features)
     cols_cat.append(categorical_features)

In [58]:
# helper function to wrap model and do imputing and encoding

def getPipeline(model, num_cols, cat_cols):
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median"))]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy='most_frequent')),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ]
    )

    pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", model)]
    )

    return pipe

### Parameters grid

In [59]:
grid_log_reg = {
    'classifier__C': np.logspace(-4, 4, 50),
    'classifier__l1_ratio': np.linspace(0, 1, 50)
}

grid_tree = {
    'classifier__max_depth': np.concatenate(([None], range(5, 100, 5))),
    'classifier__min_samples_split': list(range(2, 10)),
    'classifier__min_samples_leaf': list(range(2, 50, 2)),
    'classifier__max_leaf_nodes': np.concatenate(([None], range(30, 330, 30))),
    'classifier__ccp_alpha': np.logspace(-3, 3, 50)
}

grid_forest = {
    'classifier__max_depth': np.concatenate(([None], range(1, 11, 1))),
    'classifier__min_samples_split': list(range(2, 10)),
    'classifier__min_samples_leaf': list(range(2, 10, 1)),
    'classifier__max_leaf_nodes': np.concatenate(([None], range(2, 50, 1))),
    'classifier__ccp_alpha': np.logspace(-3, 2, 50),
    'classifier__bootstrap': [True, False],
    'classifier__warm_start': [True, False]
}

In [60]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

### Important helpers. They Return best CV scores and params for each set

In [61]:
def getRandomCVResults(model, grid, n_iter=10, n_jobs=1, verbose=0):
    search_res = []
    for i in range(len(Xs)):
        clf = getPipeline(model, cols_num[i], cols_cat[i])

        res = RandomizedSearchCV(clf, param_distributions=grid, random_state=RANDOM_STATE, cv=cv, scoring='accuracy', n_iter=n_iter, n_jobs=n_jobs, verbose=verbose)
        res.fit(Xs[i], ys[i])

        search_res.append(res)
        print(f"Fit for set number {i} is done")
        
    return search_res

def getBayesCVResults(model, grid, n_iter=10, n_jobs=1, verbose=0, n_points=1, random_state=RANDOM_STATE):
    search_res = []
    for i in range(len(Xs)):
        clf = getPipeline(model, cols_num[i], cols_cat[i])

        res = BayesSearchCV(clf, search_spaces=grid, cv=cv, scoring='accuracy', n_iter=n_iter, n_jobs=n_jobs, verbose=verbose, n_points=n_points, random_state=random_state)
        res.fit(Xs[i], ys[i])
        search_res.append(res)
        print(f"Fit for set number {i} is done")
    
    return search_res


# Logistic Regression

### Look for best hyperparams (Random search method) for each set

In [62]:
warnings.filterwarnings('ignore')

lr = LogisticRegression(penalty='elasticnet', solver='saga', random_state=RANDOM_STATE, max_iter=1000)
search_rand_lr = getRandomCVResults(lr, grid_log_reg, 5, -1)

warnings.filterwarnings('default')

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done


In [63]:
res_lr = {
    'hiperparametry': search_rand_lr[0].cv_results_['params'], 
    'wynik_zbior_1': search_rand_lr[0].cv_results_['mean_test_score'], 
    'wynik_zbior_2': search_rand_lr[1].cv_results_['mean_test_score'], 
    'wynik_zbior_3': search_rand_lr[2].cv_results_['mean_test_score'], 
    'wynik_zbior_4': search_rand_lr[3].cv_results_['mean_test_score']
}

results_lr = pd.DataFrame(res_lr)
results_lr['model'] = 'LogisticRegression'

# Decision Tree

In [64]:
#warnings.filterwarnings('ignore')

tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
search_rand_tree = getRandomCVResults(tree, grid_tree, 5, -1)

#warnings.filterwarnings('default')

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done


In [65]:
res_tree = {
    'hiperparametry': search_rand_tree[0].cv_results_['params'], 
    'wynik_zbior_1': search_rand_tree[0].cv_results_['mean_test_score'], 
    'wynik_zbior_2': search_rand_tree[1].cv_results_['mean_test_score'], 
    'wynik_zbior_3': search_rand_tree[2].cv_results_['mean_test_score'], 
    'wynik_zbior_4': search_rand_tree[3].cv_results_['mean_test_score']
}

results_tree = pd.DataFrame(res_tree)
results_tree['model'] = 'DecisionTreeClassifier'

# Random Forest

In [66]:
#warnings.filterwarnings('ignore')

forest = RandomForestClassifier(random_state=RANDOM_STATE)
search_rand_forest = getRandomCVResults(forest, grid_forest, 5, -1)

#warnings.filterwarnings('default')

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done


In [67]:
res_forest = {
    'hiperparametry': search_rand_forest[0].cv_results_['params'], 
    'wynik_zbior_1': search_rand_forest[0].cv_results_['mean_test_score'], 
    'wynik_zbior_2': search_rand_forest[1].cv_results_['mean_test_score'], 
    'wynik_zbior_3': search_rand_forest[2].cv_results_['mean_test_score'], 
    'wynik_zbior_4': search_rand_forest[3].cv_results_['mean_test_score']
}

results_forest = pd.DataFrame(res_forest)
results_forest['model'] = 'RandomForestClassifier'

In [68]:
results_tree

Unnamed: 0,hiperparametry,wynik_zbior_1,wynik_zbior_2,wynik_zbior_3,wynik_zbior_4,model
0,"{'classifier__min_samples_split': 8, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier
1,"{'classifier__min_samples_split': 7, 'classifi...",0.715,0.787748,0.889183,0.250178,DecisionTreeClassifier
2,"{'classifier__min_samples_split': 8, 'classifi...",0.727,0.820319,0.898695,0.259281,DecisionTreeClassifier
3,"{'classifier__min_samples_split': 6, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier
4,"{'classifier__min_samples_split': 4, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier


# Merging results

In [69]:
frames = [results_lr, results_tree, results_forest]

results = pd.concat(frames)

In [70]:
results['mean_result'] = (results['wynik_zbior_1'] + results['wynik_zbior_2'] + results['wynik_zbior_3'] + results['wynik_zbior_4']) / 4

In [71]:
results

Unnamed: 0,hiperparametry,wynik_zbior_1,wynik_zbior_2,wynik_zbior_3,wynik_zbior_4,model,mean_result
0,"{'classifier__l1_ratio': 0.9387755102040816, '...",0.7,0.749817,0.884539,0.26622,LogisticRegression,0.650144
1,"{'classifier__l1_ratio': 0.44897959183673464, ...",0.7,0.749632,0.884539,0.25449,LogisticRegression,0.647165
2,"{'classifier__l1_ratio': 0.32653061224489793, ...",0.7,0.749817,0.884539,0.272687,LogisticRegression,0.651761
3,"{'classifier__l1_ratio': 0.08163265306122448, ...",0.7,0.749817,0.884539,0.27508,LogisticRegression,0.652359
4,"{'classifier__l1_ratio': 0.9591836734693877, '...",0.7,0.749262,0.884539,0.260475,LogisticRegression,0.648569
0,"{'classifier__min_samples_split': 8, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier,0.614056
1,"{'classifier__min_samples_split': 7, 'classifi...",0.715,0.787748,0.889183,0.250178,DecisionTreeClassifier,0.660527
2,"{'classifier__min_samples_split': 8, 'classifi...",0.727,0.820319,0.898695,0.259281,DecisionTreeClassifier,0.676324
3,"{'classifier__min_samples_split': 6, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier,0.614056
4,"{'classifier__min_samples_split': 4, 'classifi...",0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier,0.614056


In [72]:
maxi = np.max(results['mean_result'])
results[results['mean_result'] == maxi]

Unnamed: 0,hiperparametry,wynik_zbior_1,wynik_zbior_2,wynik_zbior_3,wynik_zbior_4,model,mean_result
1,"{'classifier__warm_start': False, 'classifier_...",0.731,0.835862,0.887636,0.272204,RandomForestClassifier,0.681675


# BAYES !!!!!!!!!!!!!!!!!!!!!!!!!!

In [73]:
bayes_diff_start_points_lr = []
bayes_diff_start_points_tree = []
bayes_diff_start_points_forest = []

NUMBER_OF_STARTING_POINTS = 5

In [74]:
warnings.filterwarnings('ignore')

for i in range(NUMBER_OF_STARTING_POINTS):
    lr = LogisticRegression(penalty='elasticnet', solver='saga', random_state=RANDOM_STATE, max_iter=1000)
    # give different random state each time to generate different start hyperParams to Bayes
    search_bayes_lr = getBayesCVResults(lr, grid_log_reg, n_iter=5, n_jobs=-1, random_state = (i+1)*100)
    bayes_diff_start_points_lr.append(search_bayes_lr)

warnings.filterwarnings('default')

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done


In [75]:
for i in range(NUMBER_OF_STARTING_POINTS):
    tree = DecisionTreeClassifier(random_state=RANDOM_STATE)
    search_bayes_tree = getBayesCVResults(tree, grid_tree, n_iter=5, n_jobs=-1, random_state=(i+1)*100)
    bayes_diff_start_points_tree.append(search_bayes_tree)

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done




In [76]:
for i in range(NUMBER_OF_STARTING_POINTS):   
    forest = RandomForestClassifier(random_state=RANDOM_STATE)
    search_bayes_forest = getBayesCVResults(forest, grid_forest, n_iter=5, n_jobs=-1, random_state=(i+1)*100)
    bayes_diff_start_points_forest.append(search_bayes_forest)

Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done
Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done




Fit for set number 3 is done


In [77]:
#[elem[0].cv_results_['mean_test_score'] for elem in bayes_diff_start_points_lr]

res_bayes_lr_dict = {
        'hiperparametry': [np.nan for i in range(5)], 
        'wynik_zbior_1': [elem[0].best_score_ for elem in bayes_diff_start_points_lr], 
        'wynik_zbior_2': [elem[1].best_score_ for elem in bayes_diff_start_points_lr], 
        'wynik_zbior_3': [elem[2].best_score_ for elem in bayes_diff_start_points_lr], 
        'wynik_zbior_4': [elem[3].best_score_ for elem in bayes_diff_start_points_lr]
}

results_bayes_lr = pd.DataFrame(res_bayes_lr_dict)
results_bayes_lr['model'] = 'LogisticRegression'

res_bayes_tree_dict = {
        'hiperparametry': [np.nan for i in range(5)], 
        'wynik_zbior_1': [elem[0].best_score_ for elem in bayes_diff_start_points_tree], 
        'wynik_zbior_2': [elem[1].best_score_ for elem in bayes_diff_start_points_tree], 
        'wynik_zbior_3': [elem[2].best_score_ for elem in bayes_diff_start_points_tree], 
        'wynik_zbior_4': [elem[3].best_score_ for elem in bayes_diff_start_points_tree]
}

results_bayes_tree = pd.DataFrame(res_bayes_tree_dict)
results_bayes_tree['model'] = 'DecisionTreeClassifier'

res_bayes_forest_dict = {
        'hiperparametry': [np.nan for i in range(5)], 
        'wynik_zbior_1': [elem[0].best_score_ for elem in bayes_diff_start_points_forest], 
        'wynik_zbior_2': [elem[1].best_score_ for elem in bayes_diff_start_points_forest], 
        'wynik_zbior_3': [elem[2].best_score_ for elem in bayes_diff_start_points_forest], 
        'wynik_zbior_4': [elem[3].best_score_ for elem in bayes_diff_start_points_forest]
}

results_bayes_forest = pd.DataFrame(res_bayes_forest_dict)
results_bayes_forest['model'] = 'RandomForestClassifier'

frames_bayes = [results_bayes_lr, results_bayes_tree, results_bayes_forest]

results_bayes = pd.concat(frames_bayes)

In [78]:
results_bayes['mean_result'] = (results_bayes['wynik_zbior_1'] + results_bayes['wynik_zbior_2'] + results_bayes['wynik_zbior_3'] + results_bayes['wynik_zbior_4']) / 4

In [79]:
results_bayes 

Unnamed: 0,hiperparametry,wynik_zbior_1,wynik_zbior_2,wynik_zbior_3,wynik_zbior_4,model,mean_result
0,,0.7,0.749817,0.884539,0.2746,LogisticRegression,0.652239
1,,0.7,0.749817,0.884539,0.274839,LogisticRegression,0.652299
2,,0.7,0.749817,0.884539,0.276516,LogisticRegression,0.652718
3,,0.7,0.750002,0.884539,0.276756,LogisticRegression,0.652824
4,,0.7,0.750002,0.884539,0.275798,LogisticRegression,0.652585
0,,0.7,0.753887,0.88476,0.164951,DecisionTreeClassifier,0.6259
1,,0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier,0.614056
2,,0.7,0.753887,0.88476,0.20924,DecisionTreeClassifier,0.636972
3,,0.7,0.706514,0.88476,0.164951,DecisionTreeClassifier,0.614056
4,,0.7,0.847523,0.898251,0.257363,DecisionTreeClassifier,0.675784
