In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# skopt
from skopt import BayesSearchCV

#helpers
import helpers

c:\Python38\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Python38\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
RANDOM_STATE = 123

In [4]:
# Load datasets
import warnings
warnings.filterwarnings('ignore')

Xs = []
ys = []

df_1 = fetch_openml(data_id=31)
df_2 = fetch_openml(data_id=1489)
df_3 = fetch_openml(data_id=1558)
df_4 = fetch_openml(data_id=18)

X_1 = df_1.data
y_1 = df_1.target
Xs.append(X_1)
ys.append(y_1)

X_2 = df_2.data
y_2 = df_2.target
Xs.append(X_2)
ys.append(y_2)

X_3 = df_3.data
y_3 = df_3.target
Xs.append(X_3)
ys.append(y_3)

X_4 = df_3.data
y_4 = df_3.target
Xs.append(X_4)
ys.append(y_4)

warnings.filterwarnings('default')

In [5]:
# split datasets into train and test
X_train_arr = []
X_test_arr = []
y_train_arr = []
y_test_arr = []
cols_num = []
cols_cat = []

for i in range(len(Xs)):
    X_train, X_test, y_train, y_test = train_test_split(Xs[i], ys[i], test_size=0.3, random_state=RANDOM_STATE)

    X_train_arr.append(X_train)
    X_test_arr.append(X_test)
    y_train_arr.append(y_train)
    y_test_arr.append(y_test)

    numeric_features = X_train_arr[i].select_dtypes(exclude=['category']).columns
    categorical_features = X_train_arr[i].select_dtypes(include=['category']).columns

    cols_num.append(numeric_features)
    cols_cat.append(categorical_features)

In [6]:
# helper function to wrap model and do imputing and encoding

def getPipeline(model, num_cols, cat_cols):
    numeric_transformer = Pipeline(
        steps=[("imputer", SimpleImputer(strategy="median"))]
    )

    categorical_transformer = Pipeline(
        steps=[
            ("imputer", SimpleImputer(strategy='most_frequent')),
            ("encoder", OneHotEncoder(handle_unknown="ignore"))
        ]
    )
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, num_cols),
            ("cat", categorical_transformer, cat_cols),
        ]
    )

    pipe = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", model)]
    )

    return pipe

### Parameters grid

In [7]:
grid_log_reg = {
    'classifier__C': np.logspace(-4, 4, 50),
    'classifier__l1_ratio': np.linspace(0, 1, 50)
}

grid_tree = {
    'classifier__max_depth': np.concatenate(([None], range(5, 100, 5))),
    'classifier__min_samples_split': range(1, 10),
    'min_samples_leaf': range(2, 50, 2),
    'max_leaf_nodes': np.concatenate(([None], range(30, 330, 30))),
    'ccp_alpha': np.logspace(-3, 3, 50)
}

In [8]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

### Important helpers. They Return best CV scores and params for each set

In [11]:
def getRandomCVResults(model, grid, n_iter=10, n_jobs=1, verbose=0):
    search_res = []
    for i in range(len(X_train_arr)):
        clf = getPipeline(model, cols_num[i], cols_cat[i])

        res = RandomizedSearchCV(clf, param_distributions=grid, random_state=RANDOM_STATE, cv=cv, scoring='accuracy', n_iter=n_iter, n_jobs=n_jobs, verbose=verbose)
        res.fit(X_train_arr[i], y_train_arr[i])

        search_res.append(res)
    
    return search_res

def getBayesCVResults(model, grid, n_iter=10, n_jobs=1, verbose=0):
    search_res = []
    for i in range(len(X_train_arr)):
        clf = getPipeline(model, cols_num[i], cols_cat[i])

        res = BayesSearchCV(clf, search_spaces=grid, random_state=RANDOM_STATE, cv=cv, scoring='accuracy', n_iter=n_iter, n_jobs=n_jobs, verbose=verbose)
        res.fit(X_train_arr[i], y_train_arr[i])
        search_res.append(res)
        print(f"Fit for set number {i} is done")
    
    return search_res


# Logistic Regression

### Look for best hyperparams (Random search method) for each set

In [12]:
warnings.filterwarnings('ignore')

lr = LogisticRegression(penalty='elasticnet', solver='saga', random_state=RANDOM_STATE, max_iter=1000)
search_rand_lr = getRandomCVResults(lr, grid_log_reg, 50, -1)

warnings.filterwarnings('default')

In [15]:
for el in search_rand_lr:
    print(el.best_params_)
    print(el.best_score_)
    print("\n")

{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
0.7142857142857143


{'classifier__l1_ratio': 0.9795918367346939, 'classifier__C': 1.2067926406393288}
0.7525106064736182


{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
0.8808471813946047


{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
0.8808471813946047




### Look for optimal, default hyperparamer dictionary (for Logistic Regression)

In [21]:
warnings.filterwarnings('ignore')

hyper_params_scores = []
for search_res_idx in range(len(search_rand_lr)):
    params = helpers.decodeParams(search_rand_lr[search_res_idx].best_params_)
    scores = []
    print(params)
    for i in range(len(X_train_arr)):
        model = getPipeline(LogisticRegression(penalty='elasticnet', max_iter=1000, random_state=RANDOM_STATE, solver='saga', **params), cols_num[i], cols_cat[i])
        score = cross_val_score(model, X_train_arr[i], y_train_arr[i], cv=cv)
        scores.append(np.mean(score))
    

    hyper_params_scores.append(scores)

warnings.filterwarnings('default')

{'l1_ratio': 0.7346938775510203, 'C': 4714.8663634573895}
{'l1_ratio': 0.9795918367346939, 'C': 1.2067926406393288}
{'l1_ratio': 0.7346938775510203, 'C': 4714.8663634573895}
{'l1_ratio': 0.7346938775510203, 'C': 4714.8663634573895}


### Verify whether it is real that different hyperparams gives the same score

In [24]:
model = getPipeline(LogisticRegression(penalty='elasticnet', max_iter=1000, random_state=RANDOM_STATE, solver='saga', l1_ratio=0.735, C=4714.9), cols_num[0], cols_cat[0])
print(cross_val_score(model, X_train_arr[0], y_train_arr[0], cv=cv))

model = getPipeline(LogisticRegression(penalty='elasticnet', max_iter=1000, random_state=RANDOM_STATE, solver='saga', l1_ratio=0.98, C=1.207), cols_num[0], cols_cat[0])
print(cross_val_score(model, X_train_arr[0], y_train_arr[0], cv=cv))



[0.71428571 0.71428571 0.71428571 0.71428571 0.71428571]




[0.71428571 0.71428571 0.71428571 0.71428571 0.71428571]




In [25]:
[np.mean(scores) for scores in hyper_params_scores]

[0.807056533320998, 0.8071226708871355, 0.807056533320998, 0.807056533320998]

### Look for best hyperparams (Bayesian method) for each set

In [26]:
warnings.filterwarnings('ignore')

lr = LogisticRegression(penalty='elasticnet', solver='saga', random_state=RANDOM_STATE, max_iter=1000)
search_bayes_lr = getBayesCVResults(lr, grid_log_reg, 50, -1)

warnings.filterwarnings('default')


Fit for set number 0 is done
Fit for set number 1 is done
Fit for set number 2 is done
Fit for set number 3 is done


In [27]:
for i in range(4):
    print(search_bayes_lr[i].best_score_)
    print(search_bayes_lr[i].best_params_)
    print(search_rand_lr[i].best_score_)
    print(search_rand_lr[i].best_params_)
    print(10*'*')

0.7142857142857143
OrderedDict([('classifier__C', 719.6856730011514), ('classifier__l1_ratio', 0.36734693877551017)])
0.7142857142857143
{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
**********
0.7525106064736182
OrderedDict([('classifier__C', 0.8286427728546842), ('classifier__l1_ratio', 0.5714285714285714)])
0.7525106064736182
{'classifier__l1_ratio': 0.9795918367346939, 'classifier__C': 1.2067926406393288}
**********
0.8808471813946047
OrderedDict([('classifier__C', 719.6856730011514), ('classifier__l1_ratio', 0.36734693877551017)])
0.8808471813946047
{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
**********
0.8808471813946047
OrderedDict([('classifier__C', 719.6856730011514), ('classifier__l1_ratio', 0.36734693877551017)])
0.8808471813946047
{'classifier__l1_ratio': 0.7346938775510203, 'classifier__C': 4714.8663634573895}
**********
