In [1]:
import sys; sys.path.append('../../')
from DataPreparation.DataPreparation import read_data
from ModelPipelines.ModelAnalysis import recursive_feature_elimination, test_log_linearity, show_hyperparams, vc_dimension_check
from sklearn.linear_model import LogisticRegression
from ClassImbalanceHandle.ClassImbalanceHandle import *
import numpy as np
from sklearn.model_selection import cross_val_predict
from mlpath import mlquest as mlq
from utils import nice_table
from IPython.display import display, HTML


### Initalize Model

In [2]:
x_data_d, y_data_d = read_data(kind='Numerical')
clf = LogisticRegression(multi_class='multinomial')
clf.fit(x_data_d, y_data_d)

### Hyperparameters

In [3]:
show_hyperparams(clf)

0,1
C,1.0
class_weight,
dual,False
fit_intercept,True
intercept_scaling,1
l1_ratio,
max_iter,100
multi_class,multinomial
n_jobs,
penalty,l2


### VC Dimension

In [4]:
vc_dimension_check(clf, x_data_d)

Model generalization is safe. VC Bound is satisfied where 10dvc=370 < N=1180


### Recursive Feature Elimination

In [5]:
recursive_feature_elimination(clf, min_feats=1, cv=5, x_data_d=x_data_d, y_data_d=y_data_d)

Features to keep ['Age' 'Height' 'Weight' 'Veg_Consump']


ModuleNotFoundError: No module named 'matplotlib_inline'

### Test Log-Linearity

In [None]:
test_log_linearity(clf, class_index=3, x_data_d=x_data_d, y_data_d=y_data_d)

# Handling class imbalance

1- oversampling

In [26]:
methods=['no resampling', 'SMOTE','BorderlineSMOTE','under','cost']
accuracies = []
for method in methods:
    if method == "cost":
        w = handle_class_imbalance(x_data_d, y_data_d, method=method,k=5)
        clf = LogisticRegression(multi_class='multinomial', class_weight=w)
        clf.fit(x_data_d, y_data_d)
        y_pred = mlq.l(cross_val_predict)(clf, x_data_d, y_data_d, cv=4)
        accuracies.append( np.mean(y_pred == y_data_d))
    else:
        bal_x, bal_y = handle_class_imbalance(x_data_d, y_data_d, method=method,k=5)
        clf = LogisticRegression(multi_class='multinomial')
        clf.fit(bal_x, bal_y)
        y_pred = mlq.l(cross_val_predict)(clf, bal_x, bal_y, cv=4)
        accuracies.append( np.mean(y_pred == bal_y))
    




In [27]:
show_results(accuracies, methods,title="Resampling")

Unnamed: 0,Method,Accuracy
0,no resampling,0.933051
1,SMOTE,0.961857
2,BorderlineSMOTE,0.965074
3,under,0.914474
4,cost,0.668644


2-SMOTE with const k and multiple sample strategy

In [8]:
methods=['SMOTE','SMOTE','SMOTE','SMOTE']
sampling_ratios=[[1,1,1],[0.5,0.6,0.9] , [0.4,0.5,.8] , [0.7,0.8,1]]
accuracies = []
for i,r in enumerate(sampling_ratios):
    bal_x, bal_y = handle_class_imbalance(x_data_d, y_data_d, method="SMOTE",k=5, sampling_ratio=sampling_ratios[i])
    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(bal_x, bal_y)
    y_pred = mlq.l(cross_val_predict)(clf, bal_x, bal_y, cv=4)
    accuracies.append( np.mean(y_pred == bal_y))



In [9]:
show_results(accuracies, methods, sample_ratio= sampling_ratios, title="SMOTE, k=5")

Unnamed: 0,Method,Sampling Ratio,Accuracy
0,SMOTE,"[1, 1, 1]",0.962316
1,SMOTE,"[0.5, 0.6, 0.9]",0.95095
2,SMOTE,"[0.4, 0.5, 0.8]",0.950272
3,SMOTE,"[0.7, 0.8, 1]",0.960589


2-SMOTE with const sample strategy and multiple values of k

In [20]:
Ks=[5,15,25,70]
sampling_ratio=[0.5,0.6,0.9]
accuracies = []
for i,k in enumerate(Ks):
    bal_x, bal_y = handle_class_imbalance(x_data_d, y_data_d, method="SMOTE",k=k, sampling_ratio=sampling_ratio)
    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(bal_x, bal_y)
    y_pred = mlq.l(cross_val_predict)(clf, bal_x, bal_y, cv=4)
    accuracies.append( np.mean(y_pred == bal_y))



In [21]:
show_results(accuracies, methods,k=Ks, title="SMOTE, k=5")

Unnamed: 0,Method,K,Accuracy
0,BorderlineSMOTE,5,0.954016
1,BorderlineSMOTE,15,0.958308
2,BorderlineSMOTE,25,0.955855
3,BorderlineSMOTE,70,0.952177


4- BorderlineSMOTE with const k and multiple sample strategy

In [22]:
methods=['BorderlineSMOTE','BorderlineSMOTE','BorderlineSMOTE','BorderlineSMOTE']
sampling_ratios=[[1,1,1],[0.5,0.6,0.9] , [0.4,0.5,.8] , [0.7,0.8,1]]
accuracies = []
for i,r in enumerate(sampling_ratios):
    bal_x, bal_y = handle_class_imbalance(x_data_d, y_data_d, method="BorderlineSMOTE",k=5, sampling_ratio=sampling_ratios[i])
    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(bal_x, bal_y)
    y_pred = mlq.l(cross_val_predict)(clf, bal_x, bal_y, cv=4)
    accuracies.append( np.mean(y_pred == bal_y))



In [23]:
show_results(accuracies, methods, sample_ratio= sampling_ratios, title="BorderlineSMOTE, k=5")

Unnamed: 0,Method,Sampling Ratio,Accuracy
0,BorderlineSMOTE,"[1, 1, 1]",0.966912
1,BorderlineSMOTE,"[0.5, 0.6, 0.9]",0.95279
2,BorderlineSMOTE,"[0.4, 0.5, 0.8]",0.948229
3,BorderlineSMOTE,"[0.7, 0.8, 1]",0.96269


5- BorderlineSMOTE with const sample strategy and multiple k values

In [24]:
Ks=[5,15,25,50]
sampling_ratio=[0.5,0.6,0.9]
accuracies = []
for i,k in enumerate(Ks):
    bal_x, bal_y = handle_class_imbalance(x_data_d, y_data_d, method="BorderlineSMOTE",k=k, sampling_ratio=sampling_ratio)
    clf = LogisticRegression(multi_class='multinomial')
    clf.fit(bal_x, bal_y)
    y_pred = mlq.l(cross_val_predict)(clf, bal_x, bal_y, cv=4)
    accuracies.append( np.mean(y_pred == bal_y))



In [25]:
show_results(accuracies, methods,k=Ks, title="SMOTE, k=5")

Unnamed: 0,Method,K,Accuracy
0,BorderlineSMOTE,5,0.953403
1,BorderlineSMOTE,15,0.951563
2,BorderlineSMOTE,25,0.952177
3,BorderlineSMOTE,50,0.95279
