In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def score_df(y_train, y_test, y_pred_train, y_pred_test, average='binary'):
    if len(y_train) != len(y_pred_train): raise Exception('Lengths of true and predicted for train do not match.')
    if len(y_pred_test) != len(y_pred_test): raise Exception('Lengths of true and predicted for test do not match.')
    num_classes = pd.Series( y_train ).nunique()
    score_2darray = [ \
                     [ \
                      len(y_),
                      pd.Series( y_ ).nunique(),
                      accuracy_score(y_, y_pred_), 
                      precision_score(y_, y_pred_, average=average), 
                      recall_score(y_, y_pred_, average=average), 
                      f1_score(y_, y_pred_, average=average) \
                     ] \
                     + ([roc_auc_score(y_, y_pred_)] if num_classes == 2 else []) \
                     for (y_, y_pred_) in [(y_train, y_pred_train), (y_test, y_pred_test)] \
                    ]
    score_df = pd.DataFrame(score_2darray,
                            index = ['train', 'test'], 
                            columns = ['# samples', '# classes', 'accuracy', 'precision', 'recall', 'f1'] \
                            + (['auc'] if num_classes == 2 else []))
    return score_df

In [2]:
from sklearn.metrics import confusion_matrix

def conf_mat_df(y_true, y_pred):
    conf_mat = confusion_matrix(y_true, y_pred)
    num_class = len(conf_mat)
    true_labels = [f'True_{i}' for i in range(num_class)]
    pred_labels = [f'Pred_{i}' for i in range(num_class)]
    conf_mat_df = pd.DataFrame(conf_mat, index = true_labels, columns = pred_labels )
    return conf_mat_df


In [3]:
import numpy as np
seed = 1234
np.random.seed(seed)


In [4]:
# Logistic Regression

multiclass = False # can be set to either True or False
if __name__ == '__main__':
    from sklearn.linear_model import LogisticRegression
    from sklearn import datasets
    from sklearn.model_selection import train_test_split, GridSearchCV
    import pandas as pd
    from IPython.display import display
    
    iris = datasets.load_iris()
    X, y = iris.data[:, :2], iris.target
    if not multiclass:
        y = y - 1
        y_filter = y >= 0 
        X, y = X[y_filter], y[y_filter] # force to binary
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=None)
    
    ## Scale
    from sklearn import datasets, preprocessing
    scaler = preprocessing.StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    ## PCA 
    from sklearn.decomposition import PCA
    pca = PCA(.95)
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    
    params = {'penalty': ['l1','l2'], 'C': [0.1, 1, 10]}
    model = GridSearchCV(LogisticRegression(random_state=None), params, cv=2, return_train_score=False, n_jobs=-1)
    model.fit(X_train, y_train)
    print(f'Best parameter for logistic regression:\n{model.best_params_}')
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    print('\nScore Table:')
    if multiclass:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='macro'))
    else:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='binary'))
    
    print('\nConfusion Matrix for Train:')
    display(conf_mat_df(y_train, y_pred_train))
    
    print('\nConfusion Matrix for Test:')
    display(conf_mat_df(y_test, y_pred_test))
    
    fi_df = pd.DataFrame(model.best_estimator_.coef_.reshape(1, -1), index=['coefficient'])
    display(fi_df)


Best parameter for logistic regression:
{'C': 1, 'penalty': 'l1'}

Score Table:


Unnamed: 0,# samples,# classes,accuracy,precision,recall,f1,auc
train,50,2,0.7,0.684211,0.590909,0.634146,0.688312
test,50,2,0.7,0.782609,0.642857,0.705882,0.707792



Confusion Matrix for Train:


Unnamed: 0,Pred_0,Pred_1
True_0,22,6
True_1,9,13



Confusion Matrix for Test:


Unnamed: 0,Pred_0,Pred_1
True_0,17,5
True_1,10,18


Unnamed: 0,0,1
coefficient,0.827603,0.811302


In [5]:
# Random Forest

multiclass = False # can be set to either True or False
if __name__ == '__main__':
    from sklearn.ensemble import RandomForestClassifier
    from sklearn import datasets
    from sklearn.model_selection import train_test_split, GridSearchCV
    import pandas as pd
    from IPython.display import display
    
    iris = datasets.load_iris()
    X, y = iris.data[:, :2], iris.target
    if not multiclass:
        y = y - 1
        y_filter = y >= 0 
        X, y = X[y_filter], y[y_filter] # force to binary
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=33)
    
    params = {'bootstrap': [True],
            'criterion': ['gini'],
            'max_depth': [None],
            'max_features': ['auto'],
            'min_samples_leaf': [1],
            'min_samples_split': [2],
            'min_impurity_decrease': [0.0],
            'n_estimators': [10],
            }
    model = GridSearchCV(RandomForestClassifier(random_state=None), params, cv=2, return_train_score=False, n_jobs=-1)
    model.fit(X_train, y_train)
    print(f'Best parameter:\n{model.best_params_}')
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    print('\nScore Table:')
    if multiclass:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='macro'))
    else:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='binary'))
    
    print('\nConfusion Matrix for Train:')
    display(conf_mat_df(y_train, y_pred_train))
    
    print('\nConfusion Matrix for Test:')
    display(conf_mat_df(y_test, y_pred_test))
    
    fi_df = pd.DataFrame(model.best_estimator_.feature_importances_.reshape(1, -1), index=['feature importance'])
    display(fi_df)


Best parameter:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': None, 'max_features': 'auto', 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}

Score Table:


Unnamed: 0,# samples,# classes,accuracy,precision,recall,f1,auc
train,50,2,0.96,0.956522,0.956522,0.956522,0.959742
test,50,2,0.56,0.631579,0.444444,0.521739,0.570048



Confusion Matrix for Train:


Unnamed: 0,Pred_0,Pred_1
True_0,26,1
True_1,1,22



Confusion Matrix for Test:


Unnamed: 0,Pred_0,Pred_1
True_0,16,7
True_1,15,12


Unnamed: 0,0,1
feature importance,0.633054,0.366946


In [6]:
# XGBoost

multiclass = False # can be set to either True or False
if __name__ == '__main__':
    from xgboost import XGBClassifier
    from sklearn import datasets
    from sklearn.model_selection import train_test_split, GridSearchCV
    import pandas as pd
    from IPython.display import display
    
    iris = datasets.load_iris()
    X, y = iris.data[:, :2], iris.target
    if not multiclass:
        y = y - 1
        y_filter = y >= 0 
        X, y = X[y_filter], y[y_filter] # force to binary
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state=33)
    
    params = {
        'max_depth':[3], 
        'learning_rate':[0.1], 
        'n_estimators':[100], 
        'silent':[True], 
        'objective':['binary:logistic'], 
        'booster':['gbtree'], 
        'n_jobs':[-1], 
        'nthread':[None], 
        'gamma':[0], 
        'min_child_weight':[1], 
        'max_delta_step':[0], 
        'subsample':[1], 
        'colsample_bytree':[1], 
        'colsample_bylevel':[1], 
        'reg_alpha':[0], 
        'reg_lambda':[1], 
        'scale_pos_weight':[1], 
        'base_score':[0.5], 
        'missing':[None],
                }
    model = GridSearchCV(XGBClassifier(random_state=seed), params, cv=2, return_train_score=False, n_jobs=-1)
    model.fit(X_train, y_train)
    print(f'Best parameter:\n{model.best_params_}')
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    print('\nScore Table:')
    if multiclass:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='macro'))
    else:
        display(score_df(y_train, y_test, y_pred_train, y_pred_test, average='binary'))
    
    print('\nConfusion Matrix for Train:')
    display(conf_mat_df(y_train, y_pred_train))
    
    print('\nConfusion Matrix for Test:')
    display(conf_mat_df(y_test, y_pred_test))
    
    fi_df = pd.DataFrame(model.best_estimator_.feature_importances_.reshape(1, -1), index=['feature importance'])
    display(fi_df)


Best parameter:
{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bytree': 1, 'gamma': 0, 'learning_rate': 0.1, 'max_delta_step': 0, 'max_depth': 3, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': -1, 'nthread': None, 'objective': 'binary:logistic', 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'silent': True, 'subsample': 1}

Score Table:


  if diff:
  if diff:


Unnamed: 0,# samples,# classes,accuracy,precision,recall,f1,auc
train,50,2,0.94,0.954545,0.913043,0.933333,0.938003
test,50,2,0.58,0.625,0.555556,0.588235,0.582126



Confusion Matrix for Train:


Unnamed: 0,Pred_0,Pred_1
True_0,26,1
True_1,2,21



Confusion Matrix for Test:


Unnamed: 0,Pred_0,Pred_1
True_0,14,9
True_1,12,15


Unnamed: 0,0,1
feature importance,0.580175,0.419825
