In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split, cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, make_scorer, roc_auc_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.decomposition import PCA
import random
from sklearn.neighbors import LocalOutlierFactor
from sklearn.feature_selection import RFECV

from sklearn.impute import SimpleImputer, KNNImputer

import tensorflow as tf
from tensorflow.keras import layers, models

from Functions_Classes import *

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE, RandomOverSampler

from feature_engine.encoding import CountFrequencyEncoder

from xgboost import XGBClassifier

import matplotlib.pyplot as plt

# Show every column when a wide frame is displayed (no truncation with "...").
pd.set_option('display.max_columns', None)

# Seed the random number generators from one constant so that a fresh
# "Restart Kernel -> Run All" reproduces the same numbers.
# NOTE(review): TensorFlow keeps its own generator; the Keras section should
# call tf.random.set_seed(SEED) before building a model.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Folder that holds the competition spreadsheets. Change this single constant
# when running on another machine instead of editing every read call.
DATA_DIR = "C:/Users/Cagan Deliktas/Desktop/ProjectDataMining2/DM2_DataCraft/data"

df = pd.read_excel(f"{DATA_DIR}/training_data.xls")
X_test_compete = pd.read_excel(f"{DATA_DIR}/test_data_no_target.xls")

# 'Perform' and 'Group' are excluded from the feature set
# (presumably because they are not available / not allowed as predictors — confirm).
# errors='ignore' keeps the original tolerance for a column that is already absent.
df = df.drop(columns=['Perform', 'Group'], errors='ignore')

# Split into the feature matrix and the 'Class' target.
df_x = df.drop(columns='Class')
df_y = df['Class']

# Work on copies so `df_x` / `df_y` stay untouched by later preprocessing.
X_train = df_x.copy()
y_train = df_y.copy()

# No hold-out split is made here: the whole labelled set is used for training
# and model quality is estimated with cross-validation further down.
#X_train, X_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2,shuffle=True, stratify=df_y, random_state=0)

In [3]:
# Every column except 'Group' is treated as numeric. 'Group' was already
# removed when the data was loaded, so the filter is only a safeguard.
numeric_columns = [col for col in X_train.columns if col not in ['Group']]

# Textual placeholders for "missing" become NaN, then everything is cast to float.
missing_markers = ['NA', '', ' ']
X_train[numeric_columns] = (
    X_train[numeric_columns]
    .replace(missing_markers, np.nan)
    .astype(float)
)

In [4]:
# Preview the first five rows to check the result of the numeric conversion above.
X_train.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,I14,I15,I16,I17,I18,I19,I20,I21,I22,I23,I24,I25,I26,I27,I28,I29,I30,I31,I32,I33,I34,I35,I36,I37,I38,I39,I40,I41,I42,I43,I44,I45,I46,I47,I48,I49,I50,I51,I52,I53,I54,I55,I56,I57,I58,dI1,dI2,dI3,dI4,dI5,dI6,dI7,dI8,dI9,dI10,dI11,dI12,dI13,dI14,dI15,dI16,dI17,dI18,dI19,dI20,dI21,dI22,dI23,dI24,dI25,dI26,dI27,dI28,dI29,dI30,dI31,dI32,dI33,dI34,dI35,dI36,dI37,dI38,dI39,dI40,dI41,dI42,dI43,dI44,dI45,dI46,dI47,dI48,dI49,dI50,dI51,dI52,dI53,dI54,dI55,dI56,dI57,dI58
0,0.136495,-0.028429,-0.037772,-0.232459,-0.016222,-0.187506,-0.322545,-0.043743,0.125389,-0.014757,-0.033105,0.303035,-0.093811,-0.598917,-0.271292,-0.256749,-0.100146,-0.045525,-0.078422,-0.060129,-0.069528,-0.052432,-0.114432,-0.104989,0.342845,-0.159417,0.006772,-0.303193,-0.163287,-0.080599,-0.82888,-1.064215,-0.547067,-0.540497,-0.676045,-0.305007,-0.507724,-0.191437,-0.087362,-0.856151,0.802525,0.73308,0.006512,0.53329,0.195197,0.058094,-0.228889,-0.150821,-0.104986,-0.026743,0.188312,-0.250701,-0.10119,-0.357521,-0.527956,0.611385,-0.092714,-0.055733,-0.065709,-0.002144,-0.004367,-0.079805,0.17828,0.078155,0.072802,0.00209,0.21177,-0.003073,-0.188447,0.117769,0.001613,-0.024223,0.103204,0.032484,0.002688,0.000765,-0.004447,0.148967,-0.018521,-0.01411,-0.001996,-0.002369,-0.120036,0.013172,-0.215571,-0.021999,0.001728,-5e-05,-0.01212,-0.040172,-0.060103,-0.059464,-0.044899,0.015735,0.022919,-0.003106,0.001233,-0.002339,0.040628,0.411684,0.07309,0.526222,0.07106,-0.019531,0.359889,-0.020476,0.057151,0.07711,0.102563,0.188481,-0.016027,-0.135451,-0.189667,0.250967,0.022171,-0.004265
1,-0.714522,-0.042137,-0.052968,-0.796862,-0.018394,0.070102,-0.076321,-0.063864,-1.045521,-0.037353,-0.792515,-1.082483,0.025798,-0.833652,-0.625088,-0.333608,0.072579,-0.046963,0.223022,-0.605902,-0.131099,-0.235929,-0.07392,-0.063247,-0.798768,-0.899983,1.388771,-0.248677,-0.058083,-0.01447,0.092095,0.561368,0.224819,0.22319,0.098852,-0.128227,-0.215876,-0.007164,-0.03526,-0.123911,-0.089751,-0.094963,0.362818,0.011107,-1.506356,-0.573679,-0.955222,-0.81888,-1.063295,-1.022679,-1.336188,-0.612039,-0.061357,-0.482805,-0.017077,1.192135,-0.114981,-0.028074,-0.004451,-0.000536,-0.002288,-0.045597,-0.080639,-0.081924,-0.033862,-0.005111,-0.261836,0.000122,-0.045046,0.999854,-0.008835,-0.122379,-0.199892,0.013615,0.014404,-0.000405,0.021573,-0.02416,-0.03742,-0.01261,0.003007,0.003617,-0.106893,-0.394834,-0.132496,-0.027354,-0.129804,-0.066157,-0.494334,0.123781,0.284328,0.281308,0.212767,0.192042,0.146926,-0.118826,-0.039203,-0.256107,0.176622,0.16884,0.487752,0.029464,0.014232,0.039633,0.025667,0.006626,0.00518,0.006128,-0.016375,0.020727,-0.006525,-0.01879,-0.098543,0.317744,-0.180502,-0.009215
2,0.104791,-0.038188,-0.053191,0.620233,0.148587,0.489875,0.319274,-0.060246,0.053174,-0.025008,-0.45684,1.28445,-0.13347,3.207672,2.37323,1.304427,,,-0.361293,2.995661,,-0.188988,-0.044158,-0.02455,-0.586562,-0.176292,-1.013037,0.066912,0.219649,0.15449,2.370951,1.384675,0.489152,0.484715,0.367301,0.749572,0.66941,0.423228,0.226897,3.227283,-0.329997,-0.327579,-1.033898,0.014531,0.211889,-1.197156,2.860444,,3.584223,,1.272375,7.427558,-0.182816,-2.713205,-1.877595,-0.568691,0.224945,0.052749,0.37764,0.002656,0.001226,0.22606,0.207653,0.270327,0.283061,0.002934,0.454366,0.004264,0.188623,-0.265918,0.0,2.063796,1.076458,0.240011,,,-0.028327,1.764826,,0.005847,-0.011166,-0.012626,-0.010822,0.056514,-0.100007,-0.216081,-0.127274,-0.056206,0.175751,-0.01177,0.493157,0.487919,0.438576,0.574623,0.564379,-0.165933,-0.051256,0.410379,0.056624,0.047592,0.0,-0.020586,0.237539,0.017314,0.516667,,0.404158,,0.272937,0.774169,-0.007144,0.123954,0.0,-0.110103,0.186669,-0.03072
3,-0.532847,-0.006582,-0.023377,1.306702,-0.068909,0.048024,-0.119481,-0.021057,-1.012916,-0.011783,1.206727,0.311773,-0.005928,3.869459,-1.064793,0.107702,-0.126984,-0.04436,-0.181023,-0.691971,,0.195138,-0.104877,-0.093976,-0.757725,0.004432,-1.471299,0.643575,-0.067005,-0.006874,-0.087499,0.110638,0.04688,0.047141,-0.274713,0.169046,-0.179742,0.047391,0.015197,0.105158,-0.045135,-0.051329,0.202098,0.034693,2.904519,4.514844,-0.241111,,-0.521576,,-0.308812,-0.542532,-0.165028,1.490354,-1.550745,-0.918676,0.013484,-0.013198,0.050586,0.010356,0.007522,0.194792,0.010436,0.10788,0.122549,0.017641,0.136566,0.010365,0.086853,-0.286395,-0.014883,0.347297,0.017765,0.068701,0.01554,0.000208,0.016119,0.003992,,0.043909,-0.000107,9.9e-05,-0.003895,0.00249,-0.003034,-0.015845,0.002377,0.001974,0.05634,0.010802,0.063094,0.062424,0.057012,0.118399,0.116161,-0.017039,0.000839,0.054025,0.030561,0.006389,-0.073937,0.764136,-0.076195,-0.114682,0.119667,,0.001799,,0.004938,0.018494,-0.00335,-0.029214,0.045747,-0.076884,-0.037859,-0.012046
4,-0.200815,-0.016334,-0.036754,-0.886675,0.484495,-1.148744,0.152517,-0.04358,-0.935537,-0.023262,-0.908986,-0.525121,0.015492,-0.347325,0.29636,-0.242201,0.120049,-0.048293,0.290658,-0.345816,0.249586,-0.241812,-0.082055,-0.077706,-0.845163,-0.257777,0.919065,-0.522102,0.146076,0.043851,1.281726,0.039106,0.135331,0.134652,0.654099,1.437536,1.995784,-0.145004,-0.029483,0.252151,0.308723,0.293393,-0.527888,-0.00368,-1.553644,-1.233945,-0.947111,-0.926073,-0.772468,-0.63644,-0.833875,-0.527935,-0.01417,-0.142943,1.070523,-0.284682,-0.15511,-0.026941,0.480767,0.021831,-0.003234,-0.041412,0.112513,-0.157224,-0.14618,-0.014677,-0.45195,0.034598,-0.114443,-0.307095,-0.346711,0.104144,-0.50892,-0.096666,0.044162,0.000159,0.085082,0.254664,-0.000408,-0.01539,-0.006226,-0.012542,-0.101059,0.091145,0.28211,-0.005348,0.112377,0.036976,0.73157,0.050165,0.038419,0.038011,0.265998,1.61412,1.806955,-0.122743,-0.001985,0.126103,0.630259,0.618027,-1.599633,0.032793,-0.126733,-0.163593,-0.225889,-0.02646,-0.080892,-0.095963,-0.014812,-0.324584,-0.019002,-0.379323,-0.046024,0.282145,0.011008,0.010496


## Shape

In [5]:
# Dimensions of the training feature matrix as (rows, columns).
X_train.shape

(8000, 116)

# Classification Models

In [6]:
def _make_estimators(names):
    """Build fresh ``(name, estimator)`` pairs for the requested model names.

    Every call creates new, unfitted estimator objects, so the voting,
    stacking and stand-alone candidate lists never share instances.

    Parameters
    ----------
    names : iterable of str
        Keys of the catalogue below, in the order the pairs should be returned.

    Returns
    -------
    list of (str, estimator)
        Pairs in the format expected by VotingClassifier / StackingClassifier.

    Raises
    ------
    KeyError
        If a requested name is not in the catalogue.
    """
    # All estimators that accept a seed get random_state=0 (XGBoost included,
    # via the scikit-learn style `random_state` argument rather than `seed`).
    catalogue = {
        'RandomForest': lambda: RandomForestClassifier(random_state=0),
        'DecisionTree': lambda: DecisionTreeClassifier(random_state=0),
        'SVM': lambda: SVC(random_state=0, probability=True),
        'NaiveBayes': lambda: GaussianNB(),
        'KNN': lambda: KNeighborsClassifier(),
        'LogisticRegression': lambda: LogisticRegression(random_state=0, solver="saga", max_iter=1000),
        'AdaBoost': lambda: AdaBoostClassifier(random_state=0, algorithm='SAMME'),
        'GradientBoost': lambda: GradientBoostingClassifier(random_state=0),
        'XGBoost': lambda: XGBClassifier(random_state=0),
        # hidden_layer_sizes=(20, 20) would give 2 hidden layers with 20 neurons each
        'NeuralNetwork': lambda: MLPClassifier(random_state=0, max_iter=1000),
    }
    return [(name, catalogue[name]()) for name in names]


# To try another model, add its catalogue name to the relevant list below
# (available: DecisionTree, SVM, NaiveBayes, KNN, LogisticRegression, NeuralNetwork).

# Soft-voting ensemble: averages the predicted class probabilities of its members.
voting_estimators = _make_estimators(
    ['RandomForest', 'AdaBoost', 'GradientBoost', 'XGBoost']
)

vote_model = VotingClassifier(
    estimators=voting_estimators,
    voting='soft'
)

# Stacking ensemble: a logistic regression is trained on the base models'
# 5-fold cross-validated predictions.
stacking_estimators = _make_estimators(
    ['RandomForest', 'AdaBoost', 'GradientBoost', 'XGBoost']
)
meta_stack_classifier = LogisticRegression(random_state=0, solver="saga", max_iter=1000)

stacking_model = StackingClassifier(
    estimators=stacking_estimators,
    final_estimator=meta_stack_classifier,
    cv=5
)

# Candidates compared by cross-validation further down.
# The ensembles can be added with:
#     estimators += [('Voting', vote_model), ('Stacking', stacking_model)]
# NOTE(review): the recorded CV output below also lists AdaBoost, so it was
# produced with 'AdaBoost' in this list; re-run the notebook to refresh it.
estimators = _make_estimators(
    ['RandomForest', 'GradientBoost', 'XGBoost']
)

# Create Pipeline with different combination of preprocessing steps

## Combination 7
#### KNN imputation, robust scaling, LOF outlier removal, SMOTE oversampling, RFECV feature selection

In [14]:
# Preprocessing "combination 7":
#   KNN imputation -> robust scaling -> LOF outlier removal -> SMOTE -> RFECV.
# All helpers come from Functions_Classes.
#
# NOTE(review): scaling, SMOTE and feature selection are all fitted on the
# complete training set here, and the cross-validation further down runs on the
# resulting frame. Validation folds therefore contain rows (and synthetic
# neighbours of rows) that influenced preprocessing, so the CV scores are
# likely optimistic. Consider fitting these steps inside each fold.

# Fitted preprocessing objects, collected under descriptive keys
# (presumably to re-apply the same transformations to the test data later).
objs = dict()

# --- 1. Impute missing values (KNN, 5 neighbours) -------------------------
X_trainP, imp = handle_missing_vals_knn(
    X_train,
    n_neighbors=5
)

objs['miss'] = imp
print('Shape of xtrain: ', X_trainP.shape)

# --- 2. Robust scaling ------------------------------------------------------
# Scale every column whose name does not contain 'Group'.
# The `std_*` variable names are historical: the scaler applied here is the
# robust scaler, not a standard scaler.
std_scale_cols = (
    X_trainP
    .loc[:, ~X_trainP.columns.str.contains('Group')]
    .columns
)

X_trainP, std_scaler = apply_robust_scaler(
    X_trainP,
    std_scale_cols
)

objs['robust_scaler'] = std_scaler

# --- 3. Outlier removal with Local Outlier Factor ---------------------------
# Features and target are joined first so that a row flagged as an outlier
# is removed from both at once; indices are reset so the rows line up.
X_trainP_df = pd.concat(
    [
        X_trainP.reset_index(drop=True),
        pd.Series(y_train, name='Class').reset_index(drop=True)
    ],
    axis=1
)

# The first element of the helper's return value is the frame that is kept.
X_trainP_df = detect_outliers_with_lof(
    data=X_trainP_df
)[0]

X_trainP = (
    X_trainP_df
    .loc[:, X_trainP_df.columns != 'Class']
)

y_trainP = X_trainP_df['Class']
print('Shape of xtrain: ', X_trainP.shape)

# --- 4. Oversampling with SMOTE ---------------------------------------------
X_trainP, y_trainP = apply_smote(X_trainP, y_trainP)

# --- 5. Recursive feature elimination with cross-validation -----------------
# A fixed random_state makes the tree, and therefore the selected feature
# subset, reproducible regardless of the order in which cells were executed.
classifier = DecisionTreeClassifier(random_state=0)
X_trainP = select_features_rfecv(
    X=X_trainP,
    y=y_trainP,
    classifier=classifier,
    cv=3,
    scoring=matrix_error_function
)

print('X_train last shape: ', X_trainP.shape)
print('X_train: ')
display(X_trainP.head())

Shape of xtrain:  (8000, 116)
Shape of xtrain:  (5669, 116)
{0: -1, 1: 0, 2: 1}
X_train last shape:  (8097, 62)
X_train: 


Unnamed: 0,I1,I5,I7,I8,I10,I12,I14,I17,I20,I24,I25,I26,I27,I29,I31,I34,I37,I41,I43,I44,I45,I46,I48,I49,I55,dI5,dI6,dI8,dI10,dI11,dI14,dI15,dI16,dI17,dI18,dI19,dI20,dI21,dI24,dI25,dI28,dI29,dI32,dI33,dI37,dI38,dI39,dI40,dI42,dI43,dI44,dI45,dI46,dI48,dI49,dI50,dI51,dI52,dI54,dI56,dI57,dI58
0,0.257222,0.228407,-0.340551,0.052209,0.271002,0.496189,-0.597146,-0.401116,0.253007,-0.50614,0.723366,-0.359138,0.209829,-0.886616,-0.725255,-0.448191,-0.407886,2.020021,0.087606,10.853418,0.23296,0.294091,0.202464,0.248966,-0.361939,0.864354,0.42059,0.152012,-0.27844,-0.697403,-0.151908,0.330271,0.480937,0.105801,0.828692,-0.153417,0.627844,-1.023557,-0.350078,-1.124496,-0.481862,-0.011507,-0.365698,-0.313408,-0.028306,-0.039932,-0.00059,-0.08433,1.349119,0.304139,6.665128,0.326409,-0.114915,-0.270225,0.285937,0.395921,0.367199,0.934762,-0.56983,0.991797,0.310161,-0.306106
1,-0.361863,0.031782,0.022637,0.94332,0.387194,0.512188,7.264564,-0.669534,-0.708633,-0.182917,-0.596198,0.054723,-1.110256,0.071068,0.096117,0.119222,-0.120504,-0.180083,0.2535,0.652781,2.104211,4.149428,-0.132189,-0.270833,-1.142337,0.056756,0.588031,1.596422,0.754412,0.26882,3.652596,0.076181,0.994636,0.589544,0.246309,0.312247,0.002619,4.374371,0.063015,-0.020819,-0.343208,0.000404,0.098332,0.187312,0.307731,-0.228564,-0.015191,0.160223,0.00608,-0.307661,9.676799,-0.342343,-0.442173,-0.230985,0.061436,-0.192504,0.069321,0.11101,-0.139752,-0.187492,-0.543427,-0.767147
2,-0.054761,2.097046,0.509116,0.058617,-0.06122,-1.020109,-0.154491,1.801177,-0.181799,0.29459,-0.701036,-0.607583,1.024611,2.190536,1.613072,0.203722,1.785721,0.738357,-0.365665,-0.132286,-0.974913,-0.823595,-0.63251,-0.583883,0.857716,0.547907,-0.905303,-1.405268,2.616926,-0.437669,1.162614,-1.490145,-1.350938,1.666858,0.195359,1.873735,1.08368,0.053307,-2.052288,-0.944161,-0.10671,2.019419,0.456669,0.087022,6.401213,-1.659575,-0.119987,0.472956,2.032882,-6.656276,0.418976,-0.571856,-0.610394,-0.343737,-0.273945,-0.25454,0.009058,-1.551534,-1.557089,1.103946,0.151429,0.568551
3,0.512351,1.587358,1.427926,2.454445,0.643838,0.578078,0.243019,-0.460245,0.69965,2.244842,-0.111783,-0.480874,0.504349,-0.852084,-0.106524,1.172754,0.927211,0.569384,0.017579,-0.110495,-0.361345,-0.409949,-0.317592,0.380635,0.13757,1.039125,2.258174,4.6029,2.360588,4.014068,0.861087,4.161304,1.595727,-2.487131,1.590065,-2.570513,1.6184,1.839754,1.181724,2.702478,0.565674,-2.394709,6.835974,5.763623,0.090779,0.164046,-0.218561,0.36226,0.909431,-3.302053,0.261788,2.250664,0.975986,1.132343,0.935106,1.257019,1.112277,2.349847,-0.490743,-1.784539,3.056829,-0.315236
4,1.926111,-0.247171,-0.135065,0.20263,-0.007448,0.11606,1.393471,-0.839264,3.497515,-0.106582,1.426684,1.949872,-0.073429,-0.11329,-0.003324,0.146857,-0.069268,0.053634,0.658338,2.021241,1.124699,-0.20714,3.10971,2.566536,-0.766941,0.176622,0.247994,0.255141,-0.303156,-0.808617,-1.495864,0.238106,0.031286,0.012731,0.819207,-0.051595,-1.947615,0.070822,-0.260654,-1.387201,-0.806578,-0.242324,-0.4865,-0.185255,-0.1335,-0.303795,-0.364613,-0.474372,0.099597,1.011443,1.093995,-0.651697,-0.158384,-1.58243,-0.127088,-0.056971,-0.395614,-0.339916,-0.425856,0.329036,0.42303,-0.989233


# Apply cross validation with f1, auc and accuracy

#### **Weighted F1 score:** the F1 score is computed for each class and then averaged, weighting each class by its support (the number of true instances of that class).
#### **AUC, one-vs-one weighted:** the AUC is computed for every pairwise combination of classes and then averaged, weighting each pair by its support.


## Create Class object and apply cv

In [15]:
# Misclassification costs between the three classes: 0 on the diagonal,
# 1 for confusing neighbouring classes, 2 for confusing the two extremes.
error_cost_matrix = np.array(
    [
        [0, 1, 2],
        [1, 0, 1],
        [2, 1, 0],
    ]
)

# The selector receives the preprocessed training data, the candidate
# estimators and the cost matrix.
model_selector = ModelSelection(
    cost_matrix=error_cost_matrix,
    estimators=estimators,
    y_train=y_trainP,
    x_train=X_trainP,
)

# Preparation steps, run in this order before any cross-validation call.
model_selector.encode_y_train()
model_selector.create_col_transformer()

In [16]:
# Show the label mapping stored on the selector (keys 0/1/2 map to -1/0/1 in
# the recorded output); presumably filled in by encode_y_train() — confirm.
model_selector.target_label_mapping

{0: -1, 1: 0, 2: 1}

In [17]:
# Compare all candidate estimators with 5-fold cross-validation on two metrics:
# the weighted F1 score and the cost-matrix error (the recorded output picks
# the estimator with the lowest error as best).
n_cv_folds = 5

model_selector.calculate_cv_f1(scoring_average='f1_weighted', n_folds=n_cv_folds)
print('**********************************************')
# Further metrics, currently switched off:
# model_selector.calculate_cv_auc(n_folds=5, scoring_average='roc_auc_ovo_weighted')
# print('**********************************************')
# model_selector.calculate_cv_accuracy(n_folds=5, scoring_average='accuracy')
# print('**********************************************')
model_selector.calculate_cost_matrix_error_cv(
    n_folds=n_cv_folds,
    custom_cost_func=matrix_error_function,
)

CV Results for Mean F1 Score:

RandomForest = 0.645754
GradientBoost = 0.508991
XGBoost = 0.631062
AdaBoost = 0.410647

Best Estimator (F1): RandomForest
**********************************************
CV Results for Cost Matrix Error Score:

RandomForest = 0.578728
GradientBoost = 0.680250
XGBoost = 0.586390
AdaBoost = 0.773370

Best Estimator (Cost Metric Error Score): RandomForest


## GridSearchCV

In [20]:
# Alternative grid kept for reference. It contains `learning_rate`, so it is
# presumably meant for a boosting model rather than the random forest:
# param_grid = {
#     'ClassificationModel__n_estimators': [100, 200, 250],
#     'ClassificationModel__max_depth': [3, 10, 20],
#     'ClassificationModel__min_samples_split': [2, 10, 20],
#     'ClassificationModel__learning_rate': [0.05, 0.1, 0.2],
#     'ClassificationModel__random_state': [0]
# }

# Active search space: 3 x 3 x 2 x 1 = 18 parameter combinations.
# The `ClassificationModel__` prefix presumably addresses the classifier step
# of the pipeline built inside ModelSelection — confirm in Functions_Classes.
param_grid = dict([
    ('ClassificationModel__n_estimators', [100, 150, 200]),
    ('ClassificationModel__max_depth', [None, 5, 10]),
    ('ClassificationModel__min_samples_split', [2, 5]),
    ('ClassificationModel__random_state', [0]),
])

In [21]:
# Tune the random forest (best model in the cross-validation above) over
# `param_grid`, using 3 folds and the cost-matrix scorer. The recorded output
# shows that scorer is built with greater_is_better=False, which is why the
# reported best score is negative.
rf_candidate = RandomForestClassifier()

model_selector.apply_grid_cv(
    scoring=matrix_error_function,
    cv=3,
    params=param_grid,
    estimator=rf_candidate,
)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best parameters found by GridSearchCV (make_scorer(custom_error_cost_score, greater_is_better=False, response_method='predict')):
{'ClassificationModel__max_depth': 10, 'ClassificationModel__min_samples_split': 2, 'ClassificationModel__n_estimators': 150, 'ClassificationModel__random_state': 0}

Best score found by GridSearchCV (make_scorer(custom_error_cost_score, greater_is_better=False, response_method='predict')):
-0.5724342349018156


In [None]:
# Predictions on the real test data (X_test_compete) — TODO: not implemented yet

## Neural Network