# Pycaret

## Requirements

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pycaret.classification import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('mode.chained_assignment',  None)

In [4]:
inputs = 'inputs'
outputs = 'outputs'

train = pd.read_csv(inputs + '/train.csv')
test = pd.read_csv(inputs + '/test.csv')
submission = pd.read_csv(outputs + '/sample_submission.csv')

In [5]:
train.columns

Index(['index', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10',
       'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')

## Data Preprocessing

### Processing

In [6]:
train_x = train.copy().iloc[:, :-1]
train_y = train.copy()['nerdiness']


def onehot_encoder(dataframe, target, encoder=None):
    if encoder == None:
        encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
        encoder.fit(dataframe[[target]])
    ohe_df = encoder.transform(dataframe[[target]])
    tag_function = lambda x: f'{target}_{x}'
    ohe_df = pd.DataFrame(ohe_df, columns=list(map(tag_function, encoder.categories_[0])))
    results = pd.concat([dataframe, ohe_df], axis=1)
    
    return results, encoder


def preprocess(x, copy=True, encoders=None, scaler=None):
    if copy:
        x = x.copy()

    # cast
    int_list = [
        'VCL1', 'VCL2', 'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8',
        'VCL9', 'VCL10', 'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16',
        'urban', 'age'
    ]
    for col in int_list:
        x[col] = x[col].astype(float)

    # isnull (missing values)
    null_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'education', 'gender', 'familysize']
    for col in null_list:
        x[f'{col}_isnull'] = np.where(pd.isnull(x[col]), 1.0, 0.0)

    # mean (missing values)
    mean_list = [
        'Q1', 'Q2', 'Q3', 'Q4', 'Q5','Q6', 'Q7', 'Q8', 'Q9', 'Q10', 'Q11', 'Q12', 'Q13',
        'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26',
        'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4', 'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10',
        'familysize']
    for col in mean_list:
        x[col][pd.isnull(x[col])] = x[col].mean()

    # value (missing values)
    value_dict = {
        'country': 'None',
        'education': 1.5,
        'gender': 0,
        'engnat': 0,
        'hand': 1,
        'religion': 0,
        'orientation': 0,
        'voted': 0,
        'married': 0,
        'ASD': 0}
    for col in value_dict:
        x[col][pd.isnull(x[col])] = value_dict[col]

    # value (0 values)
    value0_dict = {
        'urban': 1.5
    }
    for col in value0_dict:
        x[f'{col}_isnull'] = np.where(x[col] == 0, 1.0, 0.0)
        x[col][x[col] == 0] = value0_dict[col]

    # OneHotEncoder
    ohe_list = [
        'country', 'gender', 'engnat', 'hand', 'religion',
        'orientation', 'voted', 'married', 'ASD']
    if encoders == None:
        encoders = dict()
    for col in ohe_list:
        if col in encoders:
            x, encoders[col] = onehot_encoder(x, col, encoder=encoders[col])
        else:
            x, encoders[col] = onehot_encoder(x, col)

    # log (outlier)
    log_list = [
        'introelapse', 'testelapse', 'surveyelapse'
    ]
    for col in log_list:
        x[f'log_{col}'] = np.log(x[col] + 1e-08)

    # min-max cut (outlier)
    outlier_dict = {
        'age': {'min': 1, 'max': 100},
        'log_introelapse': {'min': 0, 'max': 12},
        'log_testelapse': {'min': 3, 'max': 8},
        'log_surveyelapse': {'min': 0, 'max': 10},
        'familysize': {'min': 1, 'max': 40}}
    for col in outlier_dict:
        x[col][x[col] > outlier_dict[col]['max']] = outlier_dict[col]['max']
        x[col][x[col] < outlier_dict[col]['min']] = outlier_dict[col]['min']

    # drop
    drop_list = [
        'index', 
        'country_AGO', 'country_ALA', 'country_ARM', 'country_AZE',
        'country_BHS', 'country_BLR', 'country_BRB', 'country_BRN',
        'country_BWA', 'country_DOM', 'country_ETH', 'country_FRO',
        'country_GRL', 'country_GTM', 'country_GUF', 'country_GUY',
        'country_IRQ', 'country_KAZ', 'country_KHM', 'country_LBY',
        'country_LUX', 'country_MAC', 'country_MDV', 'country_MNP',
        'country_MOZ', 'country_MUS', 'country_MWI', 'country_NAM',
        'country_NPL', 'country_OMN', 'country_PAN', 'country_SDN',
        'country_SSD', 'country_TUN', 'country_UGA', 'country_VGB',
        'country_VIR', 'country_FSM', 'country_GEO', 'country_PNG',
        'country_RWA', 'country_SYR', 'country_LAO', 'country_MNG',
        'country_CUW', 'country_MLT', 'country_BHR', 'country_MDG'
    ] + log_list + ohe_list
    for col in drop_list:
        x = x.drop(columns=col)

    if scaler == None:
        scaler = MinMaxScaler()
        scaler.fit(x)
    x = pd.DataFrame(scaler.transform(x), columns=x.columns, index=list(x.index.values))

    return x, encoders, scaler

train_x, encoders, scaler = preprocess(train_x, copy=False)
test_x, _, _ = preprocess(test, encoders=encoders, scaler=scaler)

## Model

In [7]:
train_ = pd.concat([train_x, train_y], axis=1)

In [8]:
clf = setup(data=train_, target='nerdiness')

Unnamed: 0,Description,Value
0,session_id,8073
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(15000, 229)"
5,Missing Values,False
6,Numeric Features,43
7,Categorical Features,185
8,Ordinal Features,False
9,High Cardinality Features,False


In [36]:
best_2 = compare_models(sort='AUC', n_select=3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7724,0.8602,0.839,0.7729,0.8045,0.5332,0.536,0.506
rf,Random Forest Classifier,0.7706,0.8567,0.8373,0.7714,0.8029,0.5297,0.5325,0.411
lightgbm,Light Gradient Boosting Machine,0.7467,0.8199,0.8076,0.7555,0.7807,0.4819,0.4834,0.14
gbc,Gradient Boosting Classifier,0.7334,0.8032,0.8062,0.7395,0.7714,0.4531,0.4556,0.941
lr,Logistic Regression,0.7306,0.796,0.8033,0.7375,0.7689,0.4475,0.4499,2.866
lda,Linear Discriminant Analysis,0.7281,0.7938,0.809,0.732,0.7685,0.441,0.4444,0.274
ada,Ada Boost Classifier,0.7225,0.7879,0.787,0.7347,0.7599,0.4322,0.4338,0.249
knn,K Neighbors Classifier,0.6598,0.7025,0.7576,0.6735,0.713,0.2988,0.302,2.081
dt,Decision Tree Classifier,0.7031,0.699,0.7346,0.7339,0.7341,0.398,0.3983,0.102
nb,Naive Bayes,0.5681,0.5313,0.9624,0.5665,0.7132,0.0362,0.0739,0.041


In [37]:
blended = blend_models(estimator_list=best_2, fold=5, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7714,0.8537,0.8336,0.7742,0.8028,0.5319,0.5339
1,0.7657,0.8523,0.8294,0.769,0.798,0.5201,0.5222
2,0.77,0.8548,0.8396,0.7694,0.8029,0.528,0.531
3,0.7814,0.8558,0.8403,0.7834,0.8109,0.5527,0.5547
4,0.7594,0.8514,0.8403,0.7558,0.7958,0.505,0.5092
Mean,0.7696,0.8536,0.8366,0.7703,0.8021,0.5276,0.5302
Std,0.0072,0.0016,0.0044,0.009,0.0052,0.0156,0.0149


In [38]:
final_model = finalize_model(blended)

In [39]:
predictions = predict_model(final_model, test_x, raw_score=True)

In [42]:
predictions

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,...,married_3.0,ASD_0.0,ASD_1.0,ASD_2.0,log_introelapse,log_testelapse,log_surveyelapse,Label,Score_0,Score_1
0,0.75,0.75,0.50,1.00,1.00,1.00,0.5,1.00,0.75,1.00,...,0.0,0.0,0.0,1.0,0.183102,0.347240,0.493681,0,0.6910,0.3090
1,0.75,1.00,0.75,0.75,1.00,0.75,1.0,1.00,1.00,0.75,...,0.0,0.0,0.0,1.0,0.240864,0.334566,0.475124,1,0.1932,0.8068
2,1.00,1.00,1.00,1.00,0.75,1.00,1.0,1.00,1.00,0.75,...,0.0,0.0,0.0,1.0,0.115525,0.293182,0.466631,1,0.1551,0.8449
3,1.00,0.75,0.50,0.75,1.00,0.75,1.0,0.75,0.75,1.00,...,0.0,0.0,0.0,1.0,0.648914,0.408685,0.500410,1,0.3452,0.6548
4,1.00,1.00,1.00,1.00,1.00,0.50,1.0,1.00,1.00,1.00,...,0.0,0.0,0.0,1.0,0.264838,0.418750,0.513158,1,0.1705,0.8295
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35447,0.75,1.00,1.00,0.50,0.50,0.75,1.0,1.00,0.75,0.50,...,0.0,0.0,0.0,1.0,0.191882,0.459663,0.498655,1,0.1594,0.8406
35448,1.00,1.00,1.00,1.00,1.00,0.75,1.0,0.50,1.00,0.50,...,0.0,0.0,0.0,1.0,0.518548,0.382531,0.443168,1,0.2853,0.7147
35449,1.00,1.00,1.00,1.00,1.00,1.00,1.0,1.00,1.00,1.00,...,0.0,0.0,0.0,1.0,0.274653,0.359158,0.466631,1,0.0848,0.9152
35450,1.00,1.00,0.75,1.00,1.00,0.00,1.0,0.00,1.00,1.00,...,0.0,0.0,0.0,1.0,0.091551,0.212089,0.399415,0,0.7820,0.2180


In [43]:
submission['nerdiness'] = predictions['Score_1']

In [44]:
submission.to_csv(outputs + '/pycaret_preprocess_submission.csv', index=False)