# Pycaret

## Requirements

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pycaret.classification import *
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('mode.chained_assignment',  None)

In [2]:
inputs = 'inputs'
outputs = 'outputs'

train = pd.read_csv(inputs + '/train.csv')
test = pd.read_csv(inputs + '/test.csv')
submission = pd.read_csv(outputs + '/sample_submission.csv')

In [3]:
train.columns

Index(['index', 'Q1', 'Q2', 'Q3', 'Q4', 'Q5', 'Q6', 'Q7', 'Q8', 'Q9', 'Q10',
       'Q11', 'Q12', 'Q13', 'Q14', 'Q15', 'Q16', 'Q17', 'Q18', 'Q19', 'Q20',
       'Q21', 'Q22', 'Q23', 'Q24', 'Q25', 'Q26', 'country', 'introelapse',
       'testelapse', 'surveyelapse', 'TIPI1', 'TIPI2', 'TIPI3', 'TIPI4',
       'TIPI5', 'TIPI6', 'TIPI7', 'TIPI8', 'TIPI9', 'TIPI10', 'VCL1', 'VCL2',
       'VCL3', 'VCL4', 'VCL5', 'VCL6', 'VCL7', 'VCL8', 'VCL9', 'VCL10',
       'VCL11', 'VCL12', 'VCL13', 'VCL14', 'VCL15', 'VCL16', 'education',
       'urban', 'gender', 'engnat', 'age', 'hand', 'religion', 'orientation',
       'voted', 'married', 'familysize', 'ASD', 'nerdiness'],
      dtype='object')

## Model

In [4]:
numberic_features = ['index', 'introelapse', 'testelapse', 'familysize', 'age']
clf = setup(data=train, target='nerdiness', numeric_features=numberic_features)

Unnamed: 0,Description,Value
0,session_id,4035
1,Target,nerdiness
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(15000, 70)"
5,Missing Values,True
6,Numeric Features,6
7,Categorical Features,63
8,Ordinal Features,False
9,High Cardinality Features,False


In [5]:
best_2 = compare_models(sort='AUC', n_select=2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7667,0.8547,0.8296,0.7686,0.7979,0.5231,0.5252,0.68
rf,Random Forest Classifier,0.7684,0.8504,0.8351,0.768,0.8001,0.5259,0.5286,0.533
lightgbm,Light Gradient Boosting Machine,0.7464,0.8172,0.8131,0.7507,0.7806,0.4811,0.4834,0.167
gbc,Gradient Boosting Classifier,0.7346,0.8015,0.8032,0.7407,0.7706,0.4571,0.4593,1.73
lda,Linear Discriminant Analysis,0.7234,0.794,0.7843,0.7352,0.7589,0.4353,0.4367,0.557
ada,Ada Boost Classifier,0.7223,0.7892,0.7781,0.7366,0.7567,0.4337,0.4348,0.502
lr,Logistic Regression,0.7159,0.786,0.7759,0.7298,0.7519,0.4202,0.4215,2.358
nb,Naive Bayes,0.6367,0.7492,0.5767,0.7418,0.6075,0.2831,0.3129,0.076
dt,Decision Tree Classifier,0.6892,0.6852,0.722,0.7193,0.7205,0.3705,0.3706,0.241
knn,K Neighbors Classifier,0.5204,0.514,0.6106,0.5627,0.5856,0.0187,0.0188,1.073


In [6]:
blended = blend_models(estimator_list=best_2, fold=5, method='soft')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7681,0.8506,0.8448,0.763,0.8018,0.5241,0.5281
1,0.7662,0.8472,0.8379,0.7639,0.7992,0.5208,0.524
2,0.7505,0.8363,0.8412,0.743,0.789,0.4865,0.4922
3,0.7681,0.8559,0.8361,0.7669,0.8,0.5253,0.5281
4,0.7718,0.8594,0.8326,0.7735,0.802,0.5336,0.5356
Mean,0.7649,0.8499,0.8385,0.7621,0.7984,0.5181,0.5216
Std,0.0075,0.008,0.0042,0.0102,0.0048,0.0163,0.0152


In [7]:
final_model = finalize_model(blended)

In [8]:
predictions = predict_model(final_model, test, raw_score=True)

In [9]:
predictions

Unnamed: 0,index,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,...,hand,religion,orientation,voted,married,familysize,ASD,Label,Score_0,Score_1
0,0,4.0,4.0,3.0,5.0,5.0,5.0,3.0,5.0,4.0,...,1.0,4.0,4.0,1.0,1.0,3.0,2.0,0,0.855,0.145
1,1,4.0,5.0,4.0,4.0,5.0,4.0,5.0,5.0,5.0,...,1.0,1.0,5.0,2.0,1.0,5.0,2.0,1,0.285,0.715
2,2,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,...,1.0,4.0,5.0,2.0,1.0,3.0,2.0,1,0.260,0.740
3,3,5.0,4.0,3.0,4.0,5.0,4.0,5.0,4.0,4.0,...,1.0,2.0,2.0,2.0,1.0,3.0,2.0,1,0.465,0.535
4,4,5.0,5.0,5.0,5.0,5.0,3.0,5.0,5.0,5.0,...,1.0,4.0,5.0,2.0,1.0,2.0,2.0,1,0.085,0.915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35447,35447,4.0,5.0,5.0,3.0,3.0,4.0,5.0,5.0,4.0,...,3.0,10.0,4.0,2.0,1.0,3.0,2.0,1,0.115,0.885
35448,35448,5.0,5.0,5.0,5.0,5.0,4.0,5.0,3.0,5.0,...,3.0,2.0,5.0,2.0,1.0,2.0,2.0,1,0.300,0.700
35449,35449,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,...,1.0,1.0,5.0,1.0,1.0,2.0,2.0,1,0.065,0.935
35450,35450,5.0,5.0,4.0,5.0,5.0,1.0,5.0,1.0,5.0,...,1.0,12.0,1.0,2.0,1.0,,2.0,0,0.810,0.190


In [10]:
submission['nerdiness'] = predictions['Score_1']

In [11]:
submission.to_csv(outputs + '/pycaret_submission.csv', index=False)