In [0]:
# import the required libraries
!pip install vecstack
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
#from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.svm import SVC
from collections import Counter #for Smote, 

import warnings
warnings.filterwarnings("ignore")






*   Data Cleaning was done using Excel
*   Only the feature SPORTS has close to 18% missing data that is 'na'




In [0]:
# read the test and train datasets
trainfile = r'Train.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'Test.csv'
testData = pd.read_csv(testfile) #creates a dataframe

In [0]:
# basic checks on the read data
print(trainData.shape)
print(testData.shape)
print(trainData.head())
print(testData.head())


Imbalanced Data Check | Not a bad case of Imbalanced Data

In [0]:
# Imbalanced Data Check

# Get the different classes in the target
trainData.TARGET.unique()

array([0, 1])

In [0]:
# Imbalanced Data Set Check
class1 = len(trainData.TARGET[trainData.TARGET == 1])
class0 = len(trainData.TARGET[trainData.TARGET == 0])
print("Class1:",class1)
print("Class0",class0)
print("Minority Class:",class1/(class1+class0))

Class1: 643
Class0 1052
Minority Class: 0.3793510324483776


One Hot Encoding of Categorical Variables

In [0]:
# One Hot Encoding of Categorical Variables
categoricalFeatures = ["CONVB","SEX","SPORTS","ARTS","TRAVEL","EDUC"]

# Copy Train and Test data excluding TARGET
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)
y_train=trainData["TARGET"]
y_test=testData["TARGET"]

#Select just Target Column
y_train = trainData.iloc[:, -1]
y_test = testData.iloc[:, -1]

print(X_train.shape)
print(X_test.head()) 

print(y_train.shape)
print(y_test.head()) 

(1695, 40)
   MOUMO  MOUPMO  MOU3MO  ...  EDUC_HighSchool  EDUC_Masters  EDUC_PhD
0    344     232     455  ...                0             0         0
1    555     444     678  ...                0             0         1
2      2       0       4  ...                0             0         0
3   5678    3457    2156  ...                0             1         0
4    333     122     144  ...                0             1         0

[5 rows x 40 columns]
(1695,)
0    0
1    0
2    0
3    0
4    1
Name: TARGET, dtype: int64


Default Decision Tree Classifier

In [0]:
# Construct Default Decision Tree
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
# Decision Tree Hard Predictions
clf_predict = pd.DataFrame(clf.predict(X_test), columns = ["DTC"])

# Check the Default Decision Tree Metrics
print("------------------------------------------------------------------\n")
print("Default Decision Tree Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, clf_predict)))
print("Test Precision Score: " + str(precision_score(y_test, clf_predict)))
print("Test Recall Score: " + str(recall_score(y_test, clf_predict)))
print("Test F1 Score: " + str(f1_score(y_test, clf_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, clf_predict)))
print("\n")
print("------------------------------------------------------------------\n")


------------------------------------------------------------------

Default Decision Tree Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------



Default Random Forest Classifier

In [0]:
# Construct Default Random Forest
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
# Random Forest Classifier Hard Predictions
rfc_predict = pd.DataFrame(rfc.predict(X_test), columns = ["RFC"])

# Check the Random Forest Classifier Metrics
print("------------------------------------------------------------------\n")
print("Default Random Forest Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, rfc_predict)))
print("Test Precision Score: " + str(precision_score(y_test, rfc_predict)))
print("Test Recall Score: " + str(recall_score(y_test, rfc_predict)))
print("Test F1 Score: " + str(f1_score(y_test, rfc_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, rfc_predict)))
print("\n")
print("------------------------------------------------------------------\n")


------------------------------------------------------------------

Default Random Forest Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------



Default Multi-layer Perceptron

In [0]:
# Train the MLP Classifier
from sklearn.neural_network import MLPClassifier
mlpc = MLPClassifier()
mlpc.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [0]:
# Hard Predictions with the MLP Classifier
mlpc_predict = pd.DataFrame(mlpc.predict(X_test), columns = ["MLPC"])

# Check the MLP Classifier Metrics
print("------------------------------------------------------------------\n")
print("Default MLP Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, mlpc_predict)))
print("Test Precision Score: " + str(precision_score(y_test, mlpc_predict)))
print("Test Recall Score: " + str(recall_score(y_test, mlpc_predict)))
print("Test F1 Score: " + str(f1_score(y_test, mlpc_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, mlpc_predict)))
print("\n")
print("------------------------------------------------------------------\n")


------------------------------------------------------------------

Default MLP Metrics

Test Accuracy Score: 0.6773333333333333
Test Precision Score: 0.5372549019607843
Test Recall Score: 0.9785714285714285
Test F1 Score: 0.6936708860759494
Test AUC Score: 0.7382218844984803


------------------------------------------------------------------



Support Vector Machines

In [0]:
# Train the SVM Classifier
from sklearn import svm
svmc = svm.SVC()
svmc.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [0]:
# Hard Predictions with the SVM Classifier
svmc_predict = pd.DataFrame(svmc.predict(X_test), columns = ["SVMC"])

# Check the SVM Classifier Metrics
print("------------------------------------------------------------------\n")
print("Default SVM Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, svmc_predict)))
print("Test Precision Score: " + str(precision_score(y_test, svmc_predict)))
print("Test Recall Score: " + str(recall_score(y_test, svmc_predict)))
print("Test F1 Score: " + str(f1_score(y_test, svmc_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, svmc_predict)))
print("\n")
print("------------------------------------------------------------------\n")


------------------------------------------------------------------

Default SVM Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------



Hyper Parameter Tuning with cross validation on the default Decision Tree

In [0]:
# Get the Default Decision Tree Parameters
print("Default Decision Tree Parameters\n")
print(clf.get_params(),"\n") # parameters used in the current model
print("Number of leaves:",clf.get_n_leaves()) # Number of leaves
print("Depth:", clf.get_depth()) # Max Depth
print("Criterion:",clf.get_params()['criterion'],"\n") # Split Criterion


Default Decision Tree Parameters

{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'} 

Number of leaves: 19
Depth: 7
Criterion: gini 



In [0]:
#Hyperparameter tuning with cross validation on decision tree classifier

#Pick parameters to tune
parameters = {'max_leaf_nodes': range(15,20,1),'max_depth': range(5,10,1),'criterion':['gini','entropy']}

#Method: Random Search | Non Exhaustive search | Quick but may end up with sub optimal solution
print("Randomized Search with Cross Validation - Decision tree")
clf_random = RandomizedSearchCV(clf,parameters,n_iter=20,cv=5)
clf_random.fit(X_train, y_train)
rand_parm_rs = clf_random.best_params_
print(rand_parm_rs)

#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
clf_gs = DecisionTreeClassifier(**rand_parm_rs) #** to get parameter entries for the function DecisionTreeClassifier from the dictionary grid_parm_rs
clf_gs.fit(X_train,y_train)

# Get Hard Predictions using the tuned Ensemble
clfgs_predict = pd.DataFrame(clf_gs.predict(X_test), columns = ["DTCtuned"])

# Check the Tuned DT Classifier Metrics
print("------------------------------------------------------------------\n")
print("Tuned DT Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, clfgs_predict)))
print("Test Precision Score: " + str(precision_score(y_test, clfgs_predict)))
print("Test Recall Score: " + str(recall_score(y_test, clfgs_predict)))
print("Test F1 Score: " + str(f1_score(y_test, clfgs_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, clfgs_predict)))
print("\n")
print("------------------------------------------------------------------\n")


Randomized Search with Cross Validation - Decision tree
{'max_leaf_nodes': 17, 'max_depth': 5, 'criterion': 'gini'}
------------------------------------------------------------------

Tuned DT Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------



Hyper Parameter tuning with cross validation on default random forest classifier

In [0]:
#Hyperparameter tuning with cross validation on Random Forest classifier

#Pick parameters to tune
rfc_parameters = {'min_samples_leaf' : range(5,100,4),'max_depth': range(1,32,2),'max_features':[5,10,15,20],'n_estimators':[20,30,40,50]}

#Method: Random Search | Non Exhaustive search | Quick but may end up with sub optimal solution
print("Randomized Search with Cross Validation - Random Forest")
rfc_random = RandomizedSearchCV(rfc,rfc_parameters,n_iter=50,cv=5)
rfc_random.fit(X_train, y_train)
rfc_parm_rs = rfc_random.best_params_
print(rfc_parm_rs)
print("\n")

#Using the parameters obtained from HyperParameterTuning in the RFC
clf_gs = RandomForestClassifier(**rfc_parm_rs) #** to get parameter entries for the function RFC from the dictionary grid_parm_rs
clf_gs.fit(X_train,y_train)

# Get Hard Predictions using the tuned Ensemble
clfgs_predict = pd.DataFrame(clf_gs.predict(X_test), columns = ["RFCtuned"])

# Check the Tuned RF Classifier Metrics
print("------------------------------------------------------------------\n")
print("Tuned RF Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, clfgs_predict)))
print("Test Precision Score: " + str(precision_score(y_test, clfgs_predict)))
print("Test Recall Score: " + str(recall_score(y_test, clfgs_predict)))
print("Test F1 Score: " + str(f1_score(y_test, clfgs_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, clfgs_predict)))
print("\n")
print("------------------------------------------------------------------\n")




Randomized Search with Cross Validation - Random Forest
{'n_estimators': 30, 'min_samples_leaf': 5, 'max_features': 15, 'max_depth': 5}


------------------------------------------------------------------

Tuned RF Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------



Hyper Parameter tuning with cross validation on default MLP classifier

In [0]:
#Hyperparameter tuning with cross validation on MLP classifier

#Pick parameters to tune
rfc_parameters = {'solver':['lbfgs', 'sgd', 'adam'], 'activation':['identity', 'logistic', 'tanh', 'relu'], 'alpha': [0.00005,0.0001,0.00015,0.0002,0.0005,]}

#Method: Random Search | Non Exhaustive search | Quick but may end up with sub optimal solution
print("Randomized Search with Cross Validation - MLP")
rfc_random = RandomizedSearchCV(mlpc,rfc_parameters,n_iter=20,cv=5)
rfc_random.fit(X_train, y_train)
rfc_parm_rs = rfc_random.best_params_
print(rfc_parm_rs)
print("\n")

#Using the parameters obtained from HyperParameterTuning in the MLP
clf_gs = MLPClassifier(**rfc_parm_rs) #** to get parameter entries for the function MLP from the dictionary grid_parm_rs
clf_gs.fit(X_train,y_train)

# Get Hard Predictions using the tuned Ensemble
clfgs_predict = pd.DataFrame(clf_gs.predict(X_test), columns = ["MLPtuned"])

# Check the Tuned MLP Classifier Metrics
print("------------------------------------------------------------------\n")
print("Tuned MLP Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, clfgs_predict)))
print("Test Precision Score: " + str(precision_score(y_test, clfgs_predict)))
print("Test Recall Score: " + str(recall_score(y_test, clfgs_predict)))
print("Test F1 Score: " + str(f1_score(y_test, clfgs_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, clfgs_predict)))
print("\n")
print("------------------------------------------------------------------\n")




Randomized Search with Cross Validation - MLP
{'solver': 'adam', 'alpha': 0.0001, 'activation': 'relu'}


------------------------------------------------------------------

Tuned MLP Metrics

Test Accuracy Score: 0.5546666666666666
Test Precision Score: 0.45514950166112955
Test Recall Score: 0.9785714285714285
Test F1 Score: 0.6213151927437641
Test AUC Score: 0.6403495440729483


------------------------------------------------------------------



Hyper Parameter tuning with cross validation on default SVM CLassifier

In [0]:
#Hyperparameter tuning with cross validation on SVM classifier
#Used linear SVM due to time crunch

from sklearn.svm import LinearSVC
svmc = LinearSVC()

#Pick parameters to tune
rfc_parameters = {'max_iter' : range(500,1500,50)}

#Method: Random Search | Non Exhaustive search | Quick but may end up with sub optimal solution
print("Randomized Search with Cross Validation - SVM")
rfc_random = RandomizedSearchCV(svmc,rfc_parameters,n_iter=10,cv=5)
rfc_random.fit(X_train, y_train)
rfc_parm_rs = rfc_random.best_params_
print(rfc_parm_rs)
print("\n")

#Using the parameters obtained from HyperParameterTuning in the SVC
clf_gs = LinearSVC(**rfc_parm_rs) #** to get parameter entries for the function SVC from the dictionary grid_parm_rs
clf_gs.fit(X_train,y_train)

# Get Hard Predictions using the tuned Ensemble
clfgs_predict = pd.DataFrame(clf_gs.predict(X_test), columns = ["SVCtuned"])

# Check the Tuned SVM Classifier Metrics
print("------------------------------------------------------------------\n")
print("Tuned SVM Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, clfgs_predict)))
print("Test Precision Score: " + str(precision_score(y_test, clfgs_predict)))
print("Test Recall Score: " + str(recall_score(y_test, clfgs_predict)))
print("Test F1 Score: " + str(f1_score(y_test, clfgs_predict)))
print("Test AUC Score: " + str(roc_auc_score(y_test, clfgs_predict)))
print("\n")
print("------------------------------------------------------------------\n")


Randomized Search with Cross Validation - SVM
{'max_iter': 1000}


------------------------------------------------------------------

Tuned SVM Metrics

Test Accuracy Score: 0.6426666666666667
Test Precision Score: 0.75
Test Recall Score: 0.06428571428571428
Test F1 Score: 0.11842105263157894
Test AUC Score: 0.5257598784194528


------------------------------------------------------------------





*   Stacking of the top three models using Random Forest Classifier
*   Top Three Models: Default Decision Tree, Default Random Forest, Default SVM




In [0]:
# Stacking Models

models = [ RandomForestClassifier(), DecisionTreeClassifier(), svm.SVC() ]
      
S_Train, S_Test = stacking(models,                   
                           X_train, y_train, X_test,   
                           regression=False, 
     
                           mode='oof_pred_bag', 
       
                           needs_proba=False,
         
                           save_dir=None, 
            
                           metric=accuracy_score, 
    
                           n_folds=4, 
                 
                           stratified=True,
            
                           shuffle=True,  
            
                           random_state=0,    
         
                           verbose=2)


task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestClassifier]
    fold  0:  [0.98349057]
    fold  1:  [0.98584906]
    fold  2:  [0.99528302]
    fold  3:  [0.98581560]
    ----
    MEAN:     [0.98760956] + [0.00453227]
    FULL:     [0.98761062]

model  1:     [DecisionTreeClassifier]
    fold  0:  [0.98349057]
    fold  1:  [0.98584906]
    fold  2:  [0.99528302]
    fold  3:  [0.98581560]
    ----
    MEAN:     [0.98760956] + [0.00453227]
    FULL:     [0.98761062]

model  2:     [SVC]
    fold  0:  [0.98349057]
    fold  1:  [0.98584906]
    fold  2:  [0.99528302]
    fold  3:  [0.98581560]
    ----
    MEAN:     [0.98760956] + [0.00453227]
    FULL:     [0.98761062]



In [0]:
# Stacking - Random Forest Model
model = RandomForestClassifier()
    
model = model.fit(S_Train, y_train)
y_pred = model.predict(S_Test)

# Check the Stacked Classifier Metrics
print("------------------------------------------------------------------\n")
print("Stacket Model Metrics\n")
print("Test Accuracy Score: " + str(accuracy_score(y_test, y_pred)))
print("Test Precision Score: " + str(precision_score(y_test, y_pred)))
print("Test Recall Score: " + str(recall_score(y_test, y_pred)))
print("Test F1 Score: " + str(f1_score(y_test, y_pred)))
print("Test AUC Score: " + str(roc_auc_score(y_test, y_pred)))
print("\n")
print("------------------------------------------------------------------\n")



------------------------------------------------------------------

Stacket Model Metrics

Test Accuracy Score: 0.9946666666666667
Test Precision Score: 1.0
Test Recall Score: 0.9857142857142858
Test F1 Score: 0.9928057553956835
Test AUC Score: 0.9928571428571429


------------------------------------------------------------------

