In [1]:
# Import libraries
from vecstack import stacking
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE 
from sklearn.svm import LinearSVC
from sklearn import svm
from collections import Counter

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training and testing CSVs, then report each frame's (rows, columns)
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
for frame in (train, test):
    print(frame.shape)

(65000, 596)
(173836, 596)


In [3]:
# Measure class imbalance: percentage of quotes falling in each target class

TotalQuotes = train["QuoteConversion_Flag"].value_counts()
# Label 0 is reported as "non-converted", label 1 as "converted"
n_nonconverted, n_converted = TotalQuotes[0], TotalQuotes[1]
n_quotes = n_nonconverted + n_converted
nonconverted_quotes = np.round(100 * n_nonconverted / n_quotes, 2)
converted_quotes = np.round(100 * n_converted / n_quotes, 2)

for caption, share in (('Non-converted quotes:', nonconverted_quotes),
                       ('Converted quotes:', converted_quotes)):
    print(caption, share, '%')

Non-converted quotes: 81.14 %
Converted quotes: 18.86 %


In [4]:
# Drop the categorical column that (per the original note) is not present in
# the training dataset. errors='ignore' keeps this cell idempotent: re-running
# it after the column is already gone no longer raises KeyError, which the
# original `del test[...]` did.
test = test.drop(columns=['GeographicField64'], errors='ignore')

# Add an empty placeholder Quote Conversion column to the testing dataset;
# it is filled conceptually by the model predictions later on.
test['QuoteConversion_Flag'] = ""

# Last expression of the cell -> rendered as the cell output
test.head()

Unnamed: 0,CoverageField11A,CoverageField11B,CoverageField1A,CoverageField1B,CoverageField2A,CoverageField2B,CoverageField3A,CoverageField3B,CoverageField4A,CoverageField4B,...,PropertyField38_N,PropertyField38_Y,GeographicField63_,GeographicField63_N,GeographicField63_Y,GeographicField64_CA,GeographicField64_IL,GeographicField64_NJ,GeographicField64_TX,QuoteConversion_Flag
0,13,22,4,4,4,4,3,3,3,4,...,1,0,0,0,1,0,0,0,0,
1,4,5,8,14,8,14,7,12,8,13,...,1,0,0,1,0,0,0,0,0,
2,3,3,11,18,11,18,10,16,10,18,...,1,0,0,1,0,0,0,0,0,
3,5,9,14,22,15,22,13,20,22,25,...,1,0,0,1,0,0,0,0,0,
4,12,21,4,5,4,5,4,4,4,5,...,1,0,0,1,0,0,0,0,0,


In [5]:
# Separate the features from the target, then carve a hold-out split from train.
# NOTE(review): only the target column is excluded, so identifier-like columns
# (e.g. 'QuoteNumber', and 'Unnamed: 0' as seen in test.head()) stay in the
# feature matrices - confirm this is intended.
TARGET_COL = 'QuoteConversion_Flag'
XTrain = train[train.columns.difference([TARGET_COL])]
XTest = test[test.columns.difference([TARGET_COL])]
YTrain = train[TARGET_COL]

# 80/20 split with a fixed seed so the hold-out rows are reproducible
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    XTrain,
    YTrain,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Oversample the minority class of the training split with SMOTE and show the
# class counts before and after resampling
print('Original dataset shape %s' % Counter(Y_Train))

# sampling_strategy=0.5: the recorded output shows the minority class ending
# up at roughly half the majority count
sm = SMOTE(sampling_strategy=0.5, random_state=42)
resampled = sm.fit_resample(X_Train, Y_Train)
X_res, y_res = resampled

print('Resampled dataset shape %s' % Counter(y_res))

Original dataset shape Counter({0: 42183, 1: 9817})
Resampled dataset shape Counter({0: 42183, 1: 21091})


In [7]:
# Fit a decision tree on the SMOTE-resampled data and evaluate it on the
# hold-out split (X_Test / Y_Test)
dt = DecisionTreeClassifier().fit(X_res, y_res)
dt_predict = dt.predict(X_Test)

dt_accuracy = dt.score(X_Test, Y_Test)
print("Decision Tree Accuracy:{0:6f}".format(dt_accuracy))

# Print each report under its heading
for heading, report in (
    ("Confusion Matrix - Decision Tree", confusion_matrix(Y_Test, dt_predict)),
    ("Classification Report", classification_report(Y_Test, dt_predict)),
):
    print(heading)
    print(report)

Decision Tree Accuracy:0.884615
Confusion Matrix - Decision Tree
[[9750  805]
 [ 695 1750]]
Classification Report
              precision    recall  f1-score   support

           0       0.93      0.92      0.93     10555
           1       0.68      0.72      0.70      2445

    accuracy                           0.88     13000
   macro avg       0.81      0.82      0.81     13000
weighted avg       0.89      0.88      0.89     13000



In [8]:
# Score the Kaggle test set with the decision tree and write the submission CSV
dt_predict = dt.predict(XTest)
dt_prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": dt_predict,
})

# Quick look at the predicted class balance before saving
dt_class_counts = dt_prediction['QuoteConversion_Flag'].value_counts()
print(dt_class_counts)

dt_prediction.to_csv('DecisionTree_Submission.csv', index=False)

0    120836
1     53000
Name: QuoteConversion_Flag, dtype: int64


In [9]:
# Fit a random forest on the SMOTE-resampled data and evaluate it on the
# hold-out split (X_Test / Y_Test)
rfc = RandomForestClassifier().fit(X_res, y_res)
rfc_predict = rfc.predict(X_Test)

rfc_accuracy = rfc.score(X_Test, Y_Test)
print("Random Forest Accuracy:{0:6f}".format(rfc_accuracy))

# Print each report under its heading
for heading, report in (
    ("Confusion Matrix for Random Forest:", confusion_matrix(Y_Test, rfc_predict)),
    ("Classification Report", classification_report(Y_Test, rfc_predict)),
):
    print(heading)
    print(report)

Random Forest Accuracy:0.906154
Confusion Matrix for Random Forest:
[[10225   330]
 [  890  1555]]
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.97      0.94     10555
           1       0.82      0.64      0.72      2445

    accuracy                           0.91     13000
   macro avg       0.87      0.80      0.83     13000
weighted avg       0.90      0.91      0.90     13000



In [10]:
# Score the Kaggle test set with the random forest and write the submission CSV
rfc_predict = rfc.predict(XTest)
rf_prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": rfc_predict,
})

# Quick look at the predicted class balance before saving
rf_class_counts = rf_prediction['QuoteConversion_Flag'].value_counts()
print(rf_class_counts)

rf_prediction.to_csv('RandomForest_Submission.csv', index=False)

0    144559
1     29277
Name: QuoteConversion_Flag, dtype: int64


In [11]:
# Fit a multi-layer perceptron on the SMOTE-resampled data and evaluate it on
# the hold-out split (X_Test / Y_Test)
mlp = MLPClassifier().fit(X_res, y_res)
mlp_predict = mlp.predict(X_Test)

mlp_accuracy = mlp.score(X_Test, Y_Test)
print("Multi Layer Perceptron Accuracy:{0:6f}".format(mlp_accuracy))

# Print each report under its heading
for heading, report in (
    ("Confusion Matrix for Multilayer Perceptron:", confusion_matrix(Y_Test, mlp_predict)),
    ("Classification Report", classification_report(Y_Test, mlp_predict)),
):
    print(heading)
    print(report)

Multi Layer Perceptron Accuracy:0.713538
Confusion Matrix for Multilayer Perceptron:
[[7096 3459]
 [ 265 2180]]
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.67      0.79     10555
           1       0.39      0.89      0.54      2445

    accuracy                           0.71     13000
   macro avg       0.68      0.78      0.67     13000
weighted avg       0.86      0.71      0.74     13000



In [12]:
# Preparing MLP submission file for kaggle:
# predict on the Kaggle test features and pair each prediction with its QuoteNumber
mlp_predict = mlp.predict(XTest)
mlp_prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": mlp_predict,
})
print(mlp_prediction['QuoteConversion_Flag'].value_counts())

# Save the MLP frame. The original wrote `dt_prediction` here, so the MLP
# submission file silently contained the decision tree's predictions.
mlp_prediction.to_csv('MultiLayerPerceptron_Submission.csv', index=False)

1    140493
0     33343
Name: QuoteConversion_Flag, dtype: int64


In [13]:
# Support Vector Machine Classifier after SMOTE:
# fit a linear SVM on the resampled data and evaluate it on the hold-out split.
# NOTE(review): no feature scaling is visible before this fit and the recorded
# hold-out recall for class 1 is ~0.01 - consider standardising the features.
svc = LinearSVC()
svc.fit(X_res, y_res)
svc_predict = svc.predict(X_Test)

# Single placeholder: the original format string repeated "{0:6f}" and so
# printed the same accuracy twice ("0.811538:0.811538").
print("Support Vector Machine Accuracy:{0:6f}".format(svc.score(X_Test, Y_Test)))
print("Confusion Matrix for Support Vector Machines:")
print(confusion_matrix(Y_Test, svc_predict))
print("Classification Report")
print(classification_report(Y_Test, svc_predict))

Support Vector Machine Accuracy:0.811538:0.811538
Confusion Matrix for Support Vector Machines:
[[10524    31]
 [ 2419    26]]
Classification Report
              precision    recall  f1-score   support

           0       0.81      1.00      0.90     10555
           1       0.46      0.01      0.02      2445

    accuracy                           0.81     13000
   macro avg       0.63      0.50      0.46     13000
weighted avg       0.75      0.81      0.73     13000



In [14]:
# Preparing SVM submission file for kaggle:
# predict on the Kaggle test features and pair each prediction with its QuoteNumber
svm_predict = svc.predict(XTest)
svm_prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": svm_predict,
})
print(svm_prediction['QuoteConversion_Flag'].value_counts())

# Save the SVM frame. The original wrote `dt_prediction` here, so the SVM
# submission file silently contained the decision tree's predictions.
svm_prediction.to_csv('SVM_Submission.csv', index=False)

0    173700
1       136
Name: QuoteConversion_Flag, dtype: int64


In [15]:
# Fit a k-nearest-neighbours classifier on the SMOTE-resampled data and
# evaluate it on the hold-out split (X_Test / Y_Test)
knn = KNeighborsClassifier().fit(X_res, y_res)
knn_predict = knn.predict(X_Test)

knn_accuracy = knn.score(X_Test, Y_Test)
print("K Nearest Neighbor Accuracy:{0:6f}".format(knn_accuracy))

# Print each report under its heading
for heading, report in (
    ("Confusion Matrix for K Nearest Neighbors:", confusion_matrix(Y_Test, knn_predict)),
    ("Classification Report", classification_report(Y_Test, knn_predict)),
):
    print(heading)
    print(report)

K Nearest Neighbor Accuracy:0.677077
Confusion Matrix for K Nearest Neighbors:
[[8179 2376]
 [1822  623]]
Classification Report
              precision    recall  f1-score   support

           0       0.82      0.77      0.80     10555
           1       0.21      0.25      0.23      2445

    accuracy                           0.68     13000
   macro avg       0.51      0.51      0.51     13000
weighted avg       0.70      0.68      0.69     13000



In [16]:
# Score the Kaggle test set with the KNN model and write the submission CSV
knn_predict = knn.predict(XTest)
knn_prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": knn_predict,
})

# Quick look at the predicted class balance before saving
knn_class_counts = knn_prediction['QuoteConversion_Flag'].value_counts()
print(knn_class_counts)

knn_prediction.to_csv('KNN_Submission.csv', index=False)

0    139765
1     34071
Name: QuoteConversion_Flag, dtype: int64


In [17]:
# Stack the five base models built so far. S_Train / S_Test are the stacked
# features for the resampled training data and for the hold-out split (X_Test).

models = [
    MLPClassifier(),
    RandomForestClassifier(),
    DecisionTreeClassifier(),
    KNeighborsClassifier(),
    LinearSVC(),
]

# Options forwarded unchanged to vecstack's stacking()
stacking_options = dict(
    regression=False,        # classification task
    mode='oof_pred_bag',
    needs_proba=False,
    save_dir=None,
    metric=accuracy_score,   # reported per fold in the verbose log
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=2,
)

S_Train, S_Test = stacking(models, X_res, y_res, X_Test, **stacking_options)

task:         [classification]
n_classes:    [2]
metric:       [accuracy_score]
mode:         [oof_pred_bag]
n_models:     [5]

model  0:     [MLPClassifier]
    fold  0:  [0.75333460]
    fold  1:  [0.75731715]
    fold  2:  [0.73732457]
    fold  3:  [0.72272095]
    ----
    MEAN:     [0.74267432] + [0.01373691]
    FULL:     [0.74267472]

model  1:     [RandomForestClassifier]
    fold  0:  [0.91782034]
    fold  1:  [0.92382578]
    fold  2:  [0.92167151]
    fold  3:  [0.91876343]
    ----
    MEAN:     [0.92052027] + [0.00237844]
    FULL:     [0.92052028]

model  2:     [DecisionTreeClassifier]
    fold  0:  [0.89285037]
    fold  1:  [0.89778115]
    fold  2:  [0.89638387]
    fold  3:  [0.89474017]
    ----
    MEAN:     [0.89543889] + [0.00184173]
    FULL:     [0.89543888]

model  3:     [KNeighborsClassifier]
    fold  0:  [0.69498704]
    fold  1:  [0.69283773]
    fold  2:  [0.69692755]
    fold  3:  [0.69541029]
    ----
    MEAN:     [0.69504065] + [0.00146226]
    FUL

In [18]:
# Building Random Forest stacked model:
# train the meta-learner on the stacked training features and evaluate it on
# the stacked hold-out features (S_Test corresponds to X_Test / Y_Test).
model = RandomForestClassifier()

model = model.fit(S_Train, y_res)
y_pred = model.predict(S_Test)

# Plain %-format: the original 'Accuracy:{%.8f}' mixed str.format braces into a
# %-format string and printed them literally ("Accuracy:{0.90438462}").
print('Accuracy:%.8f' % accuracy_score(Y_Test, y_pred))
print("Confusion Matrix after STACKING for RF:")
print(confusion_matrix(Y_Test, y_pred))
print("Classification Report")
print(classification_report(Y_Test, y_pred))

Accuracy:{0.90438462}
Confusion Matrix after STACKING for RF:
[[10309   246]
 [  997  1448]]
Classification Report
              precision    recall  f1-score   support

           0       0.91      0.98      0.94     10555
           1       0.85      0.59      0.70      2445

    accuracy                           0.90     13000
   macro avg       0.88      0.78      0.82     13000
weighted avg       0.90      0.90      0.90     13000



In [19]:
# Hyperparameter tuning for the random forest stacked model (exhaustive grid
# search via GridSearchCV), followed by the stacked Kaggle submission.
rand_param = {
    'max_depth': range(5, 15, 2),
    'n_estimators': [10, 25, 50, 100],
    # The original list was [10, 100, 10]; the repeated 10 only re-fitted
    # identical candidates, so it is deduplicated here.
    'min_samples_split': [10, 100],
}

# Baseline for comparison: default random forest on the raw (non-resampled)
# training split, evaluated on the hold-out split.
rf1 = RandomForestClassifier()
rf1.fit(X_Train, Y_Train)
rf1_predict = rf1.predict(X_Test)
print("accuracy Score:{0:6f}".format(rf1.score(X_Test, Y_Test)))
print("Confusion Matrix:")
print(confusion_matrix(Y_Test, rf1_predict))
print("Classification Report")
print(classification_report(Y_Test, rf1_predict))

# Tune the meta-learner on the stacked features it is actually trained on
# (the original searched on the raw X_Train features and then applied the
# result to a model fitted on S_Train). A fresh estimator is passed in because
# the original referenced `rf`, which is not defined in any visible cell and
# raises NameError on a fresh kernel.
rf1_grid = GridSearchCV(RandomForestClassifier(), rand_param, cv=5)
rf1_grid.fit(S_Train, y_res)
grid_param_rf1 = rf1_grid.best_params_
print(grid_param_rf1)

model1 = RandomForestClassifier(**grid_param_rf1)
model1.fit(S_Train, y_res)

# Preparing stacked model submission file for kaggle.
# S_Test only covers the hold-out split (X_Test), so its predictions cannot be
# paired with XTest['QuoteNumber'] (the lengths differ and the DataFrame
# constructor raises). Build stacked features for the full Kaggle test set
# with the same base models and stacking settings instead.
_, S_Submit = stacking(
    models,
    X_res, y_res, XTest,
    regression=False,
    mode='oof_pred_bag',
    needs_proba=False,
    save_dir=None,
    metric=accuracy_score,
    n_folds=4,
    stratified=True,
    shuffle=True,
    random_state=0,
    verbose=0,  # the per-fold log is already shown by the earlier stacking cell
)
Test_Prediction = model1.predict(S_Submit)
Prediction = pd.DataFrame({
    "QuoteNumber": XTest['QuoteNumber'],
    "QuoteConversion_Flag": Test_Prediction,
})

print(Prediction['QuoteConversion_Flag'].value_counts())
Prediction.to_csv('StackedRF_Submission.csv', index=False)

accuracy Score:0.908923
Confusion Matrix:
[[10368   187]
 [  997  1448]]
Classification Report
              precision    recall  f1-score   support

           0       0.91      0.98      0.95     10555
           1       0.89      0.59      0.71      2445

    accuracy                           0.91     13000
   macro avg       0.90      0.79      0.83     13000
weighted avg       0.91      0.91      0.90     13000

{'max_depth': 5, 'min_samples_split': 100, 'n_estimators': 10}
