In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols
import statsmodels.api as sm
from scipy.stats import chi2_contingency
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

import warnings
# NOTE(review): with no category or module filter this suppresses every Python
# warning for the rest of the session, including deprecation notices raised by
# pandas and scikit-learn in the cells below.
warnings.filterwarnings("ignore")

Read the training file (the test split is created from it further below)

In [120]:
# Load the training data into a DataFrame; the path is relative to the notebook's working directory.
train_df = pd.read_csv('train_Df64byy.csv')

In [121]:
# Preview the first rows to see the columns and typical values.
train_df.head()

Unnamed: 0,ID,City_Code,Region_Code,Accomodation_Type,Reco_Insurance_Type,Upper_Age,Lower_Age,Is_Spouse,Health Indicator,Holding_Policy_Duration,Holding_Policy_Type,Reco_Policy_Cat,Reco_Policy_Premium,Response
0,1,C3,3213,Rented,Individual,36,36,No,X1,14+,3.0,22,11628.0,0
1,2,C5,1117,Owned,Joint,75,22,No,X2,,,22,30510.0,0
2,3,C5,3732,Owned,Individual,32,32,No,,1.0,1.0,19,7450.0,1
3,4,C24,4378,Owned,Joint,52,48,No,X1,14+,3.0,19,17780.0,0
4,5,C8,2190,Rented,Individual,44,44,No,X2,3.0,1.0,16,10404.0,0


In [122]:
# Show each column's dtype and non-null count to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50882 entries, 0 to 50881
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   ID                       50882 non-null  int64  
 1   City_Code                50882 non-null  object 
 2   Region_Code              50882 non-null  int64  
 3   Accomodation_Type        50882 non-null  object 
 4   Reco_Insurance_Type      50882 non-null  object 
 5   Upper_Age                50882 non-null  int64  
 6   Lower_Age                50882 non-null  int64  
 7   Is_Spouse                50882 non-null  object 
 8   Health Indicator         39191 non-null  object 
 9   Holding_Policy_Duration  30631 non-null  object 
 10  Holding_Policy_Type      30631 non-null  float64
 11  Reco_Policy_Cat          50882 non-null  int64  
 12  Reco_Policy_Premium      50882 non-null  float64
 13  Response                 50882 non-null  int64  
dtypes: float64(2), int64(6

In [123]:
# Count the missing values in every column.
train_df.isna().sum()

ID                             0
City_Code                      0
Region_Code                    0
Accomodation_Type              0
Reco_Insurance_Type            0
Upper_Age                      0
Lower_Age                      0
Is_Spouse                      0
Health Indicator           11691
Holding_Policy_Duration    20251
Holding_Policy_Type        20251
Reco_Policy_Cat                0
Reco_Policy_Premium            0
Response                       0
dtype: int64

In [124]:
# Impute the missing values of each incomplete column with that column's
# most frequent value (its mode).
# NOTE(review): the modes are computed on the full frame, i.e. before the
# train/test split performed further below, so held-out rows contribute to the
# fill values.
mode_Health = train_df['Health Indicator'].mode()[0]
mode_Duration = train_df['Holding_Policy_Duration'].mode()[0]
mode_Type = train_df['Holding_Policy_Type'].mode()[0]

# Assign the filled column back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form is chained assignment,
# which newer pandas versions deprecate and which leaves the frame unchanged
# once Copy-on-Write is enabled.
for column, fill_value in (
    ('Health Indicator', mode_Health),
    ('Holding_Policy_Duration', mode_Duration),
    ('Holding_Policy_Type', mode_Type),
):
    train_df[column] = train_df[column].fillna(fill_value)

In [125]:
# Re-count the missing values per column to verify that none remain.
train_df.isna().sum()

ID                         0
City_Code                  0
Region_Code                0
Accomodation_Type          0
Reco_Insurance_Type        0
Upper_Age                  0
Lower_Age                  0
Is_Spouse                  0
Health Indicator           0
Holding_Policy_Duration    0
Holding_Policy_Type        0
Reco_Policy_Cat            0
Reco_Policy_Premium        0
Response                   0
dtype: int64

In [126]:
# Number of distinct Region_Code values, to gauge the column's cardinality.
train_df['Region_Code'].nunique()

5316

In [127]:
# Drop ID because it is only a running number, and drop Region_Code as well
# (presumably because of its very high cardinality - confirm the intent).
# The result is re-assigned rather than dropped in place, and missing columns
# are ignored, so re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=['ID', 'Region_Code'], errors='ignore')

In [135]:
# Separate the target from the predictors first, so that the encodings below
# can be learned after the data has been split into train and test.
y = train_df['Response']
X = train_df.drop(columns=['Response'])

In [136]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [137]:
# Frequency-encode the categorical features: every category is replaced by the
# share of training rows that carry it. The mapping is learned on the training
# split only and then applied to both splits.
# Region_Code is not listed because that column is no longer part of the frame.
freq_encoded_columns = [
    'City_Code',
    'Accomodation_Type',
    'Reco_Insurance_Type',
    'Is_Spouse',
    'Health Indicator',
    'Holding_Policy_Duration',
    'Holding_Policy_Type',
    'Reco_Policy_Cat',
]

# Work on explicit copies: the frames returned by train_test_split are
# selections of X, and assigning columns to them can trigger pandas'
# SettingWithCopyWarning (hidden in this notebook by the global warnings filter).
train_x = train_x.copy()
test_x = test_x.copy()

# Learned mappings, keyed by column name, kept so they can be inspected or
# re-applied to other data later.
category_frequencies = {}

for column in freq_encoded_columns:
    # Relative frequency of each category within the training split.
    category_frequencies[column] = train_x.groupby(column).size() / len(train_x)

    # Categories that never occur in the training split have no entry in the
    # mapping and therefore become NaN in the test split.
    train_x[column] = train_x[column].map(category_frequencies[column])
    test_x[column] = test_x[column].map(category_frequencies[column])

In [138]:
# Count NaNs per column in the encoded test split; a category that was absent
# from the training split has no frequency to map to and shows up here as NaN.
test_x.isna().sum()

City_Code                  0
Accomodation_Type          0
Reco_Insurance_Type        0
Upper_Age                  0
Lower_Age                  0
Is_Spouse                  0
Health Indicator           0
Holding_Policy_Duration    0
Holding_Policy_Type        0
Reco_Policy_Cat            0
Reco_Policy_Premium        0
dtype: int64

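Some region codes present in the test split do not appear in the training split, so frequency encoding would map them to NaN and we would drop those records. (Region_Code is currently excluded from the features and the check above shows no missing values, so no rows are actually dropped here.)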

In [139]:
# Re-attach the labels to the test features so that both stay aligned,
# then discard every row that still contains a missing value.
test_data = test_x.join(test_y)
test_data = test_data.dropna()

In [140]:
# Split the cleaned test frame back into labels and features.
test_y = test_data['Response']
test_x = test_data.drop(columns=['Response'])

In [141]:
# Baseline: AdaBoost with its default weak learner (a decision tree).
# random_state is fixed so that repeated runs produce the same model and score.
# NOTE(review): accuracy alone can be uninformative if Response is imbalanced;
# consider also reporting roc_auc_score, which is already imported.

adaboost_classifier = AdaBoostClassifier(random_state=100)
adaboost_classifier.fit(train_x, train_y)
predict = adaboost_classifier.predict(test_x)
print('accuracy_score:', accuracy_score(test_y, predict))

accuracy_score: 0.7546576527002594


In [98]:
# AdaBoost with logistic regression as the weak learner.
# The learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional argument works before and after the rename.
# random_state on the ensemble is fixed so the run is reproducible.
adaboost_classifier = AdaBoostClassifier(
    LogisticRegression(random_state=1, solver='liblinear'),
    random_state=100,
)
adaboost_classifier.fit(train_x, train_y)
predict = adaboost_classifier.predict(test_x)
print('accuracy_score:', accuracy_score(test_y, predict))

accuracy_score: 0.7546576527002594


In [142]:
# AdaBoost with Gaussian Naive Bayes as the weak learner.
# The learner is passed positionally because its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4; the first positional argument works before and after the rename.
# random_state on the ensemble is fixed so the run is reproducible.
adaboost_classifier = AdaBoostClassifier(GaussianNB(), random_state=100)
adaboost_classifier.fit(train_x, train_y)
predict = adaboost_classifier.predict(test_x)
print('accuracy_score:', accuracy_score(test_y, predict))

accuracy_score: 0.7409794827450672


#### Gradient Boosting

In [111]:
# Repeat the experiment with a gradient boosting classifier (default settings).
# random_state is fixed so that repeated runs produce the same model and score.
gradientboost_classifier = GradientBoostingClassifier(random_state=100)
gradientboost_classifier.fit(train_x, train_y)
predict = gradientboost_classifier.predict(test_x)
print('accuracy_score:', accuracy_score(test_y, predict))

accuracy_score: 0.7542646018394781


In [112]:
# Gradient boosting again, but with a logistic regression supplied as the
# `init` estimator, i.e. the model that provides the initial predictions which
# the boosted trees then refine.
# random_state is fixed so that repeated runs produce the same model and score.
gradientboost_classifier = GradientBoostingClassifier(
    init=LogisticRegression(random_state=1, solver='liblinear'),
    random_state=100,
)
gradientboost_classifier.fit(train_x, train_y)
predict = gradientboost_classifier.predict(test_x)
print('accuracy_score:', accuracy_score(test_y, predict))

accuracy_score: 0.7542646018394781
