### Code for Term Deposit Subscription by Customers

#### Readme: Preprocessing, Standardization and Visualization has been carried out in R.  R script has been uploaded separately.

To successfully execute the below code, please add the .ipnyb files for Libraries. Below are the same

1. ML_Classfication_Libs.ipynb 
2. ML_Classification_Metrics.ipynb
3. Essential_Libs.ipynb

Load the above mentioned Libraries
Using the external ipynb files to load the libraries. They are divided into 4 categories.

In [1]:
%run ./ML_Classfication_Libs.ipynb
%run ./ML_Classification_Metrics.ipynb
%run ./Essential_Libs.ipynb

Read the Train and Validation data which is pre-processed in R. 

In [110]:
train_data = pd.read_csv("Upgrad_Train_Std.csv")
val_data = pd.read_csv("Upgrad_Val_Std.csv")

In [111]:
train_data.head(2)

Unnamed: 0,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,age,job,marital,education,default,housing,loan,poutcome,y
0,-0.56312,-0.348902,0.64393,0.72025,0.881397,0.708776,0.327534,Middle,admin.,married,basic.6y,no,no,no,nonexistent,no
1,-0.56312,-0.348902,0.64393,0.72025,0.881397,0.708776,0.327534,Middle,services,married,basic.9y,unknown,no,no,nonexistent,no


In [112]:
train_y = train_data["y"]
val_y = val_data["y"]

train_data.drop('y', axis = 1, inplace=True)
val_data.drop('y', axis = 1, inplace=True)

In [113]:
train_data.dtypes

campaign          float64
previous          float64
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
age                object
job                object
marital            object
education          object
default            object
housing            object
loan               object
poutcome           object
dtype: object

Conversion to Categorical 

In [118]:
for col in ['age', 'job', 'marital','education','default','housing','loan','poutcome']:
    train_data[col] = train_data[col].astype('category')
    
for col in ['age', 'job', 'marital','education','default','housing','loan','poutcome']:
    val_data[col] = val_data[col].astype('category')

In [119]:
print(train_data.shape,val_data.shape)

(28832, 15) (12356, 15)


In [120]:
train_num = train_data.select_dtypes(exclude="category")
val_num = val_data.select_dtypes(exclude="category")

train_cat = train_data.select_dtypes(include="category")
val_cat = val_data.select_dtypes(include="category")

In [121]:
print(train_num.shape, train_cat.shape, val_num.shape, val_cat.shape)

(28832, 7) (28832, 8) (12356, 7) (12356, 8)


Dummyfication

In [122]:
train_cat_dum = pd.get_dummies(train_cat,prefix_sep='_',drop_first=True)
val_cat_dum = pd.get_dummies(val_cat,prefix_sep='_',drop_first=True)

In [123]:
print(train_cat_dum.shape,val_cat_dum.shape)

(28832, 31) (12356, 31)


In [124]:
train_x = pd.concat([train_num,train_cat_dum], axis = 1)
val_x = pd.concat([val_num,val_cat_dum], axis = 1)

In [125]:
print(train_x.shape, val_x.shape)

(28832, 38) (12356, 38)


### Model Building

### XGBoost Classifier

In [540]:
xg_1 = XGBClassifier(learning_rate=0.009,max_depth=25,n_estimators=900,n_jobs=-1,scale_pos_weight=0.6)

In [541]:
%time xg_1.fit(train_x, train_y)

Wall time: 5min 4s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.009, max_delta_step=0,
       max_depth=25, min_child_weight=1, missing=None, n_estimators=900,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=0.6,
       seed=None, silent=True, subsample=1)

In [542]:
%time xg_train = xg_1.predict(train_x)

Wall time: 16.3 s


In [543]:
%time xg_val = xg_1.predict(val_x)

Wall time: 5.56 s


In [544]:
print(accuracy_score(train_y, xg_train))

0.9455812985571587


In [545]:
print(accuracy_score(val_y, xg_val))

0.8915506636451926


In [546]:
print(classification_report(train_y,xg_train))

              precision    recall  f1-score   support

          no       0.94      1.00      0.97     25584
         yes       0.99      0.52      0.68      3248

   micro avg       0.95      0.95      0.95     28832
   macro avg       0.97      0.76      0.83     28832
weighted avg       0.95      0.95      0.94     28832



In [547]:
print(classification_report(val_y,xg_val))

              precision    recall  f1-score   support

          no       0.91      0.98      0.94     10964
         yes       0.55      0.22      0.31      1392

   micro avg       0.89      0.89      0.89     12356
   macro avg       0.73      0.60      0.63     12356
weighted avg       0.87      0.89      0.87     12356



In [548]:
cm_train = pd.DataFrame(confusion_matrix(train_y,xg_train));print(cm_train)

       0     1
0  25571    13
1   1556  1692


In [549]:
cm_val = pd.DataFrame(confusion_matrix(val_y,xg_val));print(cm_val)

       0    1
0  10709  255
1   1085  307


### SVC

In [134]:
svc_1 = SVC(C=0.01,gamma='scale',class_weight='balanced')

In [135]:
%time svc_1.fit(train_x,train_y)

Wall time: 59.4 s


SVC(C=0.01, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [136]:
%time svc_train = svc_1.predict(train_x)

Wall time: 24.6 s


In [137]:
%time svc_val = svc_1.predict(val_x)

Wall time: 10.6 s


In [138]:
print(accuracy_score(train_y, svc_train))

0.7205882352941176


In [139]:
print(accuracy_score(val_y, svc_val))

0.7180317254775008


In [140]:
print(classification_report(train_y,svc_train))

              precision    recall  f1-score   support

          no       0.95      0.72      0.82     25584
         yes       0.24      0.71      0.36      3248

   micro avg       0.72      0.72      0.72     28832
   macro avg       0.60      0.71      0.59     28832
weighted avg       0.87      0.72      0.77     28832



In [141]:
print(classification_report(val_y,svc_val))

              precision    recall  f1-score   support

          no       0.95      0.72      0.82     10964
         yes       0.25      0.72      0.37      1392

   micro avg       0.72      0.72      0.72     12356
   macro avg       0.60      0.72      0.59     12356
weighted avg       0.87      0.72      0.77     12356



In [142]:
cm_train = pd.DataFrame(confusion_matrix(train_y,svc_train));print(cm_train)

       0     1
0  18481  7103
1    953  2295


In [143]:
cm_val = pd.DataFrame(confusion_matrix(val_y,svc_val));print(cm_val)

      0     1
0  7867  3097
1   387  1005


### ADA Boost

In [641]:
ada_11 = AdaBoostClassifier()

In [642]:
 %time ada_11.fit(train_x,train_y)

Wall time: 2.1 s


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

In [643]:
%time ada_train = ada_11.predict(train_x)

Wall time: 209 ms


In [644]:
%time ada_val = ada_11.predict(val_x)

Wall time: 76.8 ms


In [645]:
print(accuracy_score(train_y, ada_train))

0.8994519977802442


In [646]:
print(accuracy_score(val_y, ada_val))

0.8982680479119456


In [647]:
print(classification_report(train_y,ada_train))

              precision    recall  f1-score   support

          no       0.91      0.99      0.95     25584
         yes       0.67      0.21      0.32      3248

   micro avg       0.90      0.90      0.90     28832
   macro avg       0.79      0.60      0.63     28832
weighted avg       0.88      0.90      0.88     28832



In [648]:
print(classification_report(val_y,ada_val))

              precision    recall  f1-score   support

          no       0.91      0.99      0.95     10964
         yes       0.65      0.21      0.31      1392

   micro avg       0.90      0.90      0.90     12356
   macro avg       0.78      0.60      0.63     12356
weighted avg       0.88      0.90      0.87     12356



In [649]:
cm_train = pd.DataFrame(confusion_matrix(train_y,ada_train));print(cm_train)

       0    1
0  25253  331
1   2568  680


In [650]:
cm_val = pd.DataFrame(confusion_matrix(val_y,ada_val));print(cm_val)

       0    1
0  10811  153
1   1104  288


### ADABoost Fine Tuning

In [631]:
ada_1 = AdaBoostClassifier(n_estimators=700,learning_rate=0.004)

In [632]:
%time ada_1.fit(train_x,train_y)

Wall time: 28.4 s


AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.004, n_estimators=700, random_state=None)

In [633]:
%time ada_train = ada_1.predict(train_x)

Wall time: 3.05 s


In [634]:
%time ada_val = ada_1.predict(val_x)

Wall time: 974 ms


In [635]:
print(accuracy_score(train_y, ada_train))

0.8994866814650389


In [636]:
print(accuracy_score(val_y, ada_val))

0.8987536419553254


In [637]:
print(classification_report(train_y,ada_train))

              precision    recall  f1-score   support

          no       0.90      0.99      0.95     25584
         yes       0.73      0.17      0.28      3248

   micro avg       0.90      0.90      0.90     28832
   macro avg       0.82      0.58      0.61     28832
weighted avg       0.88      0.90      0.87     28832



In [638]:
print(classification_report(val_y,ada_val))

              precision    recall  f1-score   support

          no       0.90      0.99      0.95     10964
         yes       0.71      0.17      0.28      1392

   micro avg       0.90      0.90      0.90     12356
   macro avg       0.80      0.58      0.61     12356
weighted avg       0.88      0.90      0.87     12356



In [639]:
cm_train = pd.DataFrame(confusion_matrix(train_y,ada_train));print(cm_train)

       0    1
0  25372  212
1   2686  562


In [640]:
cm_val = pd.DataFrame(confusion_matrix(val_y,ada_val));print(cm_val)

       0    1
0  10863  101
1   1150  242


### Random Forest

In [600]:
rfc_1 = RandomForestClassifier(n_estimators=900,class_weight='balanced',n_jobs=-1,max_depth=3,max_leaf_nodes=6,
                               min_samples_leaf=100)

In [601]:
%time rfc_1.fit(train_x,train_y)

Wall time: 5.49 s


RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=3, max_features='auto',
            max_leaf_nodes=6, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=100,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=900, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [602]:
%time rfc_train = rfc_1.predict(train_x)

Wall time: 2.27 s


In [603]:
%time rfc_val = rfc_1.predict(val_x)

Wall time: 1.45 s


In [604]:
print(accuracy_score(train_y, rfc_train))

0.7633185349611543


In [605]:
print(accuracy_score(val_y, rfc_val))

0.7641631595985756


In [606]:
print(classification_report(train_y,rfc_train))

              precision    recall  f1-score   support

          no       0.95      0.77      0.85     25584
         yes       0.28      0.68      0.39      3248

   micro avg       0.76      0.76      0.76     28832
   macro avg       0.61      0.73      0.62     28832
weighted avg       0.87      0.76      0.80     28832



In [607]:
print(classification_report(val_y,rfc_val))

              precision    recall  f1-score   support

          no       0.95      0.77      0.85     10964
         yes       0.28      0.69      0.40      1392

   micro avg       0.76      0.76      0.76     12356
   macro avg       0.62      0.73      0.63     12356
weighted avg       0.88      0.76      0.80     12356



In [608]:
cm_train = pd.DataFrame(confusion_matrix(train_y,rfc_train));print(cm_train)

       0     1
0  19792  5792
1   1032  2216


In [609]:
cm_val = pd.DataFrame(confusion_matrix(val_y,rfc_val));print(cm_val)

      0     1
0  8480  2484
1   430   962
