In [108]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

# DATA Analysis

In [109]:
df = pd.read_csv('train_wn75k28.csv')

In [110]:
df.describe()

Unnamed: 0,id,campaign_var_1,campaign_var_2,products_purchased,user_activity_var_1,user_activity_var_2,user_activity_var_3,user_activity_var_4,user_activity_var_5,user_activity_var_6,user_activity_var_7,user_activity_var_8,user_activity_var_9,user_activity_var_10,user_activity_var_11,user_activity_var_12,buy
count,39161.0,39161.0,39161.0,18250.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0,39161.0
mean,19581.0,6.523812,6.452746,2.154137,0.400092,0.006716,0.102832,0.011465,0.151503,0.499834,0.286612,0.174434,0.01144,0.000383,0.218942,0.000562,0.05102
std,11304.951283,3.472944,2.614296,0.779815,0.509194,0.081676,0.303743,0.106463,0.359681,0.558166,0.455784,0.379689,0.106346,0.019568,0.431544,0.023696,0.220042
min,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,9791.0,4.0,5.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,19581.0,6.0,6.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,29371.0,9.0,8.0,3.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,39161.0,16.0,15.0,4.0,3.0,1.0,1.0,1.0,2.0,3.0,2.0,2.0,1.0,1.0,4.0,1.0,1.0


# Data Splitting using Startified split over Buy

In [111]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df, test_size=0.2, stratify=df['buy'], shuffle=1000, random_state=0)

In [112]:
len(train), len(test)

(31328, 7833)

In [113]:
train_y = train['buy']
train_x = train.drop('buy', axis=1)
test_y = test['buy']
test_x = test.drop('buy', axis=1)

# Preprocessing and Feature Engineering

In [114]:
def preprocess(data):
    data[['created_at', 'signup_date']] = data[['created_at', 'signup_date']].apply(pd.to_datetime)
    max_date = data['created_at'].max()
    data['created_days'] = data['created_at'].apply(lambda x: (max_date-x).days)
    data['is_signed'] = data['signup_date'].apply(lambda x: 0 if pd.isna(x) else 1)
    data['signed_days'] = data.apply(lambda x: (x['created_at']-x['signup_date']).days, axis=1).fillna(-1)
    
    float_cols = data.select_dtypes(np.float64)
    float_cols.fillna(0, inplace=True)
    float_cols = float_cols.apply(np.int64)
    
    num_cols = data.select_dtypes(np.int64)
    num_cols.drop('id', axis=1, inplace=True)
    num_cols.fillna(0, inplace=True)
    
    features_data = pd.concat([num_cols, float_cols], axis=1)
    features = features_data.columns
    
    return features_data

In [115]:
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, classification_report
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    class_res = classification_report(predictions, test_labels)
    print(class_res)
    return class_res

# Testing different Modelling Approaches

In [116]:
## RandomForest
from sklearn.preprocessing import StandardScaler
train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(train_feats, train_y)

test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(clf, test_data, test_y)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7584
           1       0.56      0.91      0.70       249

    accuracy                           0.97      7833
   macro avg       0.78      0.94      0.84      7833
weighted avg       0.98      0.97      0.98      7833



In [96]:
# applying Randomized Search CV
n_estimators  =[int(x) for x in np.linspace(100, 500, num=5)]
min_samples_split = [int(x) for x in np.linspace(2, 5, num=2)]
min_samples_leaf = [1,2,3]
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
bootstrap = [True, False]


random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

clf = RandomForestClassifier()
grid_search = RandomizedSearchCV(estimator = clf, param_distributions= random_grid, 
                          cv = 3, n_jobs = -1, verbose = 2, n_iter=50)


In [97]:
grid_search.fit(train_feats, train_y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=50,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 3],
                                        'min_samples_split': [2, 5],
                                        'n_estimators': [100, 200, 300, 400,
                                                         500]},
                   verbose=2)

In [98]:
grid_search.best_params_
model = grid_search.best_estimator_

test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(model, test_data, test_y)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7590
           1       0.56      0.92      0.70       243

    accuracy                           0.98      7833
   macro avg       0.78      0.95      0.84      7833
weighted avg       0.98      0.98      0.98      7833



In [99]:
# creating submission file
test_df = pd.read_csv('test_Wf7sxXF.csv')
test_data = preprocess(test_df)
test_data = scaler.transform(test_data)
preds = model.predict(test_data)
test_df['buy'] = preds
submit_df = test_df[['id', 'buy']]
submit_df.to_csv("submission_forest.csv", index=False)

In [101]:
## SVM
train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

from sklearn.svm import SVC
clf = SVC()
clf.fit(train_feats, train_y)

test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(clf, test_data, test_y)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7599
           1       0.55      0.94      0.70       234

    accuracy                           0.98      7833
   macro avg       0.78      0.96      0.84      7833
weighted avg       0.98      0.98      0.98      7833



In [102]:
## XGBOOST
from sklearn.preprocessing import StandardScaler
train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier()
clf.fit(train_feats, train_y)

test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(clf, test_data, test_y)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7588
           1       0.57      0.93      0.70       245

    accuracy                           0.98      7833
   macro avg       0.78      0.95      0.85      7833
weighted avg       0.98      0.98      0.98      7833



In [103]:
test_df = pd.read_csv('test_Wf7sxXF.csv')

In [104]:
# prediction using SVM
test_data = preprocess(test_df)
test_data = scaler.transform(test_data)
preds = clf.predict(test_data)
test_df['buy'] = preds

In [105]:
submit_df = test_df[['id', 'buy']]
submit_df.to_csv("submission_svm.csv", index=False)

In [117]:
# trying all binary classifiction

from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import AdaBoostClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [118]:
names = [
    "Neural Net",
    "AdaBoost",
    "QDA"
]

classifiers = [
    MLPClassifier(),
    AdaBoostClassifier(),
    QuadraticDiscriminantAnalysis()
]

In [119]:
for name, clf in zip(names, classifiers):
    print("============================")
    print("Training classifier:", name)
    train_feats = preprocess(train_x)
    features = train_feats.columns
    scaler =  StandardScaler()
    train_feats = scaler.fit_transform(train_feats)
    clf.fit(train_feats, train_y)
    test_feats = preprocess(test_x)
    test_data = scaler.transform(test_feats)
    result = evaluate(clf, test_data, test_y)
    
    #submission
    test_df = pd.read_csv('test_Wf7sxXF.csv')
    test_data = preprocess(test_df)
    test_data = scaler.transform(test_data)
    preds = clf.predict(test_data)
    test_df['buy'] = preds
    
    submit_df = test_df[['id', 'buy']]
    submit_df.to_csv("submission_{}.csv".format(name), index=False)

Training classifier: Neural Net




              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7559
           1       0.59      0.87      0.71       274

    accuracy                           0.97      7833
   macro avg       0.80      0.92      0.85      7833
weighted avg       0.98      0.97      0.98      7833

Training classifier: AdaBoost
              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7591
           1       0.57      0.94      0.71       242

    accuracy                           0.98      7833
   macro avg       0.78      0.96      0.85      7833
weighted avg       0.98      0.98      0.98      7833

Training classifier: QDA




              precision    recall  f1-score   support

           0       0.95      0.97      0.96      7214
           1       0.53      0.34      0.41       619

    accuracy                           0.92      7833
   macro avg       0.74      0.66      0.69      7833
weighted avg       0.91      0.92      0.92      7833



In [120]:
# PCA and classification
from sklearn.decomposition import PCA

train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

pca = PCA()
train_pca = pca.fit_transform(train_feats)


clf = MLPClassifier()
clf.fit(train_pca, train_y)


test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
test_pca = pca.transform(test_data)

result = evaluate(clf, test_pca, test_y)
score = clf.score(test_pca, test_y)
print("Test SCORE: ", score)

result = evaluate(clf, train_pca, train_y)
score = clf.score(train_pca, train_y)
print("Train SCORE: ", score)



              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7565
           1       0.59      0.88      0.70       268

    accuracy                           0.97      7833
   macro avg       0.79      0.93      0.85      7833
weighted avg       0.98      0.97      0.98      7833

Test SCORE:  0.9747223286097281
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     30323
           1       0.61      0.97      0.75      1005

    accuracy                           0.98     31328
   macro avg       0.80      0.97      0.87     31328
weighted avg       0.99      0.98      0.98     31328

Train SCORE:  0.9789645045965271


In [121]:
# GRID search on Neural Nets
from sklearn.model_selection import GridSearchCV
grid_params = {'activation': ['tanh', 'relu'],
              'alpha': [0.1, 0.001, 0.0001],
              'early_stopping': [True],
              'hidden_layer_sizes': [(50,),(100,), (150,)]
              }
model = MLPClassifier()
grid_search = GridSearchCV(model, grid_params, cv = 3, n_jobs = 3, verbose = 2)

In [122]:
train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

grid_search.fit(train_feats, train_y)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


GridSearchCV(cv=3, estimator=MLPClassifier(), n_jobs=3,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.1, 0.001, 0.0001],
                         'early_stopping': [True],
                         'hidden_layer_sizes': [(50,), (100,), (150,)]},
             verbose=2)

In [125]:
params = grid_search.best_params_

In [126]:
best_model = grid_search.best_estimator_

test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(best_model, test_data, test_y)
score = best_model.score(test_data, test_y)
print("Test SCORE: ", score)

result = evaluate(best_model, train_feats, train_y)
score = best_model.score(train_feats, train_y)
print("Train SCORE: ", score)

              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7583
           1       0.57      0.91      0.70       250

    accuracy                           0.98      7833
   macro avg       0.78      0.94      0.84      7833
weighted avg       0.98      0.98      0.98      7833

Test SCORE:  0.9752329886378144
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     30339
           1       0.57      0.92      0.71       989

    accuracy                           0.98     31328
   macro avg       0.78      0.95      0.85     31328
weighted avg       0.98      0.98      0.98     31328

Train SCORE:  0.9757724719101124


In [127]:
 #submission
test_df = pd.read_csv('test_Wf7sxXF.csv')
test_data = preprocess(test_df)
test_data = scaler.transform(test_data)
preds = best_model.predict(test_data)
test_df['buy'] = preds

submit_df = test_df[['id', 'buy']]
submit_df.to_csv("submission_{}.csv".format('Neural_net_GCV'), index=False)

In [128]:
# Train model on complete data
train_y = df['buy']
train_x = df.drop('buy', axis=1)

train_feats = preprocess(train_x)
features = train_feats.columns
scaler =  StandardScaler()
train_feats = scaler.fit_transform(train_feats)

clf = MLPClassifier()
clf.fit(train_feats, train_y)


test_feats = preprocess(test_x)
test_data = scaler.transform(test_feats)
result = evaluate(clf, test_data, test_y)
score = clf.score(test_data, test_y)
print("Test SCORE: ", score)

result = evaluate(clf, train_feats, train_y)
score = clf.score(train_feats, train_y)
print("Train SCORE: ", score)


              precision    recall  f1-score   support

           0       1.00      0.98      0.99      7579
           1       0.60      0.94      0.73       254

    accuracy                           0.98      7833
   macro avg       0.80      0.96      0.86      7833
weighted avg       0.99      0.98      0.98      7833

Test SCORE:  0.9777862887782459
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     37905
           1       0.60      0.96      0.74      1256

    accuracy                           0.98     39161
   macro avg       0.80      0.97      0.86     39161
weighted avg       0.99      0.98      0.98     39161

Train SCORE:  0.9783968744414085


In [130]:
 #submission
test_df = pd.read_csv('test_Wf7sxXF.csv')
test_data = preprocess(test_df)
test_data = scaler.transform(test_data)
preds = clf.predict(test_data)
test_df['buy'] = preds

submit_df = test_df[['id', 'buy']]
submit_df.to_csv("submission_{}.csv".format('Neural_netfinal'), index=False)

Final submission On Neural Nets as they gave the Highest F-score on label 1