# Tabel of Contents
  - [Intro](#Intro)
  - [Data](#Data)
  - [Feature selection](#Feature-selection)
  - [Fitting the model](#Fitting-the-model)
    - [LogisticRegression](#LogisticRegression)
    - [XGboost](#XGboost)
  - [Ensemble](#Ensemble)
    - [RandomForestClassifier](#RandomForestClassifier)
    - [ExtraTreesClassifier](#ExtraTreesClassifier)
  - [Submission](#Submission)

# Intro
The [Santander Customer Competition](https://www.kaggle.com/c/santander-customer-satisfaction) on Kaggle provides us with 370 numerical variables and we have to predict whether a customer is satisfied or not. The evaluation metric is ROC AUC.

First, I try to run the Logistic Regression classifier from `sklearn` and then I run the xgbBoost classifier from `xgb` and compare the results.

# Data
First, I read in data.

In [83]:
import pandas as pd

train = pd.read_csv("train.csv")

In [84]:
train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [85]:
train.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [87]:
# Number of columns including dependent variable TARGET
train.shape

(76020, 371)

In [189]:
from tpot import TPOT
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
import numpy as np

# split data into train and test
X=train.drop(["TARGET"],axis=1)
sss = StratifiedShuffleSplit(train.TARGET.values, n_iter=10, train_size=0.75, test_size=0.25)

for train_index, test_index in sss:
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = train.TARGET[train_index], train.TARGET[test_index]

# Feature selection

In [194]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

clf = ExtraTreesClassifier()
selector = clf.fit(X_train, y_train)
# clf.feature_importances_ 
fs = SelectFromModel(selector, prefit=True)

X_train = fs.transform(X_train)
X_test = fs.transform(X_test)

print(X_train.shape, X_test.shape)

((57015, 49), (19005, 49))


# Fitting a model
## LogisticRegression

In [203]:
# Logistic classifier from sklearn
m1_log = linear_model.LogisticRegression(verbose=3, max_iter=2000)
m1_log.fit(X_test, y_test)

# calculate the auc
roc_auc_score(y_test, m1_log.predict(X_test),\
              average='macro')

[LibLinear]

0.50249521792395535

## Xgboost

In [202]:
import xgboost as xgb

# logistic classifier from xgboost
m2_xgb = xgb.XGBClassifier(n_estimators=20)
m2_xgb.fit(X_train, y_train, eval_metric="auc",\
        eval_set=[(X_test, y_test)])


[0]	validation_0-auc:0.797640
[1]	validation_0-auc:0.803615
[2]	validation_0-auc:0.810037
[3]	validation_0-auc:0.812200
[4]	validation_0-auc:0.812731
[5]	validation_0-auc:0.814204
[6]	validation_0-auc:0.815038
[7]	validation_0-auc:0.814962
[8]	validation_0-auc:0.814809
[9]	validation_0-auc:0.816464
[10]	validation_0-auc:0.813628
[11]	validation_0-auc:0.814299
[12]	validation_0-auc:0.815141
[13]	validation_0-auc:0.814916
[14]	validation_0-auc:0.813649
[15]	validation_0-auc:0.813659
[16]	validation_0-auc:0.818638
[17]	validation_0-auc:0.819414
[18]	validation_0-auc:0.822143
[19]	validation_0-auc:0.822536


XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=20, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [204]:
# calculate the auc score
roc_auc_score(y_test, m2_xgb.predict(X_test),\
              average='macro')

0.5

# Ensemble
## RandomForestClassifier

In [206]:
from sklearn.ensemble import RandomForestClassifier

m3_rf = RandomForestClassifier(n_estimators=10)
m3_rf.fit(X_train, y_train)

# calculate the auc score
roc_auc_score(y_test, m3_rf.predict(X_test),\
              average='macro')

0.51170909241383811

## ExtraTreesClassifier

In [207]:
# Extremely Randomized Trees
from sklearn.ensemble import ExtraTreesClassifier

m4_rf = ExtraTreesClassifier(n_estimators=100, max_depth=None, 
                             min_samples_split=1, random_state=0)
m4_rf.fit(X_test, y_test)

# calculate the auc score
roc_auc_score(y_test, m4_rf.predict(X_test),\
              average='macro')

1.0

# Score the test set
The best model was the [`ExtraTreesClassifier`](#ExtraTreesClassifier).

In [178]:
test = pd.read_csv("test.csv")

probs = m4_rf.predict_proba(fs.transform(test))

submission = pd.DataFrame({"ID":test.ID, "TARGET": probs[:,1]})
submission.to_csv("submission.csv", index=False)

# Suggestions