# Intro
The [Santander Customer Competition](https://www.kaggle.com/c/santander-customer-satisfaction) on Kaggle provides us with numerical data and we have to predict whether a customer is satisfied or not. The evaluation metric is ROC AUC.

First, I try to run the Logistic Regression classifier from `sklearn` and then I run the xgbBoost classifier from `xgb` and compare the results.

# Data
First, I read in data.

In [83]:
import pandas as pd

train = pd.read_csv("train.csv")

In [84]:
train.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [85]:
train.describe()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
count,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,...,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0,76020.0
mean,75964.050723,-1523.199277,33.212865,86.208265,72.363067,119.529632,3.55913,6.472698,0.412946,0.567352,...,7.935824,1.365146,12.21558,8.784074,31.505324,1.858575,76.026165,56.614351,117235.8,0.039569
std,43781.947379,39033.462364,12.956486,1614.757313,339.315831,546.266294,93.155749,153.737066,30.604864,36.513513,...,455.887218,113.959637,783.207399,538.439211,2013.125393,147.786584,4040.337842,2852.579397,182664.6,0.194945
min,1.0,-999999.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5163.75,0.0
25%,38104.75,2.0,23.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67870.61,0.0
50%,76043.0,2.0,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106409.2,0.0
75%,113748.75,2.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118756.3,0.0
max,151838.0,238.0,105.0,210000.0,12888.03,21024.81,8237.82,11073.57,6600.0,6600.0,...,50003.88,20385.72,138831.63,91778.73,438329.22,24650.01,681462.9,397884.3,22034740.0,1.0


In [87]:
# Number of columns including dependent variable TARGET
train.shape

(76020, 371)

In [88]:
from tpot import TPOT
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn import linear_model
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
import numpy as np

# split data into train and test
X=train.drop(["TARGET"],axis=1)
sss = StratifiedShuffleSplit(train.TARGET.values, n_iter=10, train_size=0.75, test_size=0.25)

for train_index, test_index in sss:
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = train.TARGET[train_index], train.TARGET[test_index]

In [99]:
# Logistic classifier from sklearn
m1_log = linear_model.LogisticRegression(verbose=3, max_iter=2000)
selector = RFE(m1_log, 140, step=1, verbose = 2)
selector = selector.fit(X_test, y_test)
# selector.ranking_
# selector.support_

# calculate the auc
roc_auc_score(y_test, selector.predict(X_test),\
              average='macro')

Fitting estimator with 370 features.
[LibLinear]Fitting estimator with 369 features.
[LibLinear]Fitting estimator with 368 features.
[LibLinear]Fitting estimator with 367 features.
[LibLinear]Fitting estimator with 366 features.
[LibLinear]Fitting estimator with 365 features.
[LibLinear]Fitting estimator with 364 features.
[LibLinear]Fitting estimator with 363 features.
[LibLinear]Fitting estimator with 362 features.
[LibLinear]Fitting estimator with 361 features.
[LibLinear]Fitting estimator with 360 features.
[LibLinear]Fitting estimator with 359 features.
[LibLinear]Fitting estimator with 358 features.
[LibLinear]Fitting estimator with 357 features.
[LibLinear]Fitting estimator with 356 features.
[LibLinear]Fitting estimator with 355 features.
[LibLinear]Fitting estimator with 354 features.
[LibLinear]Fitting estimator with 353 features.
[LibLinear]Fitting estimator with 352 features.
[LibLinear]Fitting estimator with 351 features.
[LibLinear]Fitting estimator with 350 features.
[Li

0.5

# Xgboost

In [96]:
import xgboost as xgb

# logistic classifier from xgboost
m2_xgb = xgb.XGBClassifier(n_estimators=100, learning_rate=0.1)
m2_xgb.fit(X_train, y_train, eval_metric="auc",\
        eval_set=[(X_test, y_test)])


[0]	validation_0-auc:0.780637
[1]	validation_0-auc:0.806404
[2]	validation_0-auc:0.813139
[3]	validation_0-auc:0.816229
[4]	validation_0-auc:0.817942
[5]	validation_0-auc:0.820899
[6]	validation_0-auc:0.820286
[7]	validation_0-auc:0.820499
[8]	validation_0-auc:0.820525
[9]	validation_0-auc:0.822723
[10]	validation_0-auc:0.821481
[11]	validation_0-auc:0.824863
[12]	validation_0-auc:0.824845
[13]	validation_0-auc:0.823746
[14]	validation_0-auc:0.823989
[15]	validation_0-auc:0.825502
[16]	validation_0-auc:0.826346
[17]	validation_0-auc:0.826773
[18]	validation_0-auc:0.827537
[19]	validation_0-auc:0.828185
[20]	validation_0-auc:0.828431
[21]	validation_0-auc:0.828534
[22]	validation_0-auc:0.828015
[23]	validation_0-auc:0.828945
[24]	validation_0-auc:0.829813
[25]	validation_0-auc:0.830925
[26]	validation_0-auc:0.831623
[27]	validation_0-auc:0.832362
[28]	validation_0-auc:0.833567
[29]	validation_0-auc:0.834324
[30]	validation_0-auc:0.834278
[31]	validation_0-auc:0.834121
[32]	validation_0-

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [97]:
# calculate the auc score
roc_auc_score(y_test, m2_xgb.predict(X_test),\
              average='macro')

0.50196728809370883

# Suggestions