In [1]:
from __future__ import division
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.manifold import MDS
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score,roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
#from analytics_tools.analytics_tools3 import freq_discrete
%matplotlib inline

In [2]:
df = pd.read_excel('Data/churn.xlsx')

In [3]:
df.head()

Unnamed: 0,V_ACCT_LGHT,C_IP,C_VMP,V_N_VM_MESS,V_T_DAY_CHG,V_T_EVE_CALLS,V_T_EVE_CHG,V_T_NIG_CALLS,V_T_NIG_CHG,V_T_INT_CALLS,V_T_INT_CHG,V_CS_CALLS,TARGET,ID
0,128,No,Yes,25,45.07,99,16.78,91,11.01,3,2.7,1,0,1
1,107,No,Yes,26,27.47,103,16.62,103,11.45,3,3.7,1,0,2
2,137,No,No,0,41.38,110,10.3,104,7.32,5,3.29,0,0,3
3,65,No,No,0,21.95,83,19.42,111,9.4,6,3.43,4,1,11
4,168,No,No,0,21.9,71,8.92,128,6.35,2,3.02,1,0,13


In [4]:
df_mini = df.sample(frac=0.1)

In [5]:
var_disc = [v for v in df.columns if v[:2]=='C_']
var_cont = [v for v in df.columns if v[:2]=='V_']

In [6]:
for v in var_disc:
    df[v] = (df[v]=='Yes').astype(int)
    df_mini[v] = (df_mini[v]=='Yes').astype(int)

In [7]:
df.head()

Unnamed: 0,V_ACCT_LGHT,C_IP,C_VMP,V_N_VM_MESS,V_T_DAY_CHG,V_T_EVE_CALLS,V_T_EVE_CHG,V_T_NIG_CALLS,V_T_NIG_CHG,V_T_INT_CALLS,V_T_INT_CHG,V_CS_CALLS,TARGET,ID
0,128,0,1,25,45.07,99,16.78,91,11.01,3,2.7,1,0,1
1,107,0,1,26,27.47,103,16.62,103,11.45,3,3.7,1,0,2
2,137,0,0,0,41.38,110,10.3,104,7.32,5,3.29,0,0,3
3,65,0,0,0,21.95,83,19.42,111,9.4,6,3.43,4,1,11
4,168,0,0,0,21.9,71,8.92,128,6.35,2,3.02,1,0,13


In [8]:
X = df[var_cont+var_disc].copy()
y = df['TARGET'].copy()
X_mini = df_mini[var_cont+var_disc].copy()
y_mini = df_mini['TARGET'].copy()

In [9]:
pca_pipe = make_pipeline(StandardScaler(),PCA(n_components=10),MinMaxScaler())

In [10]:
pca_pipe.fit(X)

Pipeline(steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('pca', PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)), ('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1)))])

In [11]:
pca_pipe.steps[1][1].explained_variance_ratio_.cumsum()

array([0.16364366, 0.2557176 , 0.34409294, 0.4301247 , 0.51546015,
       0.59912232, 0.68175932, 0.76319946, 0.84206515, 0.92016482])

In [12]:
Xp = pd.DataFrame(pca_pipe.transform(X),columns=['p%d'%i for i in range(1,11)])

In [13]:
mds_pipe = make_pipeline(MinMaxScaler(),MDS(n_components=3))

In [14]:
mds_pipe.fit(X_mini)

Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('mds', MDS(dissimilarity='euclidean', eps=0.001, max_iter=300, metric=True,
  n_components=3, n_init=4, n_jobs=1, random_state=None, verbose=0))])

In [15]:
Xm = pd.DataFrame(mds_pipe.fit_transform(X_mini),columns=['d1','d2','d3'])

In [16]:
Xm.head()

Unnamed: 0,d1,d2,d3
0,0.988621,0.28355,0.455957
1,-0.73323,1.01156,0.347799
2,0.304958,-0.371884,0.114151
3,0.066059,-0.727116,-0.454919
4,0.147532,-0.251279,-0.093327


In [30]:
Xt,Xv,yt,yv = train_test_split(Xm,y_mini,train_size=0.7)

In [31]:
def metricas(model,Xt,Xv,yt,yv):
    print "Accuracy_Train %.2f, Accuracy_Validate %.2f "%(accuracy_score(y_pred=model.predict(Xt),y_true=yt),accuracy_score(y_pred=model.predict(Xv),y_true=yv))
    print "ROC_Train %.2f, ROC_Validate %.2f "%(roc_auc_score(y_score=model.predict_proba(Xt)[:,1],y_true=yt),roc_auc_score(y_score=model.predict_proba(Xv)[:,1],y_true=yv))


In [32]:
model = SVC(probability=True)

In [33]:
param_grid = dict(kernel = ['linear', 'poly', 'rbf', 'sigmoid'],
                  degree = range(1,6),
                  gamma = np.arange(0.5,1,0.05),
                  coef0 = range(1,13),
                  decision_function_shape = ['ovo', 'ovr']
                )

In [34]:
grid = RandomizedSearchCV(cv=3,
                          verbose=True,
                          param_distributions=param_grid,
                          estimator=model,
                          n_jobs=-1,
                          n_iter=300,
                          scoring='roc_auc')

In [35]:
%%time
grid.fit(Xm,y_mini)

Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Done 290 tasks      | elapsed:   16.2s


CPU times: user 2.18 s, sys: 239 ms, total: 2.42 s
Wall time: 39.2 s


[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:   39.1s finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          fit_params={}, iid=True, n_iter=300, n_jobs=-1,
          param_distributions={'decision_function_shape': ['ovo', 'ovr'], 'coef0': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], 'gamma': array([0.5 , 0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95]), 'degree': [1, 2, 3, 4, 5], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring='roc_auc', verbose=True)

In [36]:
grid.best_estimator_

SVC(C=1.0, cache_size=200, class_weight=None, coef0=3,
  decision_function_shape='ovo', degree=2, gamma=0.9500000000000004,
  kernel='poly', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [37]:
metricas(grid.best_estimator_,Xt,Xv,yt,yv)

Accuracy_Train 0.90, Accuracy_Validate 0.87 
ROC_Train 0.23, ROC_Validate 0.24 
