## Modeling - Part 2
- In this part we'll do over-sampling to our data
- I will just use default parameter settings due to limitation of my pc
- Pseudo cross validation (random seeds)

### Gradient Boosting Machines

In [18]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import confusion_matrix
import pandas as pd

train_df = pd.read_csv('./all.csv')

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_gbm = train_df.drop(['id'], axis=1)

X = train_gbm.drop(['churn'], axis=1).as_matrix()
y = train_gbm['churn'].as_matrix()

def oversample(X, y, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    gbm = GradientBoostingClassifier(n_estimators=100)
    gbm.fit(X_train_re, y_train_re)
    y_pred = gbm.predict_proba(X_test_re)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]

    print ("Seed: ", seed)
    auroc_gbm = roc_auc_score(y_test_re, y_positive)
    brier_gbm = brier_score_loss(y_test_re, y_positive)
    confusion = confusion_matrix(y_test_re, y_binary).ravel()
    print ("AUROC score: ", auroc_gbm)
    print ("Brier loss: ", brier_gbm)
    print ("[tn, fp, fn, tp]: ", confusion)
    return gbm
    
gbms = []
for i in range (10):
    gbms.append(oversample(X, y, i))

Seed:  0
AUROC score:  0.949903194258
Brier loss:  0.0803949679215
[tn, fp, fn, tp]:  [4584   91  761 3914]
Seed:  1
AUROC score:  0.949671322679
Brier loss:  0.0836135940213
[tn, fp, fn, tp]:  [4563   95  852 3806]
Seed:  2
AUROC score:  0.951311177547
Brier loss:  0.0794194603115
[tn, fp, fn, tp]:  [4574  114  719 3969]
Seed:  3
AUROC score:  0.95151581992
Brier loss:  0.0798976065483
[tn, fp, fn, tp]:  [4594   68  792 3870]
Seed:  4
AUROC score:  0.94635743944
Brier loss:  0.0835334397001
[tn, fp, fn, tp]:  [4567   95  827 3835]
Seed:  5
AUROC score:  0.942484934675
Brier loss:  0.0838280883991
[tn, fp, fn, tp]:  [4546  104  870 3780]
Seed:  6
AUROC score:  0.94852636335
Brier loss:  0.081966453975
[tn, fp, fn, tp]:  [4537  138  739 3936]
Seed:  7
AUROC score:  0.954063646079
Brier loss:  0.0773856908504
[tn, fp, fn, tp]:  [4595  109  721 3983]
Seed:  8
AUROC score:  0.950604912923
Brier loss:  0.0811442680311
[tn, fp, fn, tp]:  [4602   73  794 3881]
Seed:  9
AUROC score:  0.9526603

### Random Forest

In [21]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.metrics import confusion_matrix
import pandas as pd

train_df = pd.read_csv('./all.csv')

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_rf = train_df.drop(['id'], axis=1)

X = train_rf.drop(['churn'], axis=1).as_matrix()
y = train_rf['churn'].as_matrix()

def oversample_rf(X, y, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    rf = RandomForestClassifier(n_estimators=50)
    rf.fit(X_train_re, y_train_re)
    y_pred = rf.predict_proba(X_test_re)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]

    print ("Seed: ", seed)
    auroc_rf = roc_auc_score(y_test_re, y_positive)
    brier_rf = brier_score_loss(y_test_re, y_positive)
    confusion = confusion_matrix(y_test_re, y_binary).ravel()
    print ("AUROC score: ", auroc_rf)
    print ("Brier loss: ", brier_rf)
    print ("[tn, fp, fn, tp]: ", confusion)
    return gbm
    
rfs = []
for i in range (10):
    rfs.append(oversample_rf(X, y, i))

Seed:  0
AUROC score:  0.949715576654
Brier loss:  0.0839949946524
[tn, fp, fn, tp]:  [4596   79  882 3793]
Seed:  1
AUROC score:  0.949087669593
Brier loss:  0.0856335766423
[tn, fp, fn, tp]:  [4597   61  922 3736]
Seed:  2
AUROC score:  0.950181286692
Brier loss:  0.0830218003413
[tn, fp, fn, tp]:  [4621   67  843 3845]
Seed:  3
AUROC score:  0.94788905471
Brier loss:  0.0849760617761
[tn, fp, fn, tp]:  [4597   65  874 3788]
Seed:  4
AUROC score:  0.950072291449
Brier loss:  0.0850107679108
[tn, fp, fn, tp]:  [4593   69  887 3775]
Seed:  5
AUROC score:  0.948413689444
Brier loss:  0.0847737634409
[tn, fp, fn, tp]:  [4584   66  896 3754]
Seed:  6
AUROC score:  0.947378695416
Brier loss:  0.0861891764706
[tn, fp, fn, tp]:  [4582   93  881 3794]
Seed:  7
AUROC score:  0.95070320725
Brier loss:  0.0835778911565
[tn, fp, fn, tp]:  [4627   77  877 3827]
Seed:  8
AUROC score:  0.947794515142
Brier loss:  0.0899721925134
[tn, fp, fn, tp]:  [4598   77  987 3688]
Seed:  9
AUROC score:  0.95505

### How about Neural Nets?

In [20]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_mlp = train_df.drop(['id'], axis=1)

# scale the data to (0,1)
for col in list(train_mlp.columns[:-1]):
    temp = list(train_mlp[col])
    temp = [float(x) for x in temp]
    max_val = max(temp)
    train_mlp[col] = [x / max_val for x in temp]
    
X = train_mlp.drop(['churn'], axis=1).as_matrix()
y = train_mlp['churn'].as_matrix()

def oversample_mlp(X, y, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    mlp = MLPClassifier(hidden_layer_sizes=200, solver='adam')
    mlp.fit(X_train_re, y_train_re)
    y_pred = mlp.predict_proba(X_test_re)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]

    print ("Seed: ", seed)
    auroc_mlp = roc_auc_score(y_test_re, y_positive)
    brier_mlp = brier_score_loss(y_test_re, y_positive)
    confusion = confusion_matrix(y_test_re, y_binary).ravel()
    print ("AUROC score: ", auroc_mlp)
    print ("Brier loss: ", brier_mlp)
    print ("[tn, fp, fn, tp]: ", confusion)
    return mlp

mlps = []
for i in range(10):
    mlps.append(oversample_mlp(X, y, i))

Seed:  0
AUROC score:  0.673891595413
Brier loss:  0.231871836268
[tn, fp, fn, tp]:  [2709 1966 1559 3116]
Seed:  1
AUROC score:  0.679966422952
Brier loss:  0.236222938282
[tn, fp, fn, tp]:  [3171 1487 1933 2725]
Seed:  2
AUROC score:  0.666600022277
Brier loss:  0.233880111199
[tn, fp, fn, tp]:  [2938 1750 1888 2800]
Seed:  3
AUROC score:  0.664521894573
Brier loss:  0.239667509681
[tn, fp, fn, tp]:  [3106 1556 1944 2718]
Seed:  4
AUROC score:  0.66949887928
Brier loss:  0.232069854396
[tn, fp, fn, tp]:  [3254 1408 2168 2494]
Seed:  5
AUROC score:  0.673658550121
Brier loss:  0.23069034753
[tn, fp, fn, tp]:  [2563 2087 1415 3235]
Seed:  6
AUROC score:  0.656768452058
Brier loss:  0.239725040761
[tn, fp, fn, tp]:  [2472 2203 1548 3127]
Seed:  7
AUROC score:  0.676837396311
Brier loss:  0.233038447965
[tn, fp, fn, tp]:  [3029 1675 1803 2901]
Seed:  8
AUROC score:  0.67025934971
Brier loss:  0.233773672445
[tn, fp, fn, tp]:  [2497 2178 1314 3361]
Seed:  9
AUROC score:  0.67188626163
Bri

### Of course Linear model still doesn't work

In [23]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# We don't need ids here so just drop it for a while
ids = list(train_df.id)
train_lr = train_df.drop(['id'], axis=1)

# scale the data to (0,1)
for col in list(train_lr.columns[:-1]):
    temp = list(train_lr[col])
    temp = [float(x) for x in temp]
    max_val = max(temp)
    train_lr[col] = [x / max_val for x in temp]
    
X = train_lr.drop(['churn'], axis=1).as_matrix()
y = train_lr['churn'].as_matrix()

def oversample_lr(X, y, seed):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=seed)
    X_train_re, y_train_re = SMOTE().fit_sample(X_train, y_train)
    X_test_re, y_test_re = SMOTE().fit_sample(X_test, y_test)

    lr = LogisticRegression()
    lr.fit(X_train_re, y_train_re)
    y_pred = lr.predict_proba(X_test_re)
    y_positive = [x[1] for x in y_pred]
    y_binary = [0 if x[0] > x[1] else 1 for x in y_pred]

    print ("Seed: ", seed)
    auroc_lr = roc_auc_score(y_test_re, y_positive)
    brier_lr = brier_score_loss(y_test_re, y_positive)
    confusion = confusion_matrix(y_test_re, y_binary).ravel()
    print ("AUROC score: ", auroc_lr)
    print ("Brier loss: ", brier_lr)
    print ("[tn, fp, fn, tp]: ", confusion)
    return lr

lrs = []
for i in range(10):
    lrs.append(oversample_lr(X, y, i))

Seed:  0
AUROC score:  0.640617781464
Brier loss:  0.236317630224
[tn, fp, fn, tp]:  [2537 2138 1617 3058]
Seed:  1
AUROC score:  0.651604436455
Brier loss:  0.232288053047
[tn, fp, fn, tp]:  [2692 1966 1770 2888]
Seed:  2
AUROC score:  0.652136946121
Brier loss:  0.233541624158
[tn, fp, fn, tp]:  [2571 2117 1519 3169]
Seed:  3
AUROC score:  0.654067700721
Brier loss:  0.232216099662
[tn, fp, fn, tp]:  [2595 2067 1514 3148]
Seed:  4
AUROC score:  0.641526753818
Brier loss:  0.235277909203
[tn, fp, fn, tp]:  [2635 2027 1736 2926]
Seed:  5
AUROC score:  0.641530997803
Brier loss:  0.235405842619
[tn, fp, fn, tp]:  [2576 2074 1662 2988]
Seed:  6
AUROC score:  0.632028688267
Brier loss:  0.236875690695
[tn, fp, fn, tp]:  [2592 2083 1694 2981]
Seed:  7
AUROC score:  0.626626745511
Brier loss:  0.237426437193
[tn, fp, fn, tp]:  [2613 2091 1844 2860]
Seed:  8
AUROC score:  0.651694701021
Brier loss:  0.232624191984
[tn, fp, fn, tp]:  [2578 2097 1632 3043]
Seed:  9
AUROC score:  0.651427462036