## Modeling - Part 1
### What we learnt from the correlation matrix:
- The data may not be well suited to a linear model
- Price-related columns and power/energy-consumption-related columns are correlated
- How will the class imbalance in the data influence modeling?

In [1]:
import pandas as pd

# Read the dataset from ./all.csv; the later modeling cells all start from `train_df`.
train_df = pd.read_csv('./all.csv')
# Keep the column index in `cols` and echo it as the cell output.
cols = train_df.columns
cols

Index(['id', 'cons_12m', 'cons_gas_12m', 'cons_last_month',
       'cons_last_month.1', 'forecast_cons_12m', 'forecast_cons_year',
       'forecast_discount_energy', 'forecast_meter_rent_12m',
       'forecast_price_energy_p1', 'forecast_price_energy_p2',
       'forecast_price_pow_p1', 'has_gas', 'imp_cons', 'margin_gross_pow_ele',
       'margin_net_pow_ele', 'nb_prod_act', 'net_margin', 'num_years_antig',
       'pow_max', 'channel0', 'channel1', 'channel2', 'channel3', 'channel4',
       'channel5', 'channel6', 'days_activ', 'days_end', 'days_modif',
       'days_renewal', 'origin0', 'origin1', 'origin2', 'origin3', 'origin4',
       'prices_p1_var', 'prices_p2_var', 'prices_p3_var', 'prices_p1_fix',
       'prices_p2_fix', 'prices_p3_fix', 'churn'],
      dtype='object')

## Linear Approaches

### 1. Logistic Regression

In [4]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# The id column is not a feature: keep it aside and drop it from the table.
ids = list(train_df.id)
train_lr = train_df.drop(['id'], axis=1)

# Scale every feature by its column maximum (the 'churn' label is left untouched).
# Note: the result only lies in [0, 1] for columns without negative values.
features_lr = train_lr.drop(['churn'], axis=1).astype(float)
# A column whose maximum is 0 cannot be divided by it; use 1 so it stays unchanged.
col_max = features_lr.max().replace(0, 1.0)
train_lr = (features_lr / col_max).assign(churn=train_lr['churn'])

# `.values` replaces `as_matrix()`, which current pandas releases no longer provide.
X = train_lr.drop(['churn'], axis=1).values
y = train_lr['churn'].values

# Collect out-of-fold probabilities with 5-fold cross-validation (cv=5).
lr = LogisticRegression()
y_pred = cross_val_predict(lr, X, y, cv=5, method='predict_proba')
# Column 1 is the probability of the second class label (assumed to be churn == 1).
y_positive_pred = y_pred[:, 1]

auroc_lr = roc_auc_score(y, y_positive_pred)
brier_lr = brier_score_loss(y, y_positive_pred)

print('AUROC score: ', auroc_lr)
print('Brier score: ', brier_lr)

AUROC score:  0.63723398816
Brier score:  0.0857957663748


### 2. Linear SVMs

In [12]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# The id column is not a feature: keep it aside and drop it from the table.
ids = list(train_df.id)
train_svm = train_df.drop(['id'], axis=1)

# Scale every feature by its column maximum (the 'churn' label is left untouched).
# Note: the result only lies in [0, 1] for columns without negative values.
features_svm = train_svm.drop(['churn'], axis=1).astype(float)
# A column whose maximum is 0 cannot be divided by it; use 1 so it stays unchanged.
col_max = features_svm.max().replace(0, 1.0)
train_svm = (features_svm / col_max).assign(churn=train_svm['churn'])

# `.values` replaces `as_matrix()`, which current pandas releases no longer provide.
X = train_svm.drop(['churn'], axis=1).values
y = train_svm['churn'].values

# Collect out-of-fold probabilities with 5-fold cross-validation (cv=5).
# probability=True is required for SVC to expose predict_proba.
svc = SVC(kernel='linear', probability=True)
y_pred = cross_val_predict(svc, X, y, cv=5, method='predict_proba')
# Column 1 is the probability of the second class label (assumed to be churn == 1).
y_positive_pred = y_pred[:, 1]

auroc_svm = roc_auc_score(y, y_positive_pred)
brier_svm = brier_score_loss(y, y_positive_pred)

print('AUROC score: ', auroc_svm)
print('Brier score: ', brier_svm)

AUROC score:  0.542053111265
Brier score:  0.0874521740319


## Non-linear Approaches

### 1. Random Forest

In [3]:
import numpy as np
from multiprocessing import Pool
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# The id column is not a feature: keep it aside and drop it from the table.
ids = list(train_df.id)
train_rf = train_df.drop(['id'], axis=1)

# `.values` replaces `as_matrix()`, which current pandas releases no longer provide.
X = train_rf.drop(['churn'], axis=1).values
y = train_rf['churn'].values

# Full hyper-parameter grid for the random forest. Each entry is
# [n_estimators, max_features, max_depth, min_samples_leaf, class_weight],
# enumerated with the right-most parameter varying fastest.
params = [
    [n_trees, max_feature, max_depth, min_smp, balanced]
    for n_trees in [10, 20, 50]
    for max_feature in ['sqrt', 'log2', None]
    for max_depth in [2, 5, 10, None]
    for min_smp in [1, 5, 10]
    for balanced in ['balanced', None]
]

# Parallel lists: position i holds the configuration and scores of the i-th run.
results_dict = {'params': [], 'auroc': [], 'brier': []}

# Evaluate a single grid-search configuration with cross-validation
def parallel_learner(param, random_state=None, n_jobs=8):
    """Cross-validate one random-forest configuration and record its scores.

    Despite the name, this function does not start any worker processes
    itself; the only parallelism is the forest's own ``n_jobs`` setting.

    Parameters
    ----------
    param : sequence
        ``[n_estimators, max_features, max_depth, min_samples_leaf,
        class_weight]``, forwarded to ``RandomForestClassifier`` in that order.
    random_state : int or None, optional
        Seed forwarded to the forest so a run can be reproduced. The default
        (``None``) keeps the original unseeded behaviour.
    n_jobs : int, optional
        Number of jobs forwarded to the forest; defaults to 8.

    Returns
    -------
    None
        The configuration, its AUROC and its Brier score are appended to the
        module-level ``results_dict``. The module-level ``X`` and ``y`` are
        used as the data.
    """
    rf = RandomForestClassifier(n_estimators=param[0],
                                max_features=param[1],
                                max_depth=param[2],
                                min_samples_leaf=param[3],
                                class_weight=param[4],
                                n_jobs=n_jobs,
                                random_state=random_state)
    # Out-of-fold probabilities from 5-fold cross-validation (cv=5).
    y_pred = cross_val_predict(rf, X, y, cv=5, method='predict_proba')
    # Column 1 is the probability of the second class label (assumed to be churn == 1).
    y_positive_pred = y_pred[:, 1]

    auroc_rf = roc_auc_score(y, y_positive_pred)
    brier_rf = brier_score_loss(y, y_positive_pred)

    results_dict['params'].append(param)
    results_dict['auroc'].append(auroc_rf)
    results_dict['brier'].append(brier_rf)
    
# Evaluate the configurations one after another; each call appends its scores to results_dict.
for p in params:
    parallel_learner(p)

In [10]:
# Show the hyper-parameters recorded at position 183 of the grid-search results.
# NOTE(review): 183 is hard-coded — presumably the best-scoring configuration; confirm.
results_dict['params'][183]

[50, 'log2', 10, 5, None]

In [11]:
# Show the AUROC recorded at position 183 of the grid-search results.
# NOTE(review): 183 is hard-coded — presumably the best-scoring configuration; confirm.
results_dict['auroc'][183]

0.69789495995180828

### 2. Gradient Boosting Machines

In [24]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import roc_auc_score
from sklearn.metrics import brier_score_loss

# The id column is not a feature: keep it aside and drop it from the table.
ids = list(train_df.id)
train_gbm = train_df.drop(['id'], axis=1)

# `.values` replaces `as_matrix()`, which current pandas releases no longer provide.
X = train_gbm.drop(['churn'], axis=1).values
y = train_gbm['churn'].values

# A single GBM configuration (no grid search here), scored on out-of-fold
# probabilities from 5-fold cross-validation (cv=5).
gbm = GradientBoostingClassifier(n_estimators=200)
y_pred = cross_val_predict(gbm, X, y, cv=5, method='predict_proba')
# Column 1 is the probability of the second class label (assumed to be churn == 1).
y_positive_pred = y_pred[:, 1]

# Named *_gbm so these scores are not mistaken for random-forest results.
auroc_gbm = roc_auc_score(y, y_positive_pred)
brier_gbm = brier_score_loss(y, y_positive_pred)

print(auroc_gbm, brier_gbm)

0.693088502971 0.0814817589464


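### Performance is not very good overall. However, our guess that non-linear models would suit this data better than linear ones appears to be right (AUROC of roughly 0.69-0.70 for the non-linear models versus 0.54-0.64 for the linear ones). The poor performance may be related to the class imbalance.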