In [None]:
# Loading Data.
import pandas as pd
import numpy as np

# Read the training table; index_col=None means no CSV column is used as the index.
train = pd.read_csv("train.csv", index_col=None)

# Feature matrix: every column after the first one.
# NOTE(review): assumes the first column is not a feature and that "Class" is
# not among the remaining columns -- TODO confirm against train.csv.
X = train.iloc[:, 1:]
# Target labels, taken from the "Class" column.
Y = train["Class"]

# Preview the first ten rows as the cell's output.
train.head(10)

## Testing Pipelines
### Build a Pipeline

In [None]:
# Building Random Forest, Support Vector Machine and Bernoulli Naive Bayes classifier pipelines.
#
# Based on the code from the following source:
########################################
# Title: Tutorial 3-Titanic5 Building Pipelines and Model Comparison
# Author: Chuan Lu
# Date: 06/04/2017
# Code Version: 47c58c0
# Available: https://github.com/aberML/CSM6420/
#######################################
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import f_regression

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB

from sklearn.grid_search import GridSearchCV

def _make_anova_filter():
    """Return a fresh, unfitted univariate feature selector.

    Scores features with the ANOVA F-test for classification targets
    (`f_classif`); the number of features kept is set later through the
    pipeline parameter `anova__k`.
    """
    return SelectKBest(f_classif)


# Every pipeline gets its OWN transformer instances.  Sharing one SelectKBest
# object between pipelines means that `set_params(anova__k=...)` or `fit` on
# one pipeline silently changes the selector used by the other two.
scaler = StandardScaler()
# Stand-alone selector kept under its original name; it is not part of any
# pipeline below.
anova_filter = _make_anova_filter()

# Random Forest Classifier.
clf = RandomForestClassifier()
pipeline_rf = Pipeline([
    ('anova', _make_anova_filter()),
    ('rf', clf)
])

# Support Vector Machine.
# probability=True is required so that `predict_proba` is available.
clf = SVC(probability=True)
pipeline_svm = Pipeline([
    ('scale', scaler),
    ('anova', _make_anova_filter()),
    ('svc', clf)
])

# Bernoulli (true or false) Naive Bayes Classifier.
clf = BernoulliNB()
pipeline_bnb = Pipeline([
    ('anova', _make_anova_filter()),
    ('bnb', clf)
])


### Grid Search Parameters for the classifiers.

In [None]:
# Number of folds used for every cross-validation in this notebook.
kfolds = 8

# Map each short algorithm key to the pipeline that will be tuned.
pipeline_dict = {'rf': pipeline_rf, 'svm': pipeline_svm, 'bnb': pipeline_bnb}

# Hyperparameter grids, keyed like `pipeline_dict`.  Parameter names follow
# the pipeline convention `<step name>__<parameter>`.
parameter_grid_dict = {}
parameter_grid_dict['rf'] = {
            'anova__k': [10, 100, 'all'],
            'rf__n_estimators': [10, 100, 1000],
            'rf__max_depth': [5, 10, 50, 100, None],
        }

parameter_grid_dict['svm'] = {
            'anova__k': [5, 100, 500, 'all'],
            'svc__kernel': ['rbf', 'poly', 'sigmoid'],
            'svc__C': [0.01, 0.1, 1, 10],
            # gamma=0 is rejected by SVC ("The gamma value of 0.0 is invalid");
            # 'auto' is the supported spelling for 1 / n_features.
            'svc__gamma': ['auto', 0.1, 1]
        }

parameter_grid_dict['bnb'] = {
            'anova__k': [1, 5, 10, 'all'],
            # Eleven evenly spaced smoothing values from 0.0 to 1.0 inclusive.
            'bnb__alpha': np.linspace(0,1,11),
            'bnb__fit_prior': [True, False],
        }

# Run one exhaustive grid search per algorithm and keep the fitted search
# objects so a later cell can report on them.
grid_results = {}
for alg in ['rf', 'svm', 'bnb']:
    pipeline = pipeline_dict[alg]
    parameter_grid = parameter_grid_dict[alg]
    grid_search = GridSearchCV(pipeline, parameter_grid, cv=kfolds, verbose=3, n_jobs=8)
    grid_search.fit(X, Y)

    # Best mean cross-validated score and the parameters that produced it.
    print(grid_search.best_score_)
    print(grid_search.best_params_)

    grid_results[alg] = grid_search

### Best hyperparameters.

In [None]:
# Report, for every tuned algorithm, the best cross-validated score found by
# its grid search and the hyperparameters that achieved it.
for alg, grid_search in grid_results.items():
    print('Best accuracy for %s :' % alg)
    print(grid_search.best_score_)
    print(grid_search.best_params_)

In [None]:
# Hyperparameters used for the final cross-validation, keyed by algorithm.
# Parameter names follow the pipeline convention `<step name>__<parameter>`.
optimal_params = {
    'rf': {
        'anova__k': 100,
        'rf__max_depth': 100,
        'rf__n_estimators': 1000,
    },
    'svm': {
        'anova__k': 100,
        'svc__C': 0.01,
        'svc__gamma': 0.1,
        'svc__kernel': 'poly',
    },
    'bnb': {
        'anova__k': 'all',
        'bnb__alpha': 1.0,
        'bnb__fit_prior': True,
    },
}

## Cross-Validation

### ROC curves and AUC values code.

In [None]:
%matplotlib inline 
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# ROC plot and AUC statistic, based on the code from the following source:
########################################
# Title: Tutorial 3-Titanic3 Cross-validation with ROC analysis
# Author: Chuan Lu
# Date: 06/04/2017
# Code Version: 47c58c0
# Available: https://github.com/aberML/CSM6420/
########################################

# (Hacked together, but does the job ;-) )
# Module-level state shared by ROC_AUC() and display_plot().
# Per-fold accuracy and AUC values are appended by ROC_AUC(); display_plot()
# resets everything below after reporting.
mean_acc, mean_auc = 0.0, 0.0
all_tpr, all_acc, all_auc = [], [], []

# Number of folds processed so far (used to label each ROC curve).
i = 0

def ROC_AUC(y_test, y_prob):
    """Score one cross-validation fold and add its ROC curve to the current plot.

    Parameters
    ----------
    y_test : true labels of the fold.
        NOTE(review): the accuracy computation compares these with rounded
        probabilities, so labels are presumably 0/1 -- confirm.
    y_prob : predicted scores/probabilities passed to `roc_curve`.

    Returns
    -------
    (acc, roc_auc) : accuracy of the rounded predictions and the area under
        the ROC curve for this fold.

    Side effects: increments the global fold counter `i`, prints both
    metrics, draws the curve with `plt.plot`, and appends the metrics to the
    global lists `all_acc` and `all_auc`.
    """
    global i
    i += 1

    # Hard class labels: round each probability to the nearest integer.
    hard_labels = np.around(y_prob, decimals=0)

    # Fraction of samples whose rounded prediction matches the true label.
    n_correct = np.sum(y_test == hard_labels)
    fold_acc = n_correct * 1. / len(y_test)
    print("Prediction accuracy:", fold_acc)

    # ROC curve and the area under it.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, y_prob)
    fold_auc = auc(false_pos_rate, true_pos_rate)
    print("Area under ROC curve (AUC):", fold_auc)

    curve_label = 'ROC fold %d (area = %0.2f)' % (i, fold_auc)
    plt.plot(false_pos_rate, true_pos_rate, lw=1, label=curve_label)

    # Record this fold's metrics for the summary printed by display_plot().
    all_acc.append(fold_acc)
    all_auc.append(fold_auc)
    return fold_acc, fold_auc

def display_plot(title = 'Receiver operating characteristic example'):
    """Finish the current ROC figure, print the fold summary and reset state.

    Parameters
    ----------
    title : text placed above the plot.

    Draws the dashed diagonal reference line, fixes the axis limits, labels
    and legend, then prints the per-fold accuracy and AUC values collected in
    the global lists together with their mean and 1.96 * standard deviation.
    Finally resets all module-level accumulators and the fold counter so the
    helpers can be reused for the next classifier.
    """
    global i, mean_acc, mean_auc, all_tpr, all_acc, all_auc

    # Dashed grey diagonal from (0, 0) to (1, 1) as a reference line.
    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6))
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc="lower right")

    # Summarise the per-fold metrics gathered by ROC_AUC().
    acc_per_fold = np.asarray(all_acc)
    auc_per_fold = np.asarray(all_auc)

    print(acc_per_fold)
    # Mean and 1.96 * std across folds, for both accuracy and AUC.
    print("Mean Accuracy: %0.3f (+/- %0.3f)" % (acc_per_fold.mean(), acc_per_fold.std() * 1.96))
    print(auc_per_fold)
    print("Mean AUC: %0.3f (+/- %0.3f)" % (auc_per_fold.mean(), auc_per_fold.std() * 1.96))

    # Reset values for re-use.
    mean_acc = mean_auc = 0.0
    all_tpr, all_acc, all_auc = [], [], []
    i = 0


### K-fold Cross-Validation.

In [None]:
from sklearn.cross_validation import StratifiedKFold as SKFold

random_seed = 1234
# NOTE(review): per the scikit-learn documentation, StratifiedKFold only uses
# random_state when shuffle=True, so these folds are the unshuffled ones and
# the seed has no effect.  Left unchanged so the folds stay the same as before.
scv = SKFold(y=Y, n_folds=kfolds, random_state=random_seed)

# Per-fold metrics, one column per classifier, for the box plots at the end.
acc_clfs = pd.DataFrame()
auc_clfs = pd.DataFrame()

# The submission data does not change between folds, so read it once instead
# of re-reading the CSV every time a better fold model is found.
test = pd.read_csv("test.csv")
X_submission, Id = test.iloc[:,1:], test["TestId"]

for alg, title in [('rf', "Random Forest"),
                   ('svm', "Support Vector Machine"),
                   ('bnb', "Bernoulli Naive Bayes")]:
    pipeline = pipeline_dict[alg]
    params = optimal_params[alg]
    # The chosen hyperparameters are the same for every fold.
    pipeline.set_params(**params)
    plt.figure()

    best_acc = best_auc = 0

    for training_set, test_set in scv:
        X_train = X.iloc[training_set]
        y_train = Y.iloc[training_set]
        X_test = X.iloc[test_set]
        y_test = Y.iloc[test_set]
        print("Shape of training:")
        print("X:", X_train.shape, "y:", y_train.shape)
        print("Shape of testing:")
        print("X:", X_test.shape, "y:", y_test.shape)

        pipeline.fit(X_train, y_train)

        # Probability of the second class (column 1) for the held-out fold.
        y_pred = pipeline.predict_proba(X_test)[:,1]

        # Plot this fold's ROC curve and record its metrics.
        acc, roc_auc = ROC_AUC(y_test, y_pred)

        # Crudely assume the best model has the highest combined acc and ROC_AUC.
        if acc + roc_auc > best_auc + best_acc:
            # Save predictions for the best result.
            best_acc = acc
            best_auc = roc_auc

            # Prepare test predictions for submission.  A separate name is
            # used so the fold's own X_test is not overwritten.
            Y_pred = pipeline.predict_proba(X_submission)[:,1]

            pred_df = pd.DataFrame(Id)
            pred_df = pred_df.join(pd.DataFrame({"PredictedScore": Y_pred}))
            pred_df.to_csv("pred"+alg+".csv", index = False)

    # Copy the per-fold metrics before display_plot() resets the global lists.
    acc_clfs[alg] = np.asarray(all_acc)
    auc_clfs[alg] = np.asarray(all_auc)
    display_plot(title + " Receiver Operating Characteristic")
    plt.savefig("ROC_" + alg + ".pdf")

# Box plots comparing the classifiers; titles follow the configured fold count.
acc_clfs.plot(kind='box', title='Accuracy from %d-fold CV' % kfolds)
plt.savefig("acc.pdf")
auc_clfs.plot(kind='box', title='AUC from %d-fold CV' % kfolds)
plt.savefig("auc.pdf")

In [None]:
# Recorded cross-validation results (mean +/- 1.96 * std across folds):
# RF:
# Mean Accuracy: 0.869 (+/- 0.065)
# Mean AUC: 0.877 (+/- 0.081)
# SVM:
# Mean Accuracy: 0.841 (+/- 0.037)
# Mean AUC: 0.846 (+/- 0.071)
# BNB:
# Mean Accuracy: 0.864 (+/- 0.057)
# Mean AUC: 0.877 (+/- 0.089)

### System info.

In [1]:
import sys
import pandas as pd
import sklearn
# Report the interpreter and library versions the notebook was run with.
for component, version in (('Python: ', sys.version_info),
                           ('Pandas: ', pd.__version__),
                           ('Sklearn: ', sklearn.__version__)):
    print(component, version)

Python:  sys.version_info(major=3, minor=6, micro=5, releaselevel='final', serial=0)
Pandas:  0.22.0
Sklearn:  0.19.1
