### Objective:  In this notebook, I am doing a grid search using SGDClassifier for elastic net

I will do the grid search using 10 fold CV on a subset of the training data. 

### Load modules and the data

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, make_scorer

import numpy as np
import pandas as pd
import seaborn as sns
from os.path import join, split
from os import listdir
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_rows',999)
pd.set_option('display.max_columns', 999)
%matplotlib inline


In [2]:
base = join('/Users/crivera5/Documents/NonIntuitProjects/Kaggle/KaggleEEG','data_local')
data= pd.read_csv(join(base,'cleaned_training.csv'))

X = data.iloc[:, :-3].copy()
y = data.iloc[:,-1].copy()

### Split the data into test and train


In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=4)

### Set up the pipeline and fit the data

The pipeline below: 

1. scales the data, 
2. Trains a SDGlogistic regressor with GridSearchCV. 

I am trying 3 different parameters in my grid search: 

1. C: the tunning parameter. 
2. Class_weight: Toggling whether the regressor uses balanced or not balanced wieghting strategy to account for imbalances in the classes. 
3. penalty: 'L1' or 'L2'

In [4]:
# the space to search 
number_alpha= 40
start = -2
end = 2

scalar = StandardScaler()

sgd_clf = SGDClassifier(loss='hinge', 
                       penalty='elasticnet', 
                       l1_ratio=0.15, 
                       fit_intercept=True, 
                       n_iter=20, 
                       shuffle=True, verbose=0,
                       n_jobs=1, random_state=444, 
                       learning_rate='optimal', 
                       power_t=0.5, class_weight=None, 
                       warm_start=True, 
                       average=False)

sgd_pipeline = Pipeline([('scalar',StandardScaler()),
                     ('SGDclf', sgd_clf)])
parameters = {'SGDclf__alpha':np.logspace(start,end,number_alpha), 'SGDclf__class_weight':[None, 'balanced'],
              'SDGclf__l1_ratio':np.arange(0.05,1,0.05)}
sgd_model = GridSearchCV(sgd_pipeline, parameters, n_jobs = -1, scoring = 'roc_auc',verbose = True, cv = 10)
sgd_model

GridSearchCV(cv=10, error_score='raise',
       estimator=Pipeline(steps=[('scalar', StandardScaler(copy=True, with_mean=True, with_std=True)), ('SGDclf', SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=20, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=444, shuffle=True,
       verbose=0, warm_start=True))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'SGDclf__class_weight': [None, 'balanced'], 'SDGclf__l1_ratio': array([ 0.05,  0.1 ,  0.15,  0.2 ,  0.25,  0.3 ,  0.35,  0.4 ,  0.45,
        0.5 ,  0.55,  0.6 ,  0.65,  0.7 ,  0.75,  0.8 ,  0.85,  0.9 ,  0.95]), 'SGDclf__alpha': array([  1.00000e-02,   1.26638e-02,   1.60372e-02,   2.03...,   3.07029e+01,   3.88816e+01,
         4.92388e+01,   6.23551e+01,   7.89652e+01,   1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring

In [None]:
sgd_model.fit(X_train, y_train)

Fitting 10 folds for each of 1520 candidates, totalling 15200 fits


In [None]:
# save the model
from sklearn.externals import joblib
filename = join(base,'SGDRegressionModel.joblib.pkl')
joblib.dump(sgd_model, filename, compress=3)

### Get the best estimator of the bunch

Below I am getting the best model of the bunch. 

In [None]:
model.cv_results_.keys()

In [None]:
best_model  = model.best_estimator_

In [None]:
best_model.get_params()

In [None]:
best_model.get_params()

### Plot an ROC curve

In [None]:
def plot_roc_curves(model_data, lw = 0.8):
    '''plots an roc curve
    Parameters: 
        model_data: dicts obs and probs
        classifier
    Returns: 
        None
    '''
    for model in model_data:
        print model
        obs = model_data[model]['obs']
        probs = model_data[model]['probs']
        fpr, tpr, _= roc_curve(obs, probs)
        
        plt.plot(fpr, tpr,label = model, lw=lw)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic curves')
    plt.legend(loc = 0)
    
    return None

In [None]:
probs = [pred[1] for pred in best_model.predict_proba(X_test)]
model_data = {'Best L1 Logistic Regression': {'obs':y_test,
                                              'probs': probs}}

In [None]:
plot_roc_curves(model_data)

In [None]:
roc_auc_score(y_test, probs)

## Retrain the best model with all of the data

In [None]:
best_model.fit(X,y)

### Plot the Coeffecients

In [None]:
coefs =pd.Series(best_model.get_params()['clf'].coef_.tolist()[0])
coefs =pd.Series(best_model.get_params()['clf'].coef_.tolist()[0])
coefs.index = X_test.columns
coefs = coefs[coefs>0]

coefs.sort_values(inplace = True)

In [None]:
coefs.plot.barh(figsize= (8,26))

In [None]:
len(coefs)

### Predict the test data

In [None]:
def load_features(path):
    """loads the features of the data and removes undesired columns """
    
    remove = ['channel 4 RIR 1 ', 'channel 4 RIR 2 ', 'channel 4 RIR 3 ', 'channel 4 RIR 4 ', 'channel 4 RIR 5 ',
              'channel 4 RIR 6 ', 'channel 10 RIR 1 ', 'channel 10 RIR 2 ', 'channel 10 RIR 3 ', 'channel 10 RIR 4 ',
              'channel 10 RIR 5 ', 'channel 10 RIR 6 ', 'SpectralEntropy 4', 'SpectralEntropy 10']
    
    # read in the data 
    data =pd.read_csv(path)
        
    # drop the undesired columns 
    return data.drop(remove, axis = 1)


test_data = load_features(join(base,'testing.csv'))

In [None]:
X_testing = test_data.iloc[:,:-1].copy()
file_names = test_data['filename'].copy()

In [None]:
# There are Nan's so fill them
X_testing =X_testing.fillna(X_testing.mean(),axis = 0)

In [None]:
predictions = [pred[1] for pred in best_model.predict_proba(X_testing)]

In [None]:
submission = pd.DataFrame({'File':file_names, 'Class': predictions})
submission = submission[['File','Class']]

In [None]:
submission.head()

In [None]:
submission.Class.plot.density()

It seems to me that the first ones are more likely to be preictal. 

In [None]:
submission.Class.plot()

In [None]:
len(submission)

### Get files not in the data
The problem with this is that i did not have all the files. 

In [None]:
base2 = join('/Users/crivera5/Documents/NonIntuitProjects/Kaggle/KaggleEEG')
paths = [join(base2, 'test_1_new'), 
         join(base2,'test_2_new'),
         join(base2,'test_3_new')]

In [None]:
def get_data_files(list_o_paths):
    """This gets the data matlab files"""
    file_names = []
    for path in list_o_paths:
        files = [join(path, f) for f in listdir(path) if f.endswith('.mat')]
        file_names.extend(files)

    return file_names

In [None]:
unscored_files = [split(f)[1] for f in get_data_files(paths)]
unscored_files = [f for f in unscored_files if f not in list(submission['File'])]
unscored_files =pd.DataFrame({'File':unscored_files})
unscored_files['Class'] = submission.Class.mean()
unscored_files.head()

In [None]:
# merge them 
submission = pd.concat([submission, unscored_files])
submission.sort_values('File', inplace = True)

In [None]:
print len(submission)
submission.head()


In [None]:
submission.to_csv(join(base,'submission1.csv'), index = False)

### SDG Classifier with elasticnet

In [None]:
from sklearn.linear_model import SGDClassifier

In [None]:
# the space to search 
number_alpha= 40
start = -2
end = 2

scalar = StandardScaler()

sgd_clf = SGDClassifier(loss='hinge', 
                       penalty='elasticnet', 
                       l1_ratio=0.15, 
                       fit_intercept=True, 
                       n_iter=20, 
                       shuffle=True, verbose=0,
                       n_jobs=1, random_state=444, 
                       learning_rate='optimal', 
                       power_t=0.5, class_weight=None, 
                       warm_start=True, 
                       average=False)

sgd_pipeline = Pipeline([('scalar',StandardScaler()),
                     ('SGDclf', sgd_clf)])
parameters = {'SGDclf__alpha':np.logspace(start,end,number_alpha), 'SGDclf__class_weight':[None, 'balanced'],
              'SDGclf__l1_ratio':np.arange(0.05,1,0.05)}
sgd_model = GridSearchCV(sgd_pipeline, parameters, n_jobs = -1, scoring = 'roc_auc',verbose = True, cv = 10)
sgd_model

In [None]:
sgd_model.fit(X_train,y_train)