In [2]:
import pandas as pd
import numpy as np
from sklearn.decomposition import IncrementalPCA
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

from sklearn.grid_search import GridSearchCV
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from imblearn.over_sampling import SMOTE

from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.cross_validation import StratifiedShuffleSplit 

from sklearn.preprocessing import normalize
import os

In [3]:
# Read the file with the features here
base_dir_train = '/home/hareesh/IPython/Seizure_Pred/'
df1 = pd.read_csv(base_dir_train+"New_Features/train_3.csv")
df1.columns = ['file', 'label'] + range(1,232) 

# Drop rows which are empty and have no features extracted
df1.dropna(inplace=True)
df1.reset_index(inplace=True, drop=True)

In [4]:
# Remove files marked as SAFE = 0 from training set
safe_indicator_list = pd.read_csv("train_and_test_data_labels_safe.csv")
safe_files_list = list(safe_indicator_list[safe_indicator_list['safe']==1]['image'])
safe_index = [i for i in range(len(df1['file'])) if df1.ix[i, 'file'] in safe_files_list] 
df1 = df1.ix[safe_index,]
df1.reset_index(inplace=True, drop=True)

### Model Fitting
For cross-validation while model fitting, we train.csv to train and test.

In [5]:
# Randomly select 80 rows of pre-ictal and 200 rows of interictal for training
np.random.seed(5)
train_ix = list(np.random.choice(df1[df1['label']==1].index, size=80, replace=False))+list(np.random.choice(df1[df1['label']==0].index, size=1000, replace=False))

#The remaining rows will serve as the test set
test_ix = list(set(df1.index)-set(train_ix))

X = df1.ix[:,2:]
y = df1.ix[:,1]


In [6]:
# Normalizing the entire data to values between 0 and 1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
# Now apply the transformations to the data:
X = pd.DataFrame(scaler.transform(X))

#### Principal Component Analysis
Since the features extracted had high correlations, use PCA to extract linearly uncorrelated features in the directions of maximum variation

In [7]:
pca = IncrementalPCA(n_components=25)
pca.fit(X)
X = pd.DataFrame(pca.transform(X))

#### Synthetic Minority Oversampling Technique
SMOTE is a oversampling technique that creates fake or synthetic samples in the neighbourhood of the minority class data points. By doing this, we try to address the imbalance in the data (i.e., very few pre-ictals when compared to interictals)

In [8]:
# All columns except the last has the predictor X's
X_subset = X.ix[train_ix,:]

# The last column is the class label of 0s and 1s
y_subset = y[train_ix]

svm_args={'class_weight': 'balanced'}
smote = SMOTE(ratio='auto', kind='svm')
x_, y_ = smote.fit_sample(X_subset, y_subset)

X_train = pd.DataFrame(x_)
y_train = pd.Series(y_)

X_test =  X.ix[test_ix,:]
X_test.reset_index(inplace=True, drop=True)
y_test =  y.ix[test_ix]
y_test.reset_index(inplace=True, drop=True)


In [9]:
# Run classifier with cross-validation 
cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.1, random_state=11)

classifier = XGBClassifier(n_estimators = 500, gamma=0, subsample=0.8, max_depth=9,
                            objective='binary:logistic',learning_rate=0.1)

#### Grid-search Cross Validation
Specify the parameters that you want to tune

In [10]:
param_grid = { 
#     'n_estimators': [200,500],
#     'gamma':[0,0.1,1],
#     'subsample':[0.8,1.0],
#     'max_depth':[9],
#    'learning_rate':[0.1,0.01]
}

#### Prediction
XGB Boost classifier <br>
* Tune Parameters
* Extract most important features


In [11]:
accuracy = []
auc = []
percent_ones = []
# most_imp_100_features_xgb = set()
for (train, test) in cv:
    
    CV_xgb = GridSearchCV(estimator=classifier, param_grid=param_grid, cv= 3)
    
    probas_ = CV_xgb.fit(X_train.ix[train,:], y_train[train]).predict_proba(X_test)
    y_pred = CV_xgb.predict(X_test)
    print CV_xgb.best_params_, roc_auc_score(y_test, y_pred)        
    
#     # Feature Importances
#     importances = classifier.feature_importances_
#     indices = np.argsort(importances)[::-1]
#     most_imp_100_features_xgb = most_imp_100_features_gb.union(set(indices[:100]))

    accuracy.append(accuracy_score(y_test, y_pred))
    auc.append(roc_auc_score(y_test, y_pred))
    percent_ones.append(y_pred.mean())
    
print 'Average Accuracy:', np.mean(accuracy)
print 'Average AUC:', np.mean(auc)
print 'Average Percentage of 1s in prediction:', np.mean(percent_ones)

{} 0.738397790055
{} 0.736187845304
{} 0.722586935327
{} 0.71523399415
{} 0.747960675983
Average Accuracy: 0.941829393628
Average AUC: 0.732073448164
Average Percentage of 1s in prediction: 0.0565262076053


Execute the block below to obtain 3 output files with predicted labels (one for each patient)

In [14]:
#Set Patient Number
num = 3

# Read the file with the features here
base_dir_train = '/home/hareesh/IPython/Seizure_Pred/New_Features/'
df_train = pd.read_csv(base_dir_train+"train_"+str(num)+".csv", na_values='')
df_train.columns = ['file', 'label'] + range(1,232) 
df_train.dropna(inplace=True)
df_train.reset_index(inplace=True, drop=True)

# Remove files marked as SAFE = 0 from training set
safe_indicator_list = pd.read_csv(base_dir_train+"train_and_test_data_labels_safe.csv")
safe_files_list = list(safe_indicator_list[safe_indicator_list['safe']==1]['image'])
safe_index = [i for i in range(len(df_train['file'])) if df_train.ix[i, 'file'] in safe_files_list] 
df_train = df_train.ix[safe_index,]
df_train.reset_index(inplace=True, drop=True)

df_test = pd.read_csv(base_dir_train+"test_"+str(num)+"_new.csv", na_values='')
df_test.columns = ['file'] + range(1,232) 
df_test.dropna(inplace=True)
df_test.reset_index(inplace=True, drop=True)
X_test = df_test.ix[:,1:]

# All columns except the last has the predictor X's
X = pd.concat([df_train.ix[:,2:], df_test.ix[:,1:]],ignore_index=True)

#Normalizing the data
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X)
# Now apply the transformations to the data:
X = pd.DataFrame(scaler.transform(X))

# Do PCA for dimensionality reduction. Experiment with number of principal components.
# Comment it out if not required

pca = IncrementalPCA(n_components=25)
pca.fit(X)
X = pd.DataFrame(pca.transform(X))

# The last column is the class label of 0s and 1s
y_train = df_train.ix[:,1]

X_train = X.ix[range(df_train.shape[0]),:]
X_train.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)

X_test = X.ix[df_train.shape[0]:,:]
X_test.reset_index(inplace=True, drop=True)

svm_args={'class_weight': 'balanced'}
smote = SMOTE(ratio='auto', kind='svm')
x_, y_ = smote.fit_sample(X_train, y_train)

X_train = pd.DataFrame(x_)
y_train = pd.Series(y_)

classifier = XGBClassifier(n_estimators = 500, gamma=0, subsample=0.8, max_depth=9,
                             objective='binary:logistic',learning_rate=0.1)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

output = pd.DataFrame(y_pred, index=df_test['file'], columns=['Class'])
output.index.name = 'File'
output['Class'] = [p[1] for p in classifier.predict_proba(X_test)]
output.to_csv("test"+str(num)+"_output.csv")

Prepare the submission file as per the competition format: sample_submission.csv <br>
Keep test_1_output.csv, test_2_output.csv and test_2_output.csv in the sample folder Pred

In [15]:
# Combining results into 1 file according to the submission format
submission_dir = "/home/hareesh/IPython/Seizure_Pred/"
submission_file = pd.read_csv(submission_dir + "sample_submission.csv")

pred_dir = submission_dir + "Pred/"
test_files = os.listdir(pred_dir)
df = pd.DataFrame(columns=['File', 'Class'])
for file_name in test_files:
        df_1 = pd.read_csv(pred_dir + file_name)
        df = pd.concat([df,df_1])
        print df.shape
submission_file = pd.merge(submission_file, df, how='left', on = ['File'], suffixes=['_1',''])
submission_file['Class'] = submission_file['Class'].fillna(0)
submission_file.drop(['Class_1'], axis=1, inplace=True)
submission_file.to_csv("Submission.csv", index=False)

(205, 2)
(894, 2)
(1888, 2)
