In [1]:
from pyspark import SparkContext, SparkConf
from scipy.io import loadmat
import numpy as np
from pyspark.sql.types import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import StringIndexer
import os
import time
from sklearn.metrics import roc_curve, auc
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
import json
import gcsfs



def generateCV(labeled_df, kfolds, seed=None):
    '''Split a DataFrame into k-fold cross-validation (train, test) pairs.

    The input is randomly split into ``kfolds`` roughly equal parts; fold
    ``i`` uses part ``i`` as the test set and the union of all remaining
    parts as the training set. The row count of every test part is printed.

    Args:
        labeled_df: DataFrame to split (needs ``randomSplit``, ``union``,
            ``limit`` and ``count``).
        kfolds: number of folds, must be >= 1.
        seed: optional seed forwarded to ``randomSplit`` so the split can be
            reproduced; ``None`` keeps the previous random behaviour.

    Returns:
        A list of ``kfolds`` tuples ``(train_df, test_df)``.

    Raises:
        ValueError: if ``kfolds`` is smaller than 1.
    '''
    if kfolds < 1:
        raise ValueError('kfolds must be a positive integer, got %r' % (kfolds,))

    # Equal float weights: randomSplit normalizes them, and a float literal
    # avoids the integer-division trap of ``1 / kfolds`` on Python 2.
    parts = labeled_df.randomSplit([1.0] * kfolds, seed)

    cv_groups = []
    for i, test_df in enumerate(parts):
        print(test_df.count())
        remaining = parts[:i] + parts[i + 1:]
        if remaining:
            # Chain the other parts together; no empty seed DataFrame (and
            # therefore no global sqlContext/sc) is required.
            train_df = remaining[0]
            for part in remaining[1:]:
                train_df = train_df.union(part)
        else:
            # Single fold: nothing left to train on, keep an empty frame
            # that still carries the input schema.
            train_df = labeled_df.limit(0)
        cv_groups.append((train_df, test_df))
    return cv_groups



def _sample_to_row(sample):
    '''Convert one (feature_array, label) pair into a Row with 'features' and 'labels'.'''
    sample_X, sample_y = sample
    return Row(features=Vectors.dense(sample_X), labels=sample_y)


def cross_validate(gs_dir, subjects, sc, fs, num_nodes, kfolds=4):
    '''Run k-fold cross-validation of the random-forest model for each subject.

    For every subject the ictal and interictal recordings are loaded,
    transformed into labelled feature vectors, split into ``kfolds`` folds
    with ``generateCV`` and a fresh model is fitted and evaluated per fold.
    Per-fold progress and the average ROC AUC are printed.

    Relies on ``dataloader``, ``process_raw_sample``, ``sample_transform``,
    ``set_model``, ``evaluateClassifer`` and ``customEvaluate`` being defined
    in the global namespace (presumably by the helper modules that the
    driver script loads -- confirm against those modules).

    Args:
        gs_dir: directory that contains ``SETTINGS.json``.
        subjects: iterable of subject directory names.
        sc: SparkContext used to read the settings and parallelize the data.
        fs: filesystem object handed to ``dataloader``. When ``None`` a
            ``gcsfs.GCSFileSystem`` is created from the settings' project name.
        num_nodes: cluster size; every RDD gets ``num_nodes * 10`` partitions.
        kfolds: number of cross-validation folds (default 4).

    Returns:
        ``(subjects_ave_aucs, subjects_cfn_mtxs)``: two lists with one entry
        per subject, each entry being the per-fold list of average ROC AUC
        values and of confusion-matrix arrays respectively.
    '''
    settings = json.loads(''.join(sc.textFile(gs_dir + '/SETTINGS.json').collect()))
    proj_dir = settings['gcp-bucket-project-dir']
    dataset_dir = settings['dataset-dir']
    # Honour the filesystem supplied by the caller; only build one when absent.
    if fs is None:
        fs = gcsfs.GCSFileSystem(project=settings['gcp-project-name'])

    partition_num = num_nodes * 10
    subjects_ave_aucs = []
    subjects_cfn_mtxs = []
    for subject in subjects:
        # Load the raw recordings and distribute them.
        loader = dataloader('/'.join([proj_dir, dataset_dir, subject]), fs)
        ictal_rdd = sc.parallelize(loader.load_ictal_data(), partition_num)
        interictal_rdd = sc.parallelize(loader.load_interictal_data(), partition_num)

        # Preprocess; the boolean flag tells process_raw_sample whether the
        # sample comes from the ictal set.
        transformed_ictal_rdd = ictal_rdd.map(
            lambda x: process_raw_sample(x, True, sample_transform)).cache()
        transformed_interictal_rdd = interictal_rdd.map(
            lambda x: process_raw_sample(x, False, sample_transform)).cache()

        ictal_df = transformed_ictal_rdd.map(_sample_to_row).toDF()
        interictal_df = transformed_interictal_rdd.map(_sample_to_row).toDF()
        labeled_df = ictal_df.union(interictal_df)
        labeled_df.cache()

        try:
            # Force evaluation once so the folds read from the cache.
            labeled_df.rdd.count()

            cfsn_mtxs = []
            ave_aucs = []
            for fold, (training_df, test_df) in enumerate(generateCV(labeled_df, kfolds)):
                rf = set_model(numTrees = 2500, labelCol = 'labels', seed = 1,
                               maxDepth = 5, subSampling = 0.95)
                model = rf.fit(training_df)
                result = model.transform(test_df)
                print('Fold '+ str(fold) + ':')
                metric = evaluateClassifer(result)
                cfsn_mtxs.append(metric.confusionMatrix().toArray())
                ave_auc = customEvaluate(result)
                print('Average Roc_auc: ' + str(ave_auc))
                ave_aucs.append(ave_auc)
        finally:
            # Release this subject's cached data before the next subject is
            # loaded; otherwise every subject stays pinned in executor memory.
            labeled_df.unpersist()
            transformed_ictal_rdd.unpersist()
            transformed_interictal_rdd.unpersist()

        subjects_ave_aucs.append(ave_aucs)
        subjects_cfn_mtxs.append(cfsn_mtxs)

    return (subjects_ave_aucs, subjects_cfn_mtxs)
        






# Driver: configure Spark, load the project helpers from the bucket and run
# the cross-validation for the selected subjects.
num_nodes = 4
subjects = ['Patient_8']
gs_dir = "gs://seizure_detection_data/notebooks/seizure_detection_spark_gcp"


appName = 'seizure_detection'
# The master is set once; the earlier 'local' value was immediately overridden.
conf = (SparkConf().setAppName(appName)
        .setMaster('local[*]')
        .set("spark.executor.instances", str(2 * num_nodes))
        .set('spark.executor.memory', '15G')
        .set('spark.driver.memory', '15G')
        .set('spark.driver.maxResultSize', '15G'))
# Stop a context left over from a previous run of this cell; on the first
# run the name does not exist yet, which is the only failure ignored here.
try:
    sc.stop()
except NameError:
    pass
sc = SparkContext(conf = conf)

sqlContext = SQLContext(sc)


# Project settings live next to the notebook as a JSON file.
json_str_rdd = sc.textFile(gs_dir + '/SETTINGS.json')
json_str = ''.join(json_str_rdd.collect())
settings = json.loads(json_str)

proj_name = settings['gcp-project-name']
proj_dir = settings['gcp-bucket-project-dir']

fs = gcsfs.GCSFileSystem(project=proj_name)
# NOTE(review): the helper modules are executed straight from the bucket, so
# anyone with write access to it can run arbitrary code here. Consider
# shipping them as a package or via sc.addPyFile instead.
for helper_module in ('spark_data_io.py', 'spark_transform.py',
                      'spark_processing.py', 'spark_evaluate.py'):
    # 'with' closes the remote handle even when exec raises.
    with fs.open(proj_dir + '/' + helper_module) as fopen:
        exec(fopen.read())



subjects_ave_aucs, subjects_cfn_mtxs = cross_validate(gs_dir, subjects, sc, fs, num_nodes)



485
459
497
449
Fold 0:
Accuracy: 0.9876288659793815
Average F1 score: 0.9877718919801443
Average Precision: 0.9877718919801443
Average Recall: 0.9876288659793815
Average Roc_auc: 0.9656227180527382
Fold 1:
Accuracy: 0.9803921568627451
Average F1 score: 0.9820100453974693
Average Precision: 0.9820100453974693
Average Recall: 0.9803921568627452
Average Roc_auc: 0.9330144183150865
Fold 2:
Accuracy: 0.9758551307847082
Average F1 score: 0.9768780460284113
Average Precision: 0.9768780460284113
Average Recall: 0.9758551307847083
Average Roc_auc: 0.9178987201248103
Fold 3:
Accuracy: 0.9933184855233853
Average F1 score: 0.9933987921877677
Average Precision: 0.9933987921877677
Average Recall: 0.9933184855233853
Average Roc_auc: 0.9897524680627019
