In [1]:
from pyspark import SparkContext, SparkConf
from scipy.io import loadmat
import numpy as np
from pyspark.sql.types import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
import os
import time
from sklearn.metrics import roc_curve, auc
from pyspark.ml import Pipeline
import gcsfs
import json
from pyspark.sql.types import *
import pandas as pd


def process_test_sample(sample, with_latency, transforms): 
            name, sample_X = sample
            data = sample_X['data']
            transformed_data = transforms(data)
            return (name, transformed_data)




def predict_subjects(gs_dir, subjects, sc, num_nodes):
    
    json_str_rdd = sc.textFile(gs_dir + '/SETTINGS.json')
    json_str = ''.join(json_str_rdd.collect())
    settings = json.loads(json_str)
    
    proj_name = settings['gcp-project-name']
    proj_dir = settings['gcp-bucket-project-dir']
    dataset_dir = settings['dataset-dir']
    result_dir = settings["submission-dir"]
    model_dir = settings["data-cache-dir"]
    
    fs = gcsfs.GCSFileSystem(project = proj_name)
    
    results = []
    
    for subject in subjects:
        
        #Load data into rdd
        start_time = time.time()
        loader = dataloader('/'.join([proj_dir,dataset_dir,subject]), fs)
        test_raw, test_names = loader.load_test_data()

        partitionNum = num_nodes * 4
        test_raw_names = list(zip(test_names, test_raw))
        end_time = time.time()
        print('--- '+ subject + ": Test Data Loading %s seconds ---" % (end_time - start_time))
        #Data preprocessing and transformation
        start_time = time.time()
        test_rdd = sc.parallelize(test_raw_names, partitionNum)
        transformed_test_rdd = test_rdd.map(lambda x: process_test_sample(x, True, sample_transform)).cache()
        #transformed_interictal_rdd = interictal_rdd.map(lambda x: process_raw_sample(x, False, sample_transform)).cache()

        def rddToDf(x):
            '''Convert rdd to  and pass this function in Row() args'''
            name, sample_X = x
            d = {}
            d['clip'] = name
            d['features'] = Vectors.dense(sample_X)
            return d

        test_df = transformed_test_rdd.map(lambda x: Row(**rddToDf(x))).toDF()

        test_df.cache()
        

        end_time = time.time()
        
        print('--- '+ subject + ": Test Data Transformation %s seconds ---" % (end_time - start_time))

        #Predicting samples with saved models or retrain a new model for prediction if not exists
        
        model = load_model(gs_dir, subject)
        if not model:
            model = train_model(gs_dir, [subject], sc, fs, num_nodes)[0]
        start_time = time.time()
        result = model.transform(test_df)
        end_time = time.time()
        print('--- '+ subject + ": Making Predictions %s seconds ---" % (end_time - start_time))
        print('--- '+ subject + ": Saving Trained Model ---" )
        results.append(result)


    return results
        
    
def generate_pred_result(results, gs_dir, sc):
    
    def resultRddToDf(x_and_name):
        '''Convert rdd to  and pass this function in Row() args'''
        name, x = x_and_name
        d = {}
        d['clip'] = name
        d['seizure'] = float(x[1] + x[2])
        d['early'] = float(x[2])
        return d

    submission_df = results[0].select(['clip','probability']).rdd.map(lambda x: Row(**resultRddToDf(x))).toDF()
    
    for i in range(1, len(results)):
        result_prob_df = result_df.select(['clip','probability']).rdd.map(lambda x: Row(**resultRddToDf(x))).toDF()
        submission_df = submission_df.unionAll(result_prob_df)
    return submission_df
    

#Main Function for testing 
num_nodes = 4
appName = 'seizure_detection'
conf = SparkConf().setAppName(appName).setMaster('local')
conf = (conf.setMaster('local[*]')
        .set("spark.executor.instances", str(2 * num_nodes))
        .set('spark.executor.memory', '15G')
        .set('spark.driver.memory', '15G')
        .set('spark.driver.maxResultSize', '15G'))
try:
    sc.stop()
except:
    pass
sc = SparkContext(conf = conf)

sqlContext = SQLContext(sc)
    
gs_dir = "gs://seizure_detection_data/notebooks/seizure_detection_spark_gcp"
json_str_rdd = sc.textFile(gs_dir + '/SETTINGS.json')
json_str = ''.join(json_str_rdd.collect())
settings = json.loads(json_str)

proj_name = settings['gcp-project-name']
proj_dir = settings['gcp-bucket-project-dir']
result_dir = settings["submission-dir"]

fs = gcsfs.GCSFileSystem(project=proj_name)
fopen = fs.open(proj_dir + '/spark_data_io.py')
exec(fopen.read())
fopen.close()
fopen = fs.open(proj_dir + '/spark_transform.py')
exec(fopen.read())
fopen.close()
fopen = fs.open(proj_dir + '/spark_processing.py')
exec(fopen.read())
fopen.close()
subjects = ['Patient_8']
results = predict_subjects(gs_dir, subjects, sc, num_nodes)
if len(results) == 12:
    #If using full dataset from all 12 subject folders, generate a submission file
    submission_df = generate_pred_result(results, gs_dir, sc).write.csv()
    with fs.open('/'.join([proj_dir, result_dir,'submissions.csv']), 'w') as f:   
        submission_df = test_submission_df.toPandas()
        submission_df['clip'] = submission_df['clip'] + '.mat'
        submission_df.to_csv(f, index = False)



--- Patient_8: Test Data Loading 333.554297208786 seconds ---


ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1159, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 985, in send_command
    response = connection.send_command(command)
  File "/usr/lib/spark/python/lib/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1164, in send_command
    "Error while receiving", e, proto.ERROR_ON_RECEIVE)
py4j.protocol.Py4JNetworkError: Error while receiving


Py4JError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.readRDDFromFile

In [21]:

test_submission_df

+--------------------+--------------------+--------------------+
|                clip|               early|             seizure|
+--------------------+--------------------+--------------------+
|Patient_8_test_se...|0.004442998296358531|0.004747984205336...|
|Patient_8_test_se...|0.004870720456110...|0.005213705764286057|
|Patient_8_test_se...|0.005167207363362322|0.007772355933263...|
|Patient_8_test_se...|0.004029739998460...|0.004379898386960699|
|Patient_8_test_se...|0.003751685844273...|0.005054755800194789|
|Patient_8_test_se...|0.004402393622232576|0.004683204088209005|
|Patient_8_test_se...|0.010538033767644473|0.018815996261971044|
|Patient_8_test_se...| 0.05255601878144003| 0.15476116072202525|
|Patient_8_test_se...|0.004158253901051215|0.004437706937154208|
|Patient_8_test_se...|0.003893962021894996|0.004155510145013...|
|Patient_8_test_se...|0.003636524904709912|0.003900250517594...|
|Patient_8_test_se...|0.010251760647134739|0.017790337315275413|
|Patient_8_test_se...|0.0

In [30]:
result_dir = settings["submission-dir"]
test_submission_df.write.csv('/'.join([gs_dir, result_dir, 'submissions.csv']))

----------------------------------------
Exception happened during processing of request from ('127.0.0.1', 48492)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.6/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.6/socketserver.py", line 696, in __init__
    self.handle()
  File "/usr/lib/spark/python/pyspark/accumulators.py", line 265, in handle
    poll(accum_updates)
  File "/usr/lib/spark/python/pyspark/accumulators.py", line 238, in poll
    if func():
  File "/usr/lib/spark/python/pyspark/accumulators.py", line 242, in accum_updates
    num_updates = read_int(self.rfile)
  File "/usr/lib/spark/python/pyspark/seri

In [16]:
def resultRddToDf(x_and_name):
    '''Convert rdd to  and pass this function in Row() args'''

    name, x = x_and_name
    d = {}
    d['clip'] = name
    d['seizure'] = float(x[1] + x[2])
    d['early'] = float(x[2])
    return d
result_prob_df = result_df.select(['clip','probability']).rdd.map(lambda x: Row(**resultRddToDf(x))).toDF()
result_prob_df.show()



+--------------------+--------------------+--------------------+
|                clip|               early|             seizure|
+--------------------+--------------------+--------------------+
|Patient_8_test_se...|0.004442998296358531|0.004747984205336...|
|Patient_8_test_se...|0.004870720456110...|0.005213705764286057|
|Patient_8_test_se...|0.005167207363362322|0.007772355933263...|
|Patient_8_test_se...|0.004029739998460...|0.004379898386960699|
|Patient_8_test_se...|0.003751685844273...|0.005054755800194789|
|Patient_8_test_se...|0.004402393622232576|0.004683204088209005|
|Patient_8_test_se...|0.010538033767644473|0.018815996261971044|
|Patient_8_test_se...| 0.05255601878144003| 0.15476116072202525|
|Patient_8_test_se...|0.004158253901051215|0.004437706937154208|
|Patient_8_test_se...|0.003893962021894996|0.004155510145013...|
|Patient_8_test_se...|0.003636524904709912|0.003900250517594...|
|Patient_8_test_se...|0.010251760647134739|0.017790337315275413|
|Patient_8_test_se...|0.0

In [17]:
result_prob_df.schema

StructType(List(StructField(clip,StringType,true),StructField(early,DoubleType,true),StructField(seizure,DoubleType,true)))

In [15]:
name, x  = first
print(name)
print(x)

Patient_8_test_segment_1
[0.9952520157946632,0.0003049859089783696,0.004442998296358531]


In [1]:
subject = 'Patient_8'
model = load_model(gs_dir, subject)
if not model:
    model = train_model(gs_dir, [subject], sc, fs, num_nodes)[0]
start_time = time.time()
result = model.transform(test_df)
end_time = time.time()
print('--- '+ subject + ": Making Predictions ---" % (end_time - start_time))
print('--- '+ subject + ": Saving Trained Model ---" )
result.show()

NameError: name 'load_model' is not defined

In [2]:
from pyspark import SparkContext, SparkConf
from scipy.io import loadmat
import numpy as np
from pyspark.sql.types import Row
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, RandomForestClassificationModel
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql import SQLContext
from pyspark.ml.linalg import Vectors
import os
import time
from sklearn.metrics import roc_curve, auc
from pyspark.ml import Pipeline
import gcsfs
import json



def process_test_sample(sample, with_latency, transforms): 
            data = sample['data']
            transformed_data = transforms(data)
            X = transformed_data   


            return X 




def predict_subjects(gs_dir, subjects, sc, num_nodes):
    
    json_str_rdd = sc.textFile(gs_dir + '/SETTINGS.json')
    json_str = ''.join(json_str_rdd.collect())
    settings = json.loads(json_str)
    
    proj_name = settings['gcp-project-name']
    proj_dir = settings['gcp-bucket-project-dir']
    dataset_dir = settings['dataset-dir']
    result_dir = settings["submission-dir"]
    model_dir = settings["data-cache-dir"]
    
    fs = gcsfs.GCSFileSystem(project = proj_name)
    
    for subject in subjects:
        
        #Load data into rdd
        start_time = time.time()
        loader = dataloader('/'.join([proj_dir,dataset_dir,subject]), fs)
        test_raw, test_names = loader.load_test_data()
        print(test_names)
        print(test_raw)
        partitionNum = num_nodes * 4
        test_rdd = sc.parallelize(test_raw, partitionNum)
        
    return (test_rdd, test_names)

    

#Main Function for testing 
num_nodes = 2
appName = 'seizure_detection'
conf = SparkConf().setAppName(appName).setMaster('local')
conf = (conf.setMaster('local[*]')
        .set("spark.executor.instances", str(2 * num_nodes))
        .set('spark.executor.memory', '15G')
        .set('spark.driver.memory', '15G')
        .set('spark.driver.maxResultSize', '15G'))
try:
    sc.stop()
except:
    pass
sc = SparkContext(conf = conf)

sqlContext = SQLContext(sc)
    
gs_dir = "gs://seizure_detection_data/notebooks/seizure_detection_spark_gcp"
json_str_rdd = sc.textFile(gs_dir + '/SETTINGS.json')
json_str = ''.join(json_str_rdd.collect())
settings = json.loads(json_str)

proj_name = settings['gcp-project-name']
proj_dir = settings['gcp-bucket-project-dir']

fs = gcsfs.GCSFileSystem(project=proj_name)
fopen = fs.open(proj_dir + '/spark_data_io.py')
exec(fopen.read())
fopen.close()
fopen = fs.open(proj_dir + '/spark_transform.py')
exec(fopen.read())
fopen.close()
fopen = fs.open(proj_dir + '/spark_processing.py')
exec(fopen.read())
fopen.close()
    


test_rdd, test_names = predict_subjects(gs_dir, ['Patient_8'], sc, num_nodes)



['Patient_8_test_segment_1', 'Patient_8_test_segment_10', 'Patient_8_test_segment_100', 'Patient_8_test_segment_1000', 'Patient_8_test_segment_1001', 'Patient_8_test_segment_1002', 'Patient_8_test_segment_1003', 'Patient_8_test_segment_1004', 'Patient_8_test_segment_1005', 'Patient_8_test_segment_1006', 'Patient_8_test_segment_1007', 'Patient_8_test_segment_1008', 'Patient_8_test_segment_1009', 'Patient_8_test_segment_101', 'Patient_8_test_segment_1010', 'Patient_8_test_segment_1011', 'Patient_8_test_segment_1012', 'Patient_8_test_segment_1013', 'Patient_8_test_segment_1014', 'Patient_8_test_segment_1015', 'Patient_8_test_segment_1016', 'Patient_8_test_segment_1017', 'Patient_8_test_segment_1018', 'Patient_8_test_segment_1019', 'Patient_8_test_segment_102', 'Patient_8_test_segment_1020', 'Patient_8_test_segment_1021', 'Patient_8_test_segment_1022', 'Patient_8_test_segment_1023', 'Patient_8_test_segment_1024', 'Patient_8_test_segment_1025', 'Patient_8_test_segment_1026', 'Patient_8_test

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

