In [51]:
import pandas as pd 
import numpy as np
from numpy import array
import os
from pathlib import Path
from collections import namedtuple
import re
from sklearn.preprocessing import StandardScaler
import pickle

### Traversing the folder 

- check the json data
- check whether the start and end timestamps are distinct
- check whether there are 5 paragraphs

In [2]:
def extract_user_number(s):
    """
    Extracts the user number from a given path
    :s: path of the file (str or Path). The user folder (e.g. "User001_...") must be the
        "/"-separated component at index 6, and no earlier component may contain "_".
    :return: the user number as an int (e.g. 1 for ".../User001_Group_Test_...")
    """
    s = str(s).split("_")[0] # keep everything before the first underscore, e.g. ".../User001"
    s = s.split("/")[6] # split the path to isolate the user
    s = s.replace("User","") # remove the word "User"
    # re.I has to be given as `flags=`: the 4th positional argument of re.sub is `count`,
    # so passing re.I positionally ran a case-sensitive substitution limited to 2 matches.
    s = int(re.sub(r'[a-z]+', '', s, flags=re.I)) # remove alphabetical characters
    return s


In [3]:
def extract_test_number(s):
    """
    Extracts the test number from a given path
    :s: path of the file (str or Path). The test folder (e.g. "User001_test1_BET_...")
        must be the "/"-separated component at index 7.
    :return: the test number as an int (e.g. 1 for ".../User001_test1_BET_01_...")
    """
    s = str(s).split("/")[7] # isolate the test folder; str() lets Path objects through
    s = s.split("_")[1] # second "_"-separated token, e.g. "test1"
    # re.I has to be given as `flags=`: the 4th positional argument of re.sub is `count`,
    # so passing re.I positionally ran a case-sensitive substitution limited to 2 matches.
    s = int(re.sub(r'[a-z]+', '', s, flags=re.I)) # remove alphabetical characters
    return s

In [4]:
# Root folder of the experiment recordings.
# NOTE: extract_user_number / extract_test_number pick path components by fixed position
# (index 6 and 7 after splitting on "/"), which matches folders directly under this root;
# a root at a different depth (such as the alternative below) needs those indices adjusted.
p = Path("/cs/home/ybk1/Dissertation/Experiment Anonymised Version")
# p = Path("C://Users//User//OneDrive - University of St Andrews//Modules//CS5099//4. Data Documents//dataset//INSTRUMENTED DIGITAL AND PAPER READINGDATASET//Experiment Anonymised Version")

In [5]:
def get_all_datasets(p):
    """
    Loops through all the folders to find the raw EEG csv files used for training
    p: path of the root folder (pathlib.Path)
    :return: DataFrame with one row per file and the columns name, path, size, user.
             The same table is also written to "All EEG files.csv" in the working directory.
    """
    File = namedtuple('File', 'name path size')
    files = []
    for item in p.glob('**/*'): # loops through all the files in all the sub-directories
        # keep the raw EEG recordings but skip the baseline ones
        if item.match('*rawEEGData.csv') and "baseline" not in item.name:
            # stores the name, containing folder and size in a named tuple
            files.append(File(item.name, item.resolve().parent, item.stat().st_size))

    # Explicit columns keep the frame well-formed when nothing matched; without them an
    # empty result has no "path" column and the next line raises KeyError.
    df = pd.DataFrame(files, columns=File._fields)
    df['user'] = df["path"].apply(extract_user_number)
    df.to_csv("All EEG files.csv", index=False)

    return df

From the value counts we can see that: 
- user 20, 23, 17, 7, 9, 15, 2 - all have duplicates
- user 14 is one short


### Building the datasets using the alldatasets csv

### Loading the data 

In [6]:
def check_json(labels):
    """
    Method for checking the json to see how many paragraphs it contains
    and whether the timestamps are distinct
    :labels: annotations.json in a DataFrame
    :return: tuple (no_para, are_timestamps_distinct). are_timestamps_distinct is True only
             when both "timeRangeStart" and "timeRangeEnd" contain no repeated value;
             an empty json gives (0, False).
    """
    no_para = len(labels)
    if no_para == 0: # checks if the json is empty
        return no_para, False

    # check whether there are 5 paragraphs
    if no_para < 5:
        print ("There are less than 5 paragraphs in the dataset")
    elif no_para > 5:
        print("There are more than 5 paragraphs in the dataset")
    else:
        print("There are 5 paragraphs in the dataset")

    # Check whether the timestamps are distinct. Both columns are inspected: the earlier
    # version left the loop after the first column, so "timeRangeEnd" was never checked.
    are_timestamps_distinct = True
    for col in ("timeRangeStart", "timeRangeEnd"):
        if labels[col].is_unique:
            print(col + " has distinct values")
        else:
            print(col + " does not have distinct values")
            are_timestamps_distinct = False

    print(are_timestamps_distinct)
    return no_para, are_timestamps_distinct
    

In [7]:
def label(timestamp, annotation, labels):
    """
    Looks up the score that applies at a given timestamp

    :timestamp: timestamp of the EEG row being labelled
    :annotation: which value to return: "effort", "attention", "interest" or "para"
    :labels: annotations.json in array format; each row ends with
             para, interest, attention, effort, time start, time end (read from the right)
    :return: the requested value from the first row whose range [start, end) contains
             the timestamp, or None when no row matches
    """
    names = ("effort", "attention", "interest", "para")
    for entry in labels:
        start, end = entry[-2], entry[-1]
        # the four values sit just before the time range, in reverse order of `names`
        scores = dict(zip(names, (entry[-3], entry[-4], entry[-5], entry[-6])))
        if start <= timestamp < end: # start is inclusive, end is exclusive
            return scores[annotation]
    return None
        

In [8]:
def add_labels(data, labels):
    """
    Method for processing the dataset by creating columns for effort, attention, interest and paragraph
    :data: EEG dataset (DataFrame with a "Timestamp" column); it is not modified
    :labels: annotations.json in array format
    :return: tuple (labeled data, initial_rows, final_rows, dropped_rows). The labeled data
             is a new DataFrame without the rows that contain any missing value, which
             includes rows whose timestamp falls outside every annotated range.
    """
    # Work on a copy so the caller's DataFrame does not silently gain the label columns.
    labeled = data.copy()
    annotations = ["effort", "attention", "interest", "para"]
    for ann in annotations: # creates one new column per annotation, based on the timestamp range
        labeled[ann] = labeled["Timestamp"].apply(label, annotation=ann, labels=labels)

    initial_rows = len(labeled)
    labeled = labeled.dropna()
    final_rows = len(labeled)
    dropped_rows = initial_rows - final_rows
    print("Initial rows: {0}\nFinal rows: {1}\nDropped rows: {2}".format(initial_rows,
                                                                         final_rows,
                                                                         dropped_rows))
    return labeled, initial_rows, final_rows, dropped_rows

In [9]:
def export_datasets_to_csv(labeled_data, path):
    """
    Writes one csv per label (interest, effort, attention) into the given folder
    :labeled_data: annotated dataset with columns for interest, effort and attention
    :path: folder to save the files in, as a string

    Each file is named EEG_<label>_dataset.csv and holds the first nine columns of
    labeled_data followed by that label's column.
    """
    features = labeled_data.iloc[:, :9] # inputs i.e channels and timestamp
    for target in ('interest', 'effort', 'attention'):
        combined = pd.concat([features, labeled_data[target]], axis=1)
        combined.to_csv(path + "/" + 'EEG_' + target + '_dataset.csv', index=False)


In [10]:
def build_all_datasets(df):
    """
    Builds the annotated dataset for every recording listed in df
    :df: DataFrame listing the recordings; read by position as
         column 0 = csv name, column 1 = folder, column 3 = user number
    :return: DataFrame with one summary row per recording (path, user, test, no_para,
             are_timestamps_distinct, initial_rows, final_rows, dropped_rows),
             also written to "Trainingfiles.csv"

    For each recording the csv and its annotations.json are loaded and checked with
    check_json. When the timestamps are distinct the labels are applied and the result is
    saved as annotated_EEG.csv in the recording's folder; otherwise the recording is only
    listed in the summary with zero dropped rows.
    """
    annotations_name = "annotations.json" # json file that contains the labels and annotations for the data
    Summary = namedtuple('File', 'path user test no_para are_timestamps_distinct initial_rows final_rows dropped_rows')
    summaries = []

    for entry in array(df):
        folder = str(entry[1])
        csv_name = entry[0]
        print("Working on this csv: {}".format(csv_name))
        user = entry[3]
        test = extract_test_number(folder)
        eeg = pd.read_csv(folder + "/" + csv_name)
        annotations = pd.read_json(folder + "/" + annotations_name)
        no_para, are_timestamps_distinct = check_json(annotations)

        annotations = array(annotations)
        if are_timestamps_distinct:
            labeled, initial_rows, final_rows, dropped_rows = add_labels(eeg, annotations)
            # a single annotated file is written; export_datasets_to_csv is not used here
            labeled.to_csv(folder + "/" + "annotated_EEG.csv", index=False)
        else:
            print("Don't process")
            initial_rows = final_rows = len(eeg)
            dropped_rows = 0

        # every recording is summarised, whether or not it was processed
        summaries.append(Summary(folder, user, test, no_para, are_timestamps_distinct,
                                 initial_rows, final_rows, dropped_rows))

    training_files = pd.DataFrame(summaries)
    training_files.to_csv("Trainingfiles.csv", index=False)
    return training_files
        

In [11]:
def build_all_datasets_test():
    """
    Test method that runs the dataset-building steps on one hard-coded recording
    (User001, test1, recording0)
    :return: DataFrame with a single summary row (path, user, no_para,
             are_timestamps_distinct, initial_rows, final_rows, dropped_rows)

    When the timestamps are distinct, annotated_EEG.csv is written into the
    recording's folder; nothing else is saved.
    """
    folder = "/cs/home/ybk1/Dissertation/Experiment Anonymised Version/User001_Group_Test_30-08-2019--10-07-00/User001_test1_BET_01_30-08-2019/User001_test1_BET_01_30-08-2019_recording0"
    csv_name = "User001_test1_BET_01_30-08-2019_recording0_U1567156556_EEG_rawEEGData.csv"
    annotations_name = "annotations.json"

    Summary = namedtuple('File', 'path user no_para are_timestamps_distinct initial_rows final_rows dropped_rows')
    summaries = []

    user = 1 # the hard-coded recording belongs to user 1
    eeg = pd.read_csv(folder + "/" + csv_name)
    annotations = pd.read_json(folder + "/" + annotations_name)
    no_para, are_timestamps_distinct = check_json(annotations)

    annotations = array(annotations)
    if are_timestamps_distinct:
        labeled, initial_rows, final_rows, dropped_rows = add_labels(eeg, annotations)
        # a single annotated file is written; export_datasets_to_csv is not used here
        labeled.to_csv(folder + "/" + "annotated_EEG.csv", index=False)
    else:
        print("Don't process")
        initial_rows = final_rows = len(eeg)
        dropped_rows = 0

    summaries.append(Summary(folder, user, no_para, are_timestamps_distinct,
                             initial_rows, final_rows, dropped_rows))

    return pd.DataFrame(summaries)
        

**Run this block to rebuild the datasets**

In [12]:
# all_datasets = get_all_datasets(p)
# build_all_datasets(all_datasets)

In [13]:
# build_all_datasets_test()

### Extract viable datasets from the training files 

- remove the files that have zero final rows
- remove files that have empty JSONs
- remove files that only have 4 paragraphs
- remove files with indistinct timestamps

In [43]:
def clean_training_files(source_csv="Trainingfiles.csv", output_csv="clean_trainingfiles.csv",
                         excluded_users=(21, 16)):
    """
    Method for removing:
    - Files with zero final rows
    - Files with empty JSONs
    - Files with only 4 paragraphs
    - Files with indistinct timestamps
    - Files of the excluded users

    :source_csv: csv to read the training files from
    :output_csv: csv the remaining rows are written to (without the index)
    :excluded_users: user numbers to drop entirely; the default removes users 16 and 21
                     as they only have 2 and 1 test respectively
    :return: DataFrame of the remaining rows, keeping their original index labels
    """
    tf = pd.read_csv(source_csv)
    print("Initial length of training files: {}".format(len(tf)))

    # Build one combined mask on the unfiltered frame. Filtering step by step with masks
    # computed up front made pandas re-align each mask to an already shortened frame.
    unusable = (
        (tf['final_rows'] == 0) # files with zero final rows
        | (tf['no_para'] == 0) # files with empty JSONs
        | (tf['no_para'] == 4) # files with only four paragraphs
        | (tf['are_timestamps_distinct'] == False) # files with indistinct timestamps
        | tf['user'].isin(list(excluded_users))
    )
    tf = tf[~unusable]

    print("Final length of training files: {}".format(len(tf)))
    tf.to_csv(output_csv, index=False)
    return tf
    

### Sampling the data 

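In this part, each annotated test is cut into fixed-length samples with a sliding window: within every paragraph, a window of `sample_size` rows is advanced by `slider` rows at a time.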

In [44]:
# Filter Trainingfiles.csv down to the usable tests and save them as clean_trainingfiles.csv;
# the returned DataFrame is shown as the cell output.
clean_training_files()

Initial length of training files: 424
Final length of training files: 277




Unnamed: 0,path,user,test,no_para,are_timestamps_distinct,initial_rows,final_rows,dropped_rows
50,/cs/home/ybk1/Dissertation/Experiment Anonymis...,24,15,5,True,11291,7242,4049
51,/cs/home/ybk1/Dissertation/Experiment Anonymis...,24,13,5,True,11021,9556,1465
52,/cs/home/ybk1/Dissertation/Experiment Anonymis...,24,12,5,True,8709,7223,1486
53,/cs/home/ybk1/Dissertation/Experiment Anonymis...,24,14,5,True,9366,8011,1355
54,/cs/home/ybk1/Dissertation/Experiment Anonymis...,24,1,5,True,7188,5405,1783
...,...,...,...,...,...,...,...,...
419,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,14,5,True,16432,6542,9890
420,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,8,5,True,16963,5046,11917
421,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,10,5,True,9291,2721,6570
422,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,9,5,True,13667,3681,9986


In [48]:
# Reload the cleaned list of training files written by clean_training_files()
# (it is saved without its index, so the rows are renumbered from 0 here).
clean_tf = pd.read_csv("clean_trainingfiles.csv")

In [None]:
#         with open('filename.pickle', 'rb') as handle:
#             b = pickle.load(handle)
        

In [17]:
# Store the dataset path of each of user 1's tests in a dictionary keyed by test number.
# NOTE(review): this cell used `training_files` and `file` without any cell defining them,
# so it failed with NameError on a fresh kernel. They are recreated below from what
# build_all_datasets() writes (Trainingfiles.csv and annotated_EEG.csv) - confirm that
# these are the intended inputs.
training_files = pd.read_csv("Trainingfiles.csv")
file = "annotated_EEG.csv"

user_1 = array(training_files[training_files['user'] == 1]["path"])
user_1_tests = {}

for test in user_1:
    dataset = test + "/" + file
    user_1_tests[extract_test_number(test)] = dataset



In [54]:
def get_samples(df, slider=1, sample_size=120):
    """
    Method for creating samples within a dataset using a sliding window
    :df: test that is being sampled; needs the columns "Timestamp", " AdjustedUnix",
         "effort", "attention", "interest" and "para". After the first two are dropped,
         the first 8 remaining columns are used as inputs.
    :slider: the amount by which the window slides during sampling. The lower the number, the more samples.
    :sample_size: number of consecutive rows in each sample
    :return: dictionary with
             - 'inputs': array of shape (number of samples, sample_size, number of input columns)
             - 'effort', 'attention', 'interest': integer arrays with one label per sample
               (the maximum value of that column inside the window)
             All arrays are empty when no paragraph is long enough to be sampled.
    :raises ValueError: if slider or sample_size is smaller than 1
    """
    # A step or window of 0 would never advance / never run out of rows (endless loop).
    if slider < 1:
        raise ValueError("slider must be at least 1, got {}".format(slider))
    if sample_size < 1:
        raise ValueError("sample_size must be at least 1, got {}".format(sample_size))

    df = df.drop(["Timestamp", " AdjustedUnix"], axis=1) # remove unnecessary columns
    n_inputs = df.iloc[:, :8].shape[1]
    label_names = ["effort", "attention", "interest"]

    windows = []
    window_labels = {name: [] for name in label_names}

    # Windows never cross a paragraph boundary: each paragraph is sampled on its own.
    for _, para in df.groupby('para'):
        # Only paragraphs strictly longer than the sample size are sampled.
        if not len(para) > sample_size:
            print ("invalid")
            continue

        inputs = para.iloc[:, :8].to_numpy()
        scores = para[label_names].to_numpy()

        # Sliding window: every start position whose full window still fits in the paragraph.
        for start in range(0, len(para) - sample_size + 1, slider):
            stop = start + sample_size
            windows.append(inputs[start:stop])
            for name, value in zip(label_names, scores[start:stop].max(axis=0)):
                window_labels[name].append(int(value))

    inputs_and_labels = {}
    if windows:
        inputs_and_labels['inputs'] = np.stack(windows)
    else:
        # No paragraph was long enough: return correctly shaped empty arrays
        # instead of failing on a missing 'inputs' column.
        inputs_and_labels['inputs'] = np.empty((0, sample_size, n_inputs))
    for name in label_names:
        inputs_and_labels[name] = np.array(window_labels[name], dtype=np.int64)

    return inputs_and_labels
    
    
    
    

In [52]:
def generate_all_samples(clean_tf):
    """
    Creates and saves the samples for every test listed in clean_tf
    :clean_tf: clean training files; its 'path' column holds the folder of each test

    For each folder, annotated_EEG.csv is read, sampled with get_samples using its
    default arguments, and the resulting dictionary is pickled as
    sampled_annotated_EEG.pickle in the same folder.
    """
    annotated_name = "annotated_EEG.csv"
    pickle_name = "sampled_annotated_EEG.pickle"
    for folder in array(clean_tf['path']):
        samples = get_samples(pd.read_csv(folder + "/" + annotated_name))
        with open(folder + "/" + pickle_name, 'wb') as out_file:
            pickle.dump(samples, out_file, protocol=pickle.HIGHEST_PROTOCOL)
        print() # blank line between the output of consecutive tests

In [None]:
# Write sampled_annotated_EEG.pickle into the folder of every test listed in clean_tf.
generate_all_samples(clean_tf)