In [3]:
import pandas as pd 
import numpy as np
from numpy import array
import os
from pathlib import Path
from collections import namedtuple
import re

### Traversing the folder 

- check the json data
- check the whether the time start and time end are distinct
- check whether there are 5 paragraphs

In [2]:
def extract_user_number(s):
    """
    Extracts the user number from a given path
    :s: path of the file
    """
    s = str(s).split("_")[0] # extract the user from the path
    s = s.split("/")[6] # split the path to isolate the user
    s = s.replace("User","") # remove the word "User"
    s = int(re.sub(r'[a-z]+', '', s, re.I)) # remove alphabetical characters
    return s 


In [7]:
def extract_test_number(s):
    """
    Extracts the test number from a given path
    :s: path of the file
    """
    s = s.split("/")[7]
    s = s.split("_")[1]
    s = int(re.sub(r'[a-z]+', '', s, re.I))
    return s 

In [4]:
p = Path("/cs/home/ybk1/Dissertation/Experiment Anonymised Version")
# p = Path("C://Users//User//OneDrive - University of St Andrews//Modules//CS5099//4. Data Documents//dataset//INSTRUMENTED DIGITAL AND PAPER READINGDATASET//Experiment Anonymised Version")

In [5]:
def get_all_datasets(p):
    """
    Loops through all the folders to find all the datasets and json files for training
    p: path of the root folder
    """
    File = namedtuple('File', 'name path size')
    files = []
    for item in p.glob('**/*'): # loops thorough all the files in all the sub-directories
        if item.match('*rawEEGData.csv')  and "baseline" not in item.name:
            name = item.name
            path = Path.resolve(item).parent
            size = item.stat().st_size

            files.append(File(name,path, size )) # stores the name, path and size in named tuple
    
    df = pd.DataFrame(files)
    df['user'] = df["path"].apply(extract_user_number)
    df.to_csv("All EEG files.csv", index=False)
    
    return df

From the value counts we can see that: 
- user 20, 23, 17, 7, 9, 15, 2 - all have duplicates
- user 14 is one short


### Building the datasets using the alldatasets csv

### Loading the data 

In [6]:
def check_json(labels):
    """
    Method for checking the json to see how many paragraphs it contains
    and whether the timestamps are distinct
    :labels: annotations.json in a DataFrame
    """
    #check whether there are 5 paragraphs
    no_para = len(labels)
    are_timestamps_distinct = False
    if no_para == 0: # checks if the json is empty
        return no_para, are_timestamps_distinct
    else:
        if len(labels) != 5:
            print ("There are less than 5 paragraphs in the dataset")
        else:
            print("There are 5 paragraphs in the dataset")

        #check whether the timestamps are distincts
        cols = ["timeRangeStart", "timeRangeEnd"]
        for col in cols:
            array_length= len(array(labels[col]))
            set_length = len(set(array(labels[col])))
            if array_length == set_length:
                print(col + " has distinct values")
                are_timestamps_distinct = True
                break
            else: 
                print(col + " does not have distinct values")
                are_timestamps_distinct = False
                break

        print(are_timestamps_distinct)
        return no_para, are_timestamps_distinct 
    

In [7]:
def label(timestamp, annotation, labels):
    """
    Method for adding the labels to the dataframe
    
    :return: relevant score of attention/interest/effort within the correct time range
    :timestamp: timestamp at the current iteration
    :annotation: annotation at the current interation
    :labels: annotations.json in array format
    """
    for row in labels:
        time_start = row[-2]
        time_end = row[-1]
        ann_dict = {"effort": row[-3], "attention": row[-4], "interest": row[-5],"para": row[-6]}
        if timestamp >= time_start and timestamp < time_end: # checks if the timestamp is witin the start and end range
            return ann_dict[annotation] # returns the relevant score stored in the dictionary
        

In [8]:
def add_labels(data, labels):
    """
    Method for processing the dataset by creating columns for effort, attention, interest and paragraph
    :data: EEG dataset
    :labels: annotations.json in array format
    """
    annotations = ["effort", "attention", "interest", "para"]
    for ann in annotations: # loops through the labels, and creates new columns based on the values within the timestamp range
        data[ann] = data["Timestamp"].apply(label, annotation=ann, labels=labels)
    
    initial_rows = len(data)
    data = data.dropna()
    final_rows = len(data)
    dropped_rows = initial_rows - final_rows
    print("Initial rows: {0}\nFinal rows: {1}\nDropped rows: {2}".format(initial_rows, 
                                                                         final_rows, 
                                                                         dropped_rows))
    return data , initial_rows, final_rows, dropped_rows

In [9]:
def export_datasets_to_csv(labeled_data, path):
    """
    Exports the labeled data as three separate csv files for interest, effort and attention
    :labeled_data: annotated datasets with columns for interest, effort and attention
    :path: path to save the dataset
    """
    X = labeled_data.iloc[:,:9] # inputs i.e channels and timestamp
    labels = ['interest', 'effort', 'attention']
    for label in labels:
        dataset = pd.concat([X, labeled_data[label]], axis=1).to_csv(path +  "/" +'EEG_' + label + '_dataset.csv', index=False)   


In [18]:
def build_all_datasets(df):
    """
    Method for building all the training sets by applying annotations at relevant timestamps
    and splitting the training sets into attention, interest and effort
    :df: DataFrame that includes all of the datasets
    :ds: dataset
    """
    ds = array(df)
    json = "annotations.json" # json file that contains the labels and annotations for the data
    File = namedtuple('File', 'path user test no_para are_timestamps_distinct initial_rows final_rows dropped_rows')
    files = []
    
    for row in ds:
        ds_path = str(row[1]) # Get the path of the dataset
        ds_csv = row[0] # get the name of the csv file
        print("Working on this csv: {}".format(ds_csv))
        ds_user = row[3] # get the user number
        ds_test = extract_test_number(ds_path)
        ds_data = pd.read_csv(ds_path + "/" + ds_csv)
        ds_labels = pd.read_json(ds_path + "/" + json)
        no_para, are_timestamps_distinct = check_json(ds_labels) # check the json file
                
        ds_labels = array(ds_labels)
        if are_timestamps_distinct == False:
            print("Don't process")
            initial_rows, final_rows  = len(ds_data), len(ds_data)
            dropped_rows = 0
            files.append(File(ds_path, ds_user, ds_test, no_para,are_timestamps_distinct,
                             initial_rows, final_rows, dropped_rows))          
        else:
            labeled_data, initial_rows, final_rows, dropped_rows = add_labels(ds_data, ds_labels) # Gets the labeled data
#             export_datasets_to_csv(labeled_data, ds_path) # Exports the 3 datasets to the appropriate path
            labeled_data.to_csv(ds_path + "/" + "annotated_EEG.csv", index=False)
            files.append(File(ds_path, ds_user, ds_test, no_para, are_timestamps_distinct,
                         initial_rows, final_rows, dropped_rows))
            
    training_files = pd.DataFrame(files)
    training_files.to_csv("Trainingfiles.csv", index=False)
    return training_files
        

In [15]:
def build_all_datasets_test():
    """
    Test method for building a single group of training sets by applying annotations at relevant timestamps
    and splitting the training sets into attention, interest and effort
    :df: DataFrame that includes all of the datasets
    :ds: dataset
    """
    # User001_test1_BET_01_30-08-2019_recording0_U1567156556_EYETRACKER_cleanFixationData.csv
    # /cs/home/ybk1/Dissertation/Experiment Anonymised Version/User001_Group_Test_30-08-2019--10-07-00/User001_test1_BET_01_30-08-2019/User001_test1_BET_01_30-08-2019_recording0
    ds_path = "/cs/home/ybk1/Dissertation/Experiment Anonymised Version/User001_Group_Test_30-08-2019--10-07-00/User001_test1_BET_01_30-08-2019/User001_test1_BET_01_30-08-2019_recording0"
    ds_csv = "User001_test1_BET_01_30-08-2019_recording0_U1567156556_EEG_rawEEGData.csv"
    json = "annotations.json"
    
    File = namedtuple('File', 'path user no_para are_timestamps_distinct initial_rows final_rows dropped_rows')
    files = []
        
    ds_user = 1 # get the user number
    ds_data = pd.read_csv(ds_path + "/" + ds_csv)
    ds_labels = pd.read_json(ds_path + "/" + json)
    no_para, are_timestamps_distinct = check_json(ds_labels) # check the json file


    ds_labels = array(ds_labels)
    if are_timestamps_distinct == False:
        print("Don't process")
        initial_rows, final_rows  = len(ds_data), len(ds_data)
        dropped_rows = 0
        files.append(File(ds_path, ds_user, no_para,are_timestamps_distinct,
                         initial_rows, final_rows, dropped_rows))
    else:
        labeled_data, initial_rows, final_rows, dropped_rows = add_labels(ds_data, ds_labels) # Gets the labeled data
#         export_datasets_to_csv(labeled_data, ds_path) # Exports the 3 datasets to the appropriate path
        labeled_data.to_csv(ds_path + "/" + "annotated_EEG.csv", index=False)
        files.append(File(ds_path, ds_user, no_para, are_timestamps_distinct,
                         initial_rows, final_rows, dropped_rows))
        
    training_files = pd.DataFrame(files)
    return training_files
        

**Run this block to rebuild the datasets**

In [19]:
# all_datasets = get_all_datasets(p)
# build_all_datasets(all_datasets)

Working on this csv: User004c_test8_IET_08_02-09-2019_recording0_U1567417804_EEG_rawEEGData.csv
There are 5 paragraphs in the dataset
timeRangeStart has distinct values
True
Initial rows: 9996
Final rows: 0
Dropped rows: 9996
Working on this csv: User004c_test9_BET_09_02-09-2019_recording0_U1567418029_EEG_rawEEGData.csv
There are 5 paragraphs in the dataset
timeRangeStart has distinct values
True
Initial rows: 13153
Final rows: 0
Dropped rows: 13153
Working on this csv: User004d_test12_IES_12_02-09-2019_recording0_U1567419281_EEG_rawEEGData.csv
There are 5 paragraphs in the dataset
timeRangeStart has distinct values
True
Initial rows: 13549
Final rows: 0
Dropped rows: 13549
Working on this csv: User004d_test14_IDS_14_02-09-2019_recording0_U1567419722_EEG_rawEEGData.csv
There are 5 paragraphs in the dataset
timeRangeStart has distinct values
True
Initial rows: 12280
Final rows: 0
Dropped rows: 12280
Working on this csv: User004b_test4_IES_04_02-09-2019_recording0_U1567416685_EEG_rawEEGD

Unnamed: 0,path,user,test,no_para,are_timestamps_distinct,initial_rows,final_rows,dropped_rows
0,/cs/home/ybk1/Dissertation/Experiment Anonymis...,4,8,5,True,9996,0,9996
1,/cs/home/ybk1/Dissertation/Experiment Anonymis...,4,9,5,True,13153,0,13153
2,/cs/home/ybk1/Dissertation/Experiment Anonymis...,4,12,5,True,13549,0,13549
3,/cs/home/ybk1/Dissertation/Experiment Anonymis...,4,14,5,True,12280,0,12280
4,/cs/home/ybk1/Dissertation/Experiment Anonymis...,4,4,5,True,14474,0,14474
...,...,...,...,...,...,...,...,...
419,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,14,5,True,16432,6542,9890
420,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,8,5,True,16963,5046,11917
421,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,10,5,True,9291,2721,6570
422,/cs/home/ybk1/Dissertation/Experiment Anonymis...,11,9,5,True,13667,3681,9986


In [16]:
# build_all_datasets_test()

There are 5 paragraphs in the dataset
timeRangeStart has distinct values
True
Initial rows: 7955
Final rows: 7544
Dropped rows: 411


Unnamed: 0,path,user,no_para,are_timestamps_distinct,initial_rows,final_rows,dropped_rows
0,/cs/home/ybk1/Dissertation/Experiment Anonymis...,1,5,True,7955,7544,411


### Extract viable datasets from the training files 

- remove the files that have zero final rows
- remove files that have empty JSON's
- remove files that only have 4 paragraphs
- remove files with indistince timestamps

In [13]:
def clean_training_files():
    """
    Method for removing:
    - Files with zero final rows
    - remove files with empty Json's
    - Files with only 4 paragraphs
    - Files with indistinct timestamp
    """
    tf = pd.read_csv("Trainingfiles.csv")
    print("Initial length of training files: {}".format(len(tf)))
    
    zero_final = tf['final_rows'] == 0 # remove the files with zero final row
    empty_json = tf['no_para'] == 0 # remove files with empty JSON's
    four_para = tf['no_para'] == 4 # remove files with only four paragraphs
    indistinct_timestamp = tf['are_timestamps_distinct'] == False # remove files with indistinct timestamps
    files_rm = [zero_final, empty_json, four_para, indistinct_timestamp] # store them all in a list
    for file_rm in files_rm:
        tf = tf.drop(tf[file_rm].index) # drop files sequentially
    
    print("Final length of training files: {}".format(len(tf)))
    tf.to_csv("clean_trainingfiles.csv", index=False)
    return tf
    

### Sampling the data 

In this part, the data will be sampled by created at 

In [4]:
training_files = pd.read_csv("clean_trainingfiles.csv")

In [5]:
training_files['user'].value_counts()

8     16
23    16
22    16
2     16
20    16
19    16
18    16
3     16
6     16
24    16
11    16
7     16
9     16
25    15
1     15
10    15
17    15
12     9
16     2
21     1
Name: user, dtype: int64

In [8]:
#store all of user 1's test in a dictionary
user_1 = array(training_files[training_files['user'] == 1]["path"])
user_1_tests = {}
file = "annotated_EEG.csv"
for test in user_1:
    dataset = test + "/" + file
    user_1_tests[extract_test_number(test)] = dataset



In [10]:
#Loop through the dictionary of tests and then sample the tests

for key in user_1_tests: 
    test = pd.read_csv(user_1_tests[key]).drop(columns=["Timestamp", " AdjustedUnix"])
    sampled_test = sample(test)
    

TypeError: sample() takes 0 positional arguments but 1 was given

In [20]:
def sample():
    """
    Method for creating samples within a dataset
    :df: test that is being sampled
    """
    df = pd.read_csv(user_1_tests[1]).drop(["Timestamp", " AdjustedUnix"], axis=1) #remove unnecessary columns
    sampled_test = []
    sample_size = 60
    
    gb = gb = df.groupby('para')    
    gb = [gb.get_group(x) for x in gb.groups]
    
    para_0 = gb[0]
    
    # check the length of para
    if not len(para_0) > sample_size:
        print ("invalid")
        return -1
    
    slider = 10
    new_sample_length = len(para_0[0:sample_size])
    print(new_sample_length)
    counter = 0
    while  new_sample_length >= sample_size:
        new_sample = para_0[counter : counter + sample_size]
        new_sample_length = len(new_sample)
        sampled_test.append(new_sample)
        counter += slider
        print(counter)
    
    print(len(sampled_test))
    

#     print(gb[0].iloc[:,:8])
    
    
    return sampled_test
    
    
    
    

In [21]:
sample()

60
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
105


[               C1            C2             C3             C4             C5  \
 0   172495.625000 -16886.330078  200873.203125  188091.968750  179837.921875   
 1   172504.875000 -16903.593750  200863.953125  188080.625000  179839.968750   
 2   172504.875000 -16900.970703  200869.828125  188084.484375  179827.421875   
 3   172494.234375 -16878.892578  200887.421875  188101.031250  179807.015625   
 4   172478.031250 -16841.746094  200913.218750  188126.875000  179780.843750   
 5   172463.390625 -16800.787109  200941.687500  188154.437500  179762.281250   
 6   172444.453125 -16778.660156  200952.359375  188168.171875  179743.453125   
 7   172434.968750 -16773.988281  200955.218750  188171.359375  179739.062500   
 8   172439.406250 -16799.785156  200936.296875  188153.812500  179749.406250   
 9   172452.375000 -16844.035156  200904.875000  188124.828125  179776.828125   
 10  172455.046875 -16884.185547  200874.921875  188095.734375  179788.562500   
 11  172460.234375 -16909.31