In [1]:
import pandas as pd 
import numpy as np
from numpy import array
import os
from pathlib import Path
from collections import namedtuple
import re

### Traversing the folder 

- check the json data
- check whether the start and end timestamps are distinct
- check whether there are 5 paragraphs

In [17]:
def extract_user_number(s):
    """
    Extracts the user number from a given path

    s: path (str or Path) that contains a folder or file name starting with
       "User" followed by digits, e.g. ".../User001_Group_Test_30-08-2019/..."
    return: the user number as an int (e.g. 1 for "User001")
    raises: ValueError if no "User<digits>" path component is found
    """
    # Look for the first path component that starts with "User<digits>" instead
    # of relying on a fixed directory depth, so the result does not depend on
    # where the dataset is stored. Any letters after the digits are ignored.
    # (The earlier re.sub(pattern, '', s, re.I) call passed the flag as the
    # positional `count` argument, so it was never case-insensitive.)
    match = re.search(r'(?:^|[\\/])User(\d+)', str(s), flags=re.IGNORECASE)
    if match is None:
        raise ValueError("No user number found in path: {}".format(s))
    return int(match.group(1))


In [3]:
# Root folder of the anonymised experiment recordings. Set the EEG_DATA_ROOT
# environment variable to point at a local copy of the dataset; when it is not
# set, the original lab-machine location is used.
p = Path(os.environ.get("EEG_DATA_ROOT", "/cs/home/ybk1/Dissertation/Experiment Anonymised Version"))
# p = Path("C://Users//User//OneDrive - University of St Andrews//Modules//CS5099//4. Data Documents//dataset//INSTRUMENTED DIGITAL AND PAPER READINGDATASET//Experiment Anonymised Version")

In [20]:
def get_all_datasets(p):
    """
    Recursively finds every raw EEG recording under the root folder

    A file is included when its name ends with "rawEEGData.csv" and does not
    contain "baseline". The resulting index is also written to
    "All EEG files.csv" in the current working directory.

    p: path of the root folder (pathlib.Path)
    return: DataFrame with one row per file and the columns
            name, path (resolved parent folder), size (bytes) and user
    """
    File = namedtuple('File', 'name path size')
    files = []
    # Let rglob do the pattern matching; sorting makes the row order
    # reproducible instead of depending on the file-system listing order.
    for item in sorted(p.rglob('*rawEEGData.csv')):
        if "baseline" in item.name:
            continue
        # stores the name, parent folder and size in a named tuple
        files.append(File(item.name, item.resolve().parent, item.stat().st_size))

    # Passing the column names explicitly keeps the "path" column present even
    # when no file was found, so the lookup below cannot raise a KeyError.
    df = pd.DataFrame(files, columns=File._fields)
    df['user'] = df["path"].apply(extract_user_number)
    df.to_csv("All EEG files.csv", index=False)

    return df

In [29]:
# Index every raw EEG recording under the root folder (this also writes "All EEG files.csv").
all_datasets = get_all_datasets(p)

In [31]:
# Count how many raw EEG files were found for each user.
no_tests_per_user = all_datasets['user'].value_counts()
no_tests_per_user

20    23
23    21
17    20
7     19
9     19
15    18
2     17
25    16
10    16
3     16
4     16
5     16
6     16
8     16
13    16
11    16
12    16
24    16
16    16
18    16
19    16
21    16
22    16
1     16
14    15
Name: user, dtype: int64

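From the value counts we can see that most users have 16 recordings, with two kinds of exception:
- users 20, 23, 17, 7, 9, 15 and 2 have more than 16 recordings, i.e. they contain duplicates
- user 14 has only 15 recordings, i.e. one is missing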


### Building the datasets using the `All EEG files.csv` index

### Loading the data 

In [13]:
# NOTE(review): later cells (check_json(pd_labels), pd.read_json(s_dir + s_json) and
# process_data(s_data)) still use s_dir, s_json, s_data and pd_labels. This cell and the
# two commented-out cells after it must be re-enabled for a fresh-kernel "Run All" to work.
# s_dir = "/cs/home/ybk1/Dissertation/Experiment Anonymised Version/User001_Group_Test_30-08-2019--10-07-00/User001_test1_BET_01_30-08-2019/User001_test1_BET_01_30-08-2019_recording0/"
# s_csv = "User001_test1_BET_01_30-08-2019_recording0_U1567156556_EEG_rawEEGData.csv"
# s_json = "annotations.json"

In [14]:
# s_data = pd.read_csv(s_dir + s_csv)

In [32]:
# pd_labels = pd.read_json(s_dir + s_json)

In [1]:
def check_json(labels, expected_paragraphs=5):
    """
    Method for checking the json to see how many paragraphs it contains
    and whether the timestamps are distinct

    labels: table-like object (e.g. the DataFrame returned by pd.read_json)
            whose length is the number of paragraphs and which has the
            columns "timeRangeStart" and "timeRangeEnd"
    expected_paragraphs: number of paragraphs a complete recording should have
    return: (no_para, are_timestamps_distinct) where the flag is True only
            when neither timestamp column contains a repeated value
    """
    # check whether there is the expected number of paragraphs
    no_para = len(labels)
    if no_para != expected_paragraphs:
        print("Expected {} paragraphs but found {} in the dataset".format(expected_paragraphs, no_para))
    else:
        print("There are {} paragraphs in the dataset".format(no_para))

    # check whether the timestamps are distinct; start optimistic and clear the
    # flag as soon as one of the columns turns out to contain a repeated value
    are_timestamps_distinct = True
    for col in ("timeRangeStart", "timeRangeEnd"):
        values = array(labels[col])
        if len(set(values)) == len(values):
            print(col + " has distinct values")
        else:
            print(col + " does not have distinct values")
            are_timestamps_distinct = False

    return no_para, are_timestamps_distinct
    

In [33]:
# pd_labels comes from the pd.read_json(...) cell above, which is currently commented out.
check_json(pd_labels)

There are 5 paragraphs in the dataset
timeRangeStart has distinct values
timeRangeEnd has distinct values


In [27]:
# Load the sample annotations and keep them as a plain NumPy array
# (one row per entry in annotations.json).
annotations_frame = pd.read_json(s_dir + s_json)
labels = array(annotations_frame)

print(len(labels))

5


In [18]:
# first create zero columns
# then iterate through the each row and check if it is there


In [19]:
def label(timestamp, annotation, labels):
    """
    Looks up the annotation value that applies at a given timestamp

    timestamp: time of a single sample
    annotation: which value to return ("effort", "attention", "interest" or "para")
    labels: iterable of rows whose last six entries are
            para, interest, attention, effort, start time, end time
    return: the requested value from the first row whose [start, end) range
            contains the timestamp, or None when no row matches
    """
    for entry in labels:
        start, end = entry[-2], entry[-1]
        scores = dict(
            zip(
                ("effort", "attention", "interest", "para"),
                (entry[-3], entry[-4], entry[-5], entry[-6]),
            )
        )
        # skip rows whose half-open [start, end) range does not cover the timestamp
        if not (timestamp >= start and timestamp < end):
            continue
        return scores[annotation]
    return None
        

In [20]:
def process_data(data, labels, out_path="pc.csv"):
    """
    Method for processing the dataset by adding the annotation columns
    "effort", "attention", "interest" and "para" to every sample

    Each value is looked up with label() from the sample's "Timestamp".
    Rows that contain a missing value after labelling are dropped, and the
    result is written to out_path.

    data: DataFrame of samples with a "Timestamp" column
    labels: annotation rows in the layout expected by label()
    out_path: CSV file the processed data is written to
    return: the labelled DataFrame without rows containing missing values
    """
    annotations = ["effort", "attention", "interest", "para"]
    # Work on a copy so the caller's DataFrame is left untouched and the
    # function gives the same result when the cell is re-run.
    data = data.copy()
    for ann in annotations:
        data[ann] = data["Timestamp"].apply(label, annotation=ann, labels=labels)
    print("Processed: {}".format(len(data)))
    print("Empty rows \n{}".format(data.isnull().sum()))
    data = data.dropna()
    data.to_csv(out_path)
    print(len(data))
    return data

In [25]:
def generate_datasets(processed, n_features=9, output_dir="."):
    """
    Writes one training CSV per target ("interest", "effort", "attention")

    Each file is named "EEG_<target>_dataset.csv" and contains the first
    n_features columns of the processed data followed by the target column.

    processed: labelled DataFrame that has the three target columns
    n_features: number of leading columns to use as input features
    output_dir: folder the CSV files are written to (defaults to the
                current working directory)
    """
    X = processed.iloc[:, :n_features]
    # `target` avoids shadowing the label() function used for annotation lookup
    for target in ('interest', 'effort', 'attention'):
        out_file = Path(output_dir) / ('EEG_' + target + '_dataset.csv')
        pd.concat([X, processed[target]], axis=1).to_csv(out_file)


In [26]:
# process_data() needs the annotation rows as its second argument.
processed = process_data(s_data, labels)
generate_datasets(processed)

Processed: 7955
Empty rows 
C1                 0
C2                 0
C3                 0
C4                 0
C5                 0
C6                 0
C7                 0
C8                 0
Timestamp          0
 AdjustedUnix      0
effort           411
attention        411
interest         411
para             411
dtype: int64
7544


In [None]:
def build_all_datasets(df):
    """
    Method for building all the training sets by applying annotations at relevant timestamps
    and split the training sets into attention, interest and effort

    At present it only loads each recording with its annotations and reports
    whether the annotation timestamps are distinct.

    df: DataFrame that includes all of the datasets; the first column holds
        the csv file name and the second the folder that contains it
    """
    ds_json = "annotations.json"  # json file that contains the labels and annotations for the data

    for row in df.itertuples(index=False):
        ds_csv = row[0]  # get the name of the csv file
        # Get the folder of the dataset. Path() accepts both Path objects and
        # the plain strings obtained when the index is re-loaded from CSV.
        ds_dir = Path(row[1])
        ds_data = pd.read_csv(ds_dir / ds_csv)  # loaded but not used yet
        ds_labels = pd.read_json(ds_dir / ds_json)
        no_para, are_timestamps_distinct = check_json(ds_labels)  # check the json file

        if not are_timestamps_distinct:
            print("Don't process")
        else:
            print("Process")
            
        
        