In [1]:
from sktime.classification.interval_based import TimeSeriesForestClassifier
from sktime.datasets import load_arrow_head
from sktime.datasets import load_basic_motions
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import seaborn as sns
import os

In [2]:
all_files = os.listdir('actigraphy/')
# Might need to change these numbers 
N_train = 100
N_test = 15
# Reading all the files
train_data = [pd.read_csv('actigraphy/'+i) for i in all_files[:N_train]]
test_data = [pd.read_csv('actigraphy/'+i) for i in all_files[N_train:N_train+N_test]]

In [3]:
# def sleep_efficiency(df_raw):
#     df = df_raw.drop(['pid', 'sawa2'], axis = 1)
#     df['interval_num'] = df.interval.replace({'ACTIVE': 3, 'REST': 1, 'REST-S': 0}) # so the differences are unique
#     epochs_in_each_interval = (df.loc[df.interval_num.diff() != 0, :] # epochs where state changed
#                                .line
#                                .diff()[1:]) # number of epochs in each state, starting from active

#     epochs_in_awake = epochs_in_each_interval[0::3].reset_index(drop = True)
#     epochs_in_bed_awake = epochs_in_each_interval[1::3].reset_index(drop = True)
#     epochs_asleep = epochs_in_each_interval[2::3].reset_index(drop = True)
#     sleep_eff = epochs_asleep/(epochs_asleep + epochs_in_bed_awake)
#     return sleep_eff

In [65]:
def data_preproc(raw_df):
    '''Takes in the raw df and outputs df ready to put into the classifier'''
    
    df = raw_df.drop(['pid', 'sawa2'], axis = 1)
    
    df['interval_num'] = df.interval.replace({'ACTIVE': 3, 'REST': 1, 'REST-S': 0}) # so the differences are unique
    
    epochs_in_each_interval = (df.loc[df.interval_num.diff() != 0, :] # epochs where state changed
                               .line
                               .diff()) # number of epochs in each state, starting from active
    
    # interval_epochs eventually contains 1) indicies where state changes, 2) what interval starts at that epoch,  
    # 3) num of epochs in the state before it and 4) the num of epochs in that state itself
    interval_epochs = (df.iloc[epochs_in_each_interval.index,:].interval_num.to_frame()
                       .join(epochs_in_each_interval).reset_index())
    interval_epochs.columns = ['index', 'interval_num', 'num_epochs_before_interval']
    interval_epochs['num_epochs_in_interval'] = interval_epochs.num_epochs_before_interval[1:].reset_index(drop=True)
    
    # indicies where active (3) starts or sleeping (0) starts
    active_sleep_idx = list(interval_epochs.loc[(interval_epochs.interval_num == 0) | 
                                                (interval_epochs.interval_num == 3)]['index'])
        
    # indicies where active starts
    active_idx = interval_epochs.loc[interval_epochs.interval_num == 3].index
    
    sleep_eff = [] # will contain sleep efficiency for the different times that subject sleeps
    num_epochs_in_interval = interval_epochs.num_epochs_in_interval
    
    for i in range(len(active_idx) - 1):
        # when subject rests before sleep
        if active_idx[i] + 3 == active_idx[i+1]:
            sleep_eff.append(num_epochs_in_interval[active_idx[i]+2]/
                             (num_epochs_in_interval[active_idx[i]+1]+
                              num_epochs_in_interval[active_idx[i]+2]))
        # when subject goes to sleep directly
        else:
            sleep_eff.append(1)
    
    # indicies where light exposure occurs before the subject sleeps (active + rest)
    awake_light_idx = [tuple(active_sleep_idx[2*i:2*(i+1)]) for i in range(len(active_sleep_idx)//2)]
    
    # series with series of light (this is the format that sktime dfs take)
    awake_whitelight_series = pd.Series([df.whitelight.iloc[x:y].fillna(0).iloc[-240:].reset_index(drop=True) 
                                         for x,y in awake_light_idx]) # light for last 2 hours before sleep only
    awake_bluelight_series = pd.Series([df.bluelight.iloc[x:y].fillna(0).iloc[-240:].reset_index(drop=True) 
                                        for x,y in awake_light_idx])
    awake_greenlight_series = pd.Series([df.greenlight.iloc[x:y].fillna(0).iloc[-240:].reset_index(drop=True) 
                                         for x,y in awake_light_idx])
    awake_redlight_series = pd.Series([df.redlight.iloc[x:y].fillna(0).iloc[-240:].reset_index(drop=True) 
                                       for x,y in awake_light_idx])
    
    # each entry in column dim_xxxx is a series of light
    final_df = pd.DataFrame({"dim_white": awake_whitelight_series, 
                             "dim_blue": awake_bluelight_series,
                             "dim_green": awake_greenlight_series,
                             "dim_red": awake_redlight_series,
                             "sleep_eff": sleep_eff})
    
    # threshold chosen using idea that good sleep efficiency is when one sleeps in ~24 mins
    # for a sleep of 8 hrs (ratio -- 24mins/8hrs = 3mins/1hr)
    final_df["sleep_eff_cat"] = (final_df.sleep_eff >= 0.95).replace({True: "Good", False: "Bad"})
    
    return final_df

In [66]:
def concat_proc_data(raw_data_lst):
    '''Takes in list of raw dfs, preproccesses them and concats them.'''
    concat_df = pd.DataFrame(columns = ["dim_white","dim_blue","dim_green","dim_red","sleep_eff","sleep_eff_cat"])
    for df in raw_data_lst:
        try:
            proc_data = data_preproc(df)
        except:
            continue 
        else:
            concat_df = pd.concat([concat_df, proc_data])
    concat_df = concat_df.reset_index(drop = True)
    concat_df = concat_df.loc[concat_df.dim_white.apply(lambda x: len(x)) >= 240].reset_index(drop = True)
    return concat_df

In [67]:
def split_X_y(df):
    '''Split X features and y outcome.'''
    return (df.iloc[:, :4], df.iloc[:, 5])

In [68]:
train_preproc = concat_proc_data(train_data)
test_preproc = concat_proc_data(test_data)

In [63]:
train_preproc_X, train_preproc_y = split_X_y(train_preproc)
test_preproc_X, test_preproc_y = split_X_y(test_preproc)

In [57]:
from sktime.classification.kernel_based import RocketClassifier

In [64]:
rocket = RocketClassifier()
rocket.fit(train_preproc_X, train_preproc_y)
y_pred = rocket.predict(test_preproc_X)
accuracy_score(test_preproc_y, y_pred) # I think recall is more relevant, but that is 0. Oh well, baseline classifier.

0.7837837837837838

In [None]:
thresh = .95 # threshold for good sleep efficiency
concat_df.sleep_eff_cat = (concat_df.sleep_eff >= thresh).replace({True: "Good", False: "Bad"})
concat_df.sleep_eff_cat.value_counts(normalize = True)

# Rough work

In [253]:
X, y = load_arrow_head()
# X_train, X_test, y_train, y_test = train_test_split(X, y)
# classifier = TimeSeriesForestClassifier()
# classifier.fit(X_train, y_train)
# y_pred = classifier.predict(X_test)
# accuracy_score(y_test, y_pred)

In [254]:
df1 = load_arrow_head()
df2 = load_basic_motions()

In [276]:
final_df#.dim_0

Unnamed: 0,dim_0,sleep_eff
0,0 44.34 1 71.93 2 61.29 3 ...,0.789474
1,0 1.01 1 1.02 2 1.18 3 1.5...,0.922794
2,0 6.29 1 4.76 2 2.05 3 ...,0.998336
3,0 2.83 1 3.49 2 87.61 3...,0.989088
4,0 27.25 1 0.55 2 0.51 3 ...,0.911921
5,0 149.01 1 123.26 2 346.73 3...,0.997735
6,0 0.51 1 0.48 2 22.50 3 ...,0.996491
7,0 0.51 1 0.51 2 12.93 3 ...,0.992695
8,0 0.51 1 0.51 2 0.51 3 ...,0.982264
9,0 7.45 1 6.75 2 8.77 3 ...,0.889397


In [271]:
df1#.iloc[0,0]

Unnamed: 0,dim_0,class_val
0,0 -1.9630 1 -1.9578 2 -1.9561 3 ...,0
1,0 -1.7746 1 -1.7740 2 -1.7766 3 ...,1
2,0 -1.8660 1 -1.8420 2 -1.8350 3 ...,2
3,0 -2.0738 1 -2.0733 2 -2.0446 3 ...,0
4,0 -1.7463 1 -1.7413 2 -1.7227 3 ...,1
...,...,...
170,0 -1.6251 1 -1.6230 2 -1.6261 3 ...,2
171,0 -1.6578 1 -1.6647 2 -1.6326 3 ...,2
172,0 -1.6033 1 -1.5874 2 -1.5774 3 ...,2
173,0 -1.7390 1 -1.7415 2 -1.7329 3 ...,2


In [52]:
df2

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,class_val
0,0 0.079106 1 0.079106 2 -0.903497 3...,0 0.394032 1 0.394032 2 -3.666397 3...,0 0.551444 1 0.551444 2 -0.282844 3...,0 0.351565 1 0.351565 2 -0.095881 3...,0 0.023970 1 0.023970 2 -0.319605 3...,0 0.633883 1 0.633883 2 0.972131 3...,standing
1,0 0.377751 1 0.377751 2 2.952965 3...,0 -0.610850 1 -0.610850 2 0.970717 3...,0 -0.147376 1 -0.147376 2 -5.962515 3...,0 -0.103872 1 -0.103872 2 -7.593275 3...,0 -0.109198 1 -0.109198 2 -0.697804 3...,0 -0.037287 1 -0.037287 2 -2.865789 3...,standing
2,0 -0.813905 1 -0.813905 2 -0.424628 3...,0 0.825666 1 0.825666 2 -1.305033 3...,0 0.032712 1 0.032712 2 0.826170 3...,0 0.021307 1 0.021307 2 -0.372872 3...,0 0.122515 1 0.122515 2 -0.045277 3...,0 0.775041 1 0.775041 2 0.383526 3...,standing
3,0 0.289855 1 0.289855 2 -0.669185 3...,0 0.284130 1 0.284130 2 -0.210466 3...,0 0.213680 1 0.213680 2 0.252267 3...,0 -0.314278 1 -0.314278 2 0.018644 3...,0 0.074574 1 0.074574 2 0.007990 3...,0 -0.079901 1 -0.079901 2 0.237040 3...,standing
4,0 -0.123238 1 -0.123238 2 -0.249547 3...,0 0.379341 1 0.379341 2 0.541501 3...,0 -0.286006 1 -0.286006 2 0.208420 3...,0 -0.098545 1 -0.098545 2 -0.023970 3...,0 0.058594 1 0.058594 2 0.175783 3...,0 -0.074574 1 -0.074574 2 0.114525 3...,standing
...,...,...,...,...,...,...,...
35,0 -0.040961 1 -0.040961 2 0.338414 3...,0 -0.971100 1 -0.971100 2 -3.420216 3...,0 0.203560 1 0.203560 2 -2.053446 3...,0 0.061258 1 0.061258 2 0.250357 3...,0 -0.047941 1 -0.047941 2 -0.639209 3...,0 0.961478 1 0.961478 2 -0.298298 3...,badminton
36,0 -1.801504 1 -1.801504 2 -0.480725 3...,0 2.344990 1 2.344990 2 -0.994385 3...,0 0.281253 1 0.281253 2 0.378807 3...,0 0.716447 1 0.716447 2 -0.870923 3...,0 0.162466 1 0.162466 2 0.095881 3...,0 0.921527 1 0.921527 2 -0.474080 3...,badminton
37,0 -0.046089 1 -0.283051 2 -0.587748 3...,0 -0.738026 1 -0.314572 2 3.388108 3...,0 0.179667 1 -0.724257 2 -0.223563 3...,0 0.364882 1 -1.163894 2 -2.543521 3...,0 -0.237040 1 -0.101208 2 0.402169 3...,0 0.386189 1 -0.165129 2 -0.897557 3...,badminton
38,0 -2.178746 1 -2.178746 2 -0.448056 3...,0 -0.385371 1 -0.385371 2 -2.08943...,0 -0.805837 1 -0.805837 2 1.04617...,0 -0.039951 1 -0.039951 2 1.946925 3...,0 0.484734 1 0.484734 2 -0.524684 3...,0 1.054696 1 1.054696 2 2.436986 3...,badminton


In [40]:
df

Unnamed: 0,dim_0,class_val
0,0 -1.9630 1 -1.9578 2 -1.9561 3 ...,0
1,0 -1.7746 1 -1.7740 2 -1.7766 3 ...,1
2,0 -1.8660 1 -1.8420 2 -1.8350 3 ...,2
3,0 -2.0738 1 -2.0733 2 -2.0446 3 ...,0
4,0 -1.7463 1 -1.7413 2 -1.7227 3 ...,1
...,...,...
170,0 -1.6251 1 -1.6230 2 -1.6261 3 ...,2
171,0 -1.6578 1 -1.6647 2 -1.6326 3 ...,2
172,0 -1.6033 1 -1.5874 2 -1.5774 3 ...,2
173,0 -1.7390 1 -1.7415 2 -1.7329 3 ...,2


In [41]:
type(df.iloc[0, 0])

pandas.core.series.Series