In [1]:
import pandas as pd 
import numpy as np 

In [122]:
#load in the 72-hour data with the rest/activity indicators 
day_night_data = pd.read_csv("Sleep_Mortality_Code/Data/hr_act_avgs.csv")

In [123]:
day_night_data.rename(columns={'Unnamed: 0': 'original_index'}, inplace=True)

In [259]:
len(day_night_data["su_id"].unique())

689

In [257]:
def process_data(df):
    unique_su_ids = df['su_id'].unique()
    final_df = pd.DataFrame(columns=['su-id', 'activity-average', 'day/night'])

    
    for su_id in unique_su_ids:
        temp_df = process_su_id(df, su_id)
        final_df = pd.concat([final_df, temp_df], ignore_index=True)
        
    return final_df

In [262]:
def process_su_id(df, su_id):
    subdata = df[df.su_id == su_id]
    subdata = subdata.reset_index()
    subdata['original_index'] = np.arange(len(subdata))
    subdata['index'] = np.arange(len(subdata))
    new_df = pd.DataFrame(columns=['su-id', 'activity-average', 'day/night'])
    
    try:
        nights = get_nights(subdata)
        days = get_days(subdata, nights)

        night = 1
        for window in nights:
            if (window[1] - window[0] + 1 == 0):
                continue
            activity_avg = get_average(subdata, window)
            new_df.loc[len(new_df.index)] = [su_id, activity_avg, 'night' + str(night)]

            night += 1

        day = 1
        for window in days:
            if (window[1] - window[0] + 1 == 0):
                continue
            activity_avg = get_average(subdata, window)
            new_df.loc[len(new_df.index)] = [su_id, activity_avg, 'day' + str(day)]

            day += 1
    except:
        new_df = pd.DataFrame(columns=['su-id', 'activity-average', 'day/night'])
        return new_df
    finally:
        return new_df
    return new_df


In [265]:
new = process_data(day_night_data)
new

Unnamed: 0,su-id,activity-average,day/night
0,10000100,7.250000,night1
1,10000100,17.766667,night2
2,10000100,10.925000,night3
3,10000100,46.982972,day1
4,10000100,73.547059,day2
...,...,...,...
4658,10043980,6.754167,night3
4659,10043980,61.232143,day1
4660,10043980,73.557292,day2
4661,10043980,82.678125,day3


In [289]:
new.loc[new['day/night'].str.contains("night"),'status'] = 'night'
new.loc[new['day/night'].str.contains("day"),'status'] = 'day'


In [290]:
new

Unnamed: 0,su-id,activity-average,day/night,status
0,10000100,7.250000,night1,night
1,10000100,17.766667,night2,night
2,10000100,10.925000,night3,night
3,10000100,46.982972,day1,day
4,10000100,73.547059,day2,day
...,...,...,...,...
4658,10043980,6.754167,night3,night
4659,10043980,61.232143,day1,day
4660,10043980,73.557292,day2,day
4661,10043980,82.678125,day3,day


In [322]:
#get average and standard deviation of days/nights 
dn_means = new.groupby(['su-id', 'status'],  as_index=False)["activity-average"].agg(['mean'])


In [324]:
#df2=df.pivot(index='countries', columns='metrics', values='values')
pivot_mean = dn_means.pivot_table(index = 'su-id', columns = 'status', values = 'mean' )

In [326]:
pivot_mean = pivot_mean.rename(columns={'day': 'day_mean', 'night': 'night_mean'})

In [328]:
dn_stds = new.groupby(['su-id', 'status'],  as_index=False)["activity-average"].agg(['std'])

In [329]:
pivot_std = pivot_std.rename(columns={'day': 'day_std', 'night': 'night_std'})

In [331]:
pivot_std

status,day_std,night_std
su-id,Unnamed: 1_level_1,Unnamed: 2_level_1
10000100,23.685189,5.337201
10000200,26.773715,2.338468
10000300,13.743435,12.264820
10000390,31.761544,4.817667
10000391,27.580302,4.958227
...,...,...
10043741,4.865689,0.852780
10043770,16.001105,6.359299
10043790,17.818507,2.006469
10043791,14.822356,2.559212


In [332]:
dn_stats = pivot_mean.merge(pivot_std, on='su-id', how='inner')

In [333]:
dn_stats

status,day_mean,night_mean,day_std,night_std
su-id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10000100,71.364463,11.980556,23.685189,5.337201
10000200,130.585764,7.466139,26.773715,2.338468
10000300,52.996441,16.945602,13.743435,12.264820
10000390,38.306918,13.287963,31.761544,4.817667
10000391,72.864729,21.115079,27.580302,4.958227
...,...,...,...,...
10043741,70.247527,11.557272,4.865689,0.852780
10043770,122.474307,16.622698,16.001105,6.359299
10043790,55.372563,15.508148,17.818507,2.006469
10043791,84.900962,14.439352,14.822356,2.559212


In [334]:
dn_stats.to_csv('day_night_stats.csv')

In [264]:
len(new["su-id"].unique())

686

In [266]:
for s in day_night_data["su_id"].unique(): 
    if s not in new["su-id"].unique(): 
        print(s)

10000330
10016640
10037310


In [None]:
#manually determine the activity averages for days/nights on these 3 

In [270]:
err1 = day_night_data[day_night_data["su_id"] == 10000330]

In [271]:
len(err1)

73

In [277]:
#they sleep only once in 72 hours? something's up
#same in R, might be that they used a different subset of hours for their actigraphy records
err1.tail(24)

Unnamed: 0,original_index,su_id,hr,hract,rest_ratio
254,255,10000330,2010-11-14 17,33.566667,-0.3
255,256,10000330,2010-11-14 18,29.983333,-1.0
256,257,10000330,2010-11-14 19,62.7,-1.0
257,258,10000330,2010-11-14 20,42.766667,-1.0
258,259,10000330,2010-11-14 21,14.216667,-1.0
259,260,10000330,2010-11-14 22,14.183333,-1.0
260,261,10000330,2010-11-14 23,33.916667,-1.0
261,262,10000330,2010-11-15 00,22.183333,0.233333
262,263,10000330,2010-11-15 01,18.666667,1.0
263,264,10000330,2010-11-15 02,11.616667,1.0


In [278]:
err2 = day_night_data[day_night_data["su_id"] == 10016640]

In [279]:
len(err2)

64

In [281]:
#another no sleeper... 
err2.head(40)

Unnamed: 0,original_index,su_id,hr,hract,rest_ratio
17459,17460,10016640,2010-09-30 15,60.226415,-1.0
17460,17461,10016640,2010-09-30 16,67.916667,-1.0
17461,17462,10016640,2010-09-30 17,28.4,-1.0
17462,17463,10016640,2010-09-30 18,29.583333,-1.0
17463,17464,10016640,2010-09-30 19,30.666667,-1.0
17464,17465,10016640,2010-09-30 20,54.566667,-1.0
17465,17466,10016640,2010-09-30 21,27.238095,-1.0
17466,17467,10016640,2010-10-01 10,85.181818,-1.0
17467,17468,10016640,2010-10-01 11,80.683333,-1.0
17468,17469,10016640,2010-10-01 12,30.4,-1.0


In [282]:
err3 = day_night_data[day_night_data["su_id"] == 10037310]

In [286]:
#we can probably manually calculate nights/days here 
err3.head(40)

Unnamed: 0,original_index,su_id,hr,hract,rest_ratio
40746,40747,10037310,2010-10-15 18,58.648649,-1.0
40747,40748,10037310,2010-10-15 19,25.183333,-0.833333
40748,40749,10037310,2010-10-15 20,9.283333,1.0
40749,40750,10037310,2010-10-15 21,20.5,1.0
40750,40751,10037310,2010-10-15 22,45.3,1.0
40751,40752,10037310,2010-10-15 23,26.216667,1.0
40752,40753,10037310,2010-10-16 00,10.6,1.0
40753,40754,10037310,2010-10-16 01,2.933333,1.0
40754,40755,10037310,2010-10-16 02,7.616667,1.0
40755,40756,10037310,2010-10-16 03,23.553571,1.0


In [246]:
process_data(day_night_data)

[10000100 10000200 10000300 10000330 10000390 10000391 10000470 10000471
 10000570 10000590 10000591 10000650 10000651 10000770 10000771 10000980
 10000981 10001170 10001230 10001231 10001300 10001301 10001330 10001380
 10001610 10001611 10001630 10001740 10001890 10001900 10001950 10002050
 10002170 10002270 10002271 10002300 10002301 10002460 10002540 10002600
 10002660 10002661 10002970 10002980 10003140 10003201 10003310 10003311
 10003460 10003461 10003560 10003970 10004070 10004150 10004270 10004360
 10004441 10004470 10004471 10004580 10004660 10004661 10004700 10004750
 10004790 10004840 10004860 10004930 10004931 10005070 10005071 10005080
 10005090 10005100 10005101 10005110 10005111 10005130 10005240 10005290
 10005291 10005300 10005301 10005370 10005830 10005831 10005840 10005841
 10005930 10006051 10006120 10006350 10006351 10006380 10006510 10006540
 10006740 10006800 10006810 10006811 10006890 10006900 10006901 10006970
 10007000 10007070 10007110 10007180 10007220 10007

KeyError: '[41 42 43 44 45 46 47 48 49 50 51 52 53] not found in axis'

In [138]:
def get_days(subdata, nights):
    length = subdata.size
    sorted_nights = sorted(nights, key=lambda x: x[0])
    
    days = []
    
    if (sorted_nights[0][0] != 0):
        days.append([0, sorted_nights[0][0] - 1])
    days.append([sorted_nights[0][1] + 1, sorted_nights[1][0] - 1])
    days.append([sorted_nights[1][1] + 1, sorted_nights[2][0] - 1])
    if (sorted_nights[2][1] != len(subdata)):
        days.append([sorted_nights[2][1] + 1, len(subdata) - 1])    
    
    return days
    
    

In [25]:
def get_average(subdata, window):
    hract_sum = 0
    for index, row in subdata.iterrows():
        if index >= window[0] and index <= window[1]:
            hract_sum += row["hract"]
    return hract_sum / (window[1] - window[0] + 1)

In [240]:
def get_nights(subdata):
    k = 3
    all_indices = []
    
    for i in range(0, k):
        
        if i > 0:
            indices_to_drop = [x for x in range(all_indices[i-1][0], all_indices[i-1][1])]
            subdata = subdata.drop(index=indices_to_drop)
            subdata.reset_index()

        
        maximum_value = -10000000
        current_max = 0
        indices = [0, 1]
        s = 0
        
        for index, row in subdata.iterrows():
            current_max += row["rest_ratio"]

            if maximum_value < current_max:
                maximum_value = current_max
                indices[1] = row['original_index'] - 1
                indices[0] = s - 1

            if current_max < 0:
                current_max = 0
                s = row['original_index'] + 1
        all_indices.append(indices)
    return all_indices
            

In [115]:
get_nights(one_df)

[[58, 68], [9, 18], [36, 43]]

In [111]:
one_df = day_night_data[day_night_data.su_id == 10000100]

In [113]:
day_night_data.head(20)

Unnamed: 0,original_index,su_id,hr,hract,rest_ratio
0,1,10000100,2011-01-13 10,25.463415,-1.0
1,2,10000100,2011-01-13 11,16.316667,0.0
2,3,10000100,2011-01-13 12,29.433333,-0.533333
3,4,10000100,2011-01-13 13,50.15,-1.0
4,5,10000100,2011-01-13 14,25.683333,-1.0
5,6,10000100,2011-01-13 15,39.15,-1.0
6,7,10000100,2011-01-13 16,91.75,-1.0
7,8,10000100,2011-01-13 17,95.616667,-1.0
8,9,10000100,2011-01-13 18,49.283333,-1.0
9,10,10000100,2011-01-13 19,3.116667,0.2


In [15]:
actigraphy_lists = day_night_data.to_numpy().tolist()

In [16]:
actigraphy_lists

[[1, 10000100, '2011-01-13 10', 25.4634146341463, 0.0, 0],
 [2, 10000100, '2011-01-13 11', 16.3166666666667, 0.5, 0],
 [3, 10000100, '2011-01-13 12', 29.4333333333333, 0.233333333333333, 0],
 [4, 10000100, '2011-01-13 13', 50.15, 0.0, 0],
 [5, 10000100, '2011-01-13 14', 25.6833333333333, 0.0, 0],
 [6, 10000100, '2011-01-13 15', 39.15, 0.0, 0],
 [7, 10000100, '2011-01-13 16', 91.75, 0.0, 0],
 [8, 10000100, '2011-01-13 17', 95.6166666666667, 0.0, 0],
 [9, 10000100, '2011-01-13 18', 49.2833333333333, 0.0, 0],
 [10, 10000100, '2011-01-13 19', 3.11666666666667, 0.6, 1],
 [11, 10000100, '2011-01-13 20', 85.7666666666667, 1.0, 1],
 [12, 10000100, '2011-01-13 21', 24.4833333333333, 1.0, 1],
 [13, 10000100, '2011-01-13 22', 5.75, 1.0, 1],
 [14, 10000100, '2011-01-13 23', 0.65, 1.0, 1],
 [15, 10000100, '2011-01-14 00', 11.5833333333333, 1.0, 1],
 [16, 10000100, '2011-01-14 01', 8.16666666666667, 1.0, 1],
 [17, 10000100, '2011-01-14 02', 5.61666666666667, 1.0, 1],
 [18, 10000100, '2011-01-14 03',

In [None]:
prior_suid = 10000100

prior_score = 0 

day_num = 1 
night_num = 1

for row in actigraphy_lists: 
    
    #if same person
    if row[1] == prior_suid: 
        
        
        #determine what day/night we're on
        
        #this doesn't account for waking time during the night or vice versa... 

        if row[5] == 0:
            #if it's equal to the prior score then we're on the same day
            if row[5] == prior_score:
                row.append(day_num)
                #reset prior score
                prior_score = row[5]
            else:
            #if it's not equal to the prior score, then we're on a new day
                day_num+=1
                row.append(day_num)
                #reset prior score
                prior_score = row[5]

    
        
        if row[5] == 1:
            if row[5] == prior_score: 
                row.append(night_num)
    
    else:
        #start this loop again

In [17]:
one_df

Unnamed: 0.1,Unnamed: 0,su_id,hr,hract,rest_ratio,rest_indicator
0,1,10000100,2011-01-13 10,25.463415,0.000000,0
1,2,10000100,2011-01-13 11,16.316667,0.500000,0
2,3,10000100,2011-01-13 12,29.433333,0.233333,0
3,4,10000100,2011-01-13 13,50.150000,0.000000,0
4,5,10000100,2011-01-13 14,25.683333,0.000000,0
...,...,...,...,...,...,...
68,69,10000100,2011-01-16 06,8.716667,0.883333,1
69,70,10000100,2011-01-16 07,72.333333,0.000000,0
70,71,10000100,2011-01-16 08,163.233333,0.000000,0
71,72,10000100,2011-01-16 09,86.766667,0.000000,0


In [14]:
#want to loop through for each SU_ID and label whether it's night 1, day 1, night 2, day 2, night 3, day 3 

for index, row in one_df.iterrows():
    print(row)

Unnamed: 0                    1
su_id                  10000100
hr                2011-01-13 10
hract                 25.463415
rest_ratio                  0.0
rest_indicator                0
Name: 0, dtype: object
Unnamed: 0                    2
su_id                  10000100
hr                2011-01-13 11
hract                 16.316667
rest_ratio                  0.5
rest_indicator                0
Name: 1, dtype: object
Unnamed: 0                    3
su_id                  10000100
hr                2011-01-13 12
hract                 29.433333
rest_ratio             0.233333
rest_indicator                0
Name: 2, dtype: object
Unnamed: 0                    4
su_id                  10000100
hr                2011-01-13 13
hract                     50.15
rest_ratio                  0.0
rest_indicator                0
Name: 3, dtype: object
Unnamed: 0                    5
su_id                  10000100
hr                2011-01-13 14
hract                 25.683333
rest_ratio  