# Check Cortisol samples before sending them to the lab

In [1]:
import os
import pandas as pd
import numpy as np
import re
import datetime

In [2]:
# Set root dir: step the working directory one level up and remember it.
# NOTE: every execution of this cell moves up one more level, so run it only
# once per session.
os.chdir(os.path.join(os.getcwd(), os.pardir))
ROOT_DIR = os.getcwd()

In [3]:
# import data
# filename = "BIBO10 Speeksel data FINAL 3-7-2018.xlsx"  # the original file
# The original file needed manual edits, so a modified copy is loaded instead:
filename = "BIBO10_saliva_03-07-2018_modified.xlsx"
xl = pd.ExcelFile(f"{ROOT_DIR}/data/cortisol/10_years/{filename}")
# one DataFrame per sheet; sheets are selected by position (1, 3 and 5)
df_list = [xl.parse(sheet_pos) for sheet_pos in (1, 3, 5)]

# new column names for the first parsed sheet
colnames_0 = [
    "ID", "complete",
    "sick_last_seven_d", "sick_notes",
    "medication_last_seven_d", "medication_notes",
    "abx_last_seven_d", "abx_notes",
    "extra_notes",
]
# new column names shared by the second and third parsed sheets
colnames_1 = [
    "ID", "complete", "date",
    "time_wakeup", "time_c1", "time_c2", "time_c3", "time_c4",
    "time_lunch", "time_dinner",
    "snacks", "time_snacks_1", "time_snacks_2", "time_snacks_3",
    "activity_morning", "activity_afternoon",
    "normal_day_question", "normal_day_notes",
    "problem_question", "problem_notes",
    "notes_yellow", "exclude", "notes_red",
]

# clean dfs
for position, sheet in enumerate(df_list):
    # rename the columns in place on the freshly parsed frame
    sheet.columns = colnames_0 if position == 0 else colnames_1
    # Drop the first row, then replace the value 999 by np.nan.
    # (Dropping incomplete rows is deliberately left disabled:
    #  df_list[i] = df.loc[df.complete == 1, :])
    df_list[position] = sheet.iloc[1:, :].replace(to_replace=999, value=np.nan)

In [4]:
# # insert 0 for all that I did not manually exclude
# def rep(value):
#     if value != 1:
#         return(0)
#     return(value)
# 
# df_list[1].exclude = df_list[1].exclude.apply(rep)
# df_list[2].exclude = df_list[2].exclude.apply(rep)

# writer = pd.ExcelWriter(f"{ROOT_DIR}/data/cortisol/10_years/cortisol_10_henrik.xlsx")
# for i, sheet in enumerate(["additional_info", "saliva_day_1", "saliva_day_2"]):
#     df_list[i].to_excel(writer, na_rep = 99999, sheet_name = sheet)
# writer.save()

# Clean time values
I require pandas timestamp objects. To convert to these I need a specific string format, e.g. "hh:mm:ss". In the following I check which strings deviate from this format, other than _np.nan_. If multiple strings deviate in the same way, I will write a function. Otherwise I will edit single values in a copy of the Excel file. The changed file is *BIBO10_saliva_03-07-2018_modified.xlsx*. To see the file as it was before modification, use the commented-out filename at the top; then you can see what I modified. With the modified file you now only see the np.nan values.

**Check these at RU:**  
- ID: 231 check at least day 2 to find out why this was mentioned behind C2: (1x, forgot 2d)
- ID: 249 check what 2x means, maybe one samples needs to be disregarded
- ID: 278 same here...
- ID: 362 both days
- ID: 376 What does (1-2) mean behind the time on day 1
- ID: 413 day 1 wakeup time
- ID: 452 did they provide all samples on the indicated days or just those where there is a data?
- ID: 409 the date was noted as 29.02.2017, which does not exist

**Answers for above notes:**  
- ID: 231 We asked them to take 2 samples at C2.
- ID: 249 Likely it meant just that they took 2 samples...
- ID: 278 same here...
- ID: 362 both days
- ID: 376 What does (1-2) mean behind the time on day 1
- ID: 413 day 1 wakeup time = 07:30
- ID: 452 for both days samples have been provided on separate days and therefore I excluded them all.
- ID: 409 it must have been a Saturday based on the notes, thus I changed the dates to 25 and 26 Feb because this is the closest weekend. Will ask Roseriet

**Open questions:**  
- ID 231: Is it a problem that 1 sample is missing at C2?
- ID 234 awoke at soccer. No wakeup time provided, what to do? I set wakeup time to np.nan. But e.g. ID 241 also awoke at soccer but since time provided, I used it.
- if ID 452 (and other IDs) provided some samples at another day, should I keep some samples or will we want to exclude them completely?
- 409 provided non-existent date, what should I do?



**Exclude:**  
- 253: C2 one sample too late, the other not
- 259: C1 follow up date
- 448: C2 too late (13:00)



In [5]:
# See markdown text above for explanation: report every time value of a
# complete row (complete == 1) that is not formatted as "hh:mm:ss".
pattern = re.compile(r"^\d\d:\d\d:\d\d$")
time_columns = ["time_wakeup", "time_c1", "time_c2", "time_c3", "time_c4"]
# df_list[1:] are the two per-day sheets; number them 1 and 2 for the report
for day, df in enumerate(df_list[1:], start=1):
    for time in time_columns:
        for complete, participant, s in zip(df["complete"], df["ID"], df[time]):
            if complete == 1 and pattern.match(str(s)) is None:
                print(f"day {day}, {time}, ID: {participant}, string: {s}")

day 1, time_wakeup, ID: 234.0, string: nan
day 1, time_wakeup, ID: 288.0, string: nan
day 1, time_wakeup, ID: 345.0, string: nan
day 1, time_c1, ID: 224.0, string: nan
day 1, time_c1, ID: 260.0, string: nan
day 1, time_c1, ID: 288.0, string: nan
day 1, time_c1, ID: 314.0, string: nan
day 1, time_c1, ID: 436.0, string: nan
day 1, time_c2, ID: 203.0, string: nan
day 1, time_c2, ID: 417.0, string: nan
day 1, time_c3, ID: 203.0, string: nan
day 1, time_c3, ID: 212.0, string: nan
day 1, time_c3, ID: 281.0, string: nan
day 1, time_c3, ID: 356.0, string: nan
day 1, time_c3, ID: 417.0, string: nan
day 1, time_c4, ID: 203.0, string: nan
day 1, time_c4, ID: 206.0, string: nan
day 1, time_c4, ID: 220.0, string: nan
day 1, time_c4, ID: 224.0, string: nan
day 1, time_c4, ID: 241.0, string: nan
day 1, time_c4, ID: 244.0, string: nan
day 1, time_c4, ID: 308.0, string: nan
day 1, time_c4, ID: 339.0, string: nan
day 1, time_c4, ID: 345.0, string: nan
day 1, time_c4, ID: 356.0, string: nan
day 1, time_c

In [6]:
# # the date of the timestamp object was "today" because the timestring only
# # provides hours, minutes, seconds. I get the info from the date column
# # and add it for each timestamp. Make sure that date == the date the sample
# # was really taken. I print out for which value it did not work:
# for dfi in [1, 2]:
#    for i, row in enumerate(df_list[dfi].index):
#        for col in np.arange(3, 8):
#            try:
#                year = df_list[dfi].date[row].year
#                month = df_list[dfi].date[row].month
#                day = df_list[dfi].date[row].day
#                df_list[dfi].iloc[i, col] = df_list[dfi].iloc[i, col].replace(year = year, month = month, day = day)
#            # does not work for np.nan
#            except (TypeError, AttributeError):
#                print(f"{dfi}, {row}, {col}")
#               #print(df_list[dfi].date[row])

# Ready to work with data
Now it should be possible to check if any time falls within a desired time window and define an action if it does not.

# TODO:

## Count values that are missing a mealtime or measurement time

Number of cases where any measurement time is missing:  
Number of cases where any mealtime is missing:  
 

## Check dates
The days must be consecutive. Important: Deviations from the date column are only visible in the notes, not in the date values of the times. This was the case in the sheet for some dates but this was not suitable for analyses in any program.

In [7]:
# preview the first five rows of every parsed sheet
for frame in df_list:
    display(frame.head(n=5))

Unnamed: 0,ID,complete,sick_last_seven_d,sick_notes,medication_last_seven_d,medication_notes,abx_last_seven_d,abx_notes,extra_notes
1,201,1.0,2.0,0.0,2.0,0,2.0,0.0,
2,202,1.0,2.0,0.0,2.0,0,2.0,0.0,
3,203,1.0,2.0,0.0,2.0,Multivitamin every morning.,2.0,0.0,
4,204,1.0,,,,,,,
5,205,1.0,2.0,0.0,2.0,0,2.0,0.0,


Unnamed: 0,ID,complete,date,time_wakeup,time_c1,time_c2,time_c3,time_c4,time_lunch,time_dinner,...,time_snacks_3,activity_morning,activity_afternoon,normal_day_question,normal_day_notes,problem_question,problem_notes,notes_yellow,exclude,notes_red
1,201.0,1.0,2016-04-30 00:00:00,07:30:00,07:45:00,10:55:00,15:05:00,19:00:00,12:00:00,18:00:00,...,0.0,"IPad , hairdresser","Shopping kitchens for our new home, IPad",1,0,2.0,0,,0.0,
2,202.0,1.0,2016-05-12 00:00:00,08:30:00,09:30:00,11:05:00,15:12:00,19:00:00,13:40:00,17:45:00,...,0.0,,,2,Holiday,1.0,"At his father, so I didn't do the saliva sampl...",,0.0,
3,203.0,1.0,2016-05-01 00:00:00,07:45:00,07:54:00,,,,,19:15:00,...,,Soccer,Soccer,2,Holiday,1.0,Hard to do the saliva samples.,,0.0,
4,204.0,1.0,2016-08-06 00:00:00,07:45:00,08:00:00,11:30:00,15:15:00,20:30:00,13:30:00,18:00:00,...,0.0,"Got up, shopping",Played outside,1,0,2.0,0,,0.0,
5,205.0,1.0,2016-09-03 00:00:00,08:15:00,08:35:00,11:00:00,15:00:00,19:00:00,13:00:00,17:00:00,...,0.0,Drawing lessons,Library,1,0,1.0,Day 1-1: unfortunately it fell on the ground.,,0.0,


Unnamed: 0,ID,complete,date,time_wakeup,time_c1,time_c2,time_c3,time_c4,time_lunch,time_dinner,...,time_snacks_3,activity_morning,activity_afternoon,normal_day_question,normal_day_notes,problem_question,problem_notes,notes_yellow,exclude,notes_red
1,201,1.0,2016-05-01 00:00:00,06:25:00,06:30:00,11:05:00,15:05:00,19:05:00,14:00:00,18:00:00,...,0,"Played inside, read","I-pad, played",1.0,0,2.0,0,,0.0,
2,202,1.0,2016-05-15 00:00:00,09:00:00,09:20:00,11:10:00,15:06:00,19:05:00,13:29:00,17:20:00,...,15:10:00,,,2.0,Holiday,1.0,"Sample 11:10 fell, used the extra one. / We ke...",,0.0,
3,203,1.0,2016-05-02 00:00:00,07:15:00,07:45:00,10:48:00,15:45:00,,13:15:00,16:45:00,...,15:15:00,Soccer,"Visit, chilling",2.0,Holiday,1.0,Holiday.,,0.0,
4,204,1.0,2016-08-07 00:00:00,07:45:00,08:00:00,10:50:00,15:10:00,19:00:00,13:00:00,18:00:00,...,0,Visiting grandma,Playing outside,1.0,0,2.0,0,,0.0,
5,205,1.0,2016-09-04 00:00:00,08:10:00,08:30:00,12:30:00,15:00:00,19:00:00,12:30:00,19:00:00,...,0,Played at home,Visiting grandpa,1.0,0,2.0,Forgotten second sample at 11:00. I took two s...,,0.0,


## Check disease/sickness/medication
- check *normal_day_question*: if not normal (2), read *normal_day_notes* and exclude if ill or otherwise "disqualified"
- check *sick_last_seven_d* and if yes, read *sick_notes*  
- check *abx_last_seven_d*/*medication_last_seven_d* and exclude if abx started > 2 days of sample taking or if taking medication with systemic action

## Result:
- 226 Two disease symptoms
- 245 Food: check times and food interference from (12:30-15:00) due to "lot of candy" **I checked, is fine**
- 227 Medication: Check type of injection?! ("Got his immunotherapy injections.") **is not documented, ask Roseriet**
- 420 Two disease symptoms: Night before he went to the school party. He feels sick: diarrhea, abdominal pain. **I still include this one because he obviously played the soccer game...**


In [8]:
# Print the normal_day_notes of every row whose normal_day_question equals 2.
# NOTE: in the end both normal and non-normal days were checked, because
# diseases were also noted when normal == 1.
for dfi in (1, 2):
    df = df_list[dfi]
    print(f"\nDay {dfi}: \n\n")
    not_normal = df.loc[df["normal_day_question"] == 2, ["ID", "normal_day_notes"]]
    for participant, note in zip(not_normal["ID"], not_normal["normal_day_notes"]):
        print(f"ID {participant}: {note}")


Day 1: 


ID 202.0: Holiday
ID 203.0: Holiday
ID 207.0: Holiday (child was at fathers house yesterday)
ID 209.0: On Saturday he can stay up late till 21:30, normally it's 20:00
ID 220.0: Not for a Saturday. Normally scouting in the morning, but not this week. 
ID 224.0: Normally soccer game.
ID 226.0: Still abdominal pain and diarrhea, fair at the village.
ID 231.0: Exciting for new drum kit, birthday.
ID 238.0: Stayed up late.
ID 245.0: We had a birthday party (12:30-15:00), so ate a lot of candy!
ID 257.0: Normal Saturday, party with class this evening till 23:00.
ID 260.0: A busy Saturday.
ID 262.0: Most of the time goes to bed earlier.
ID 264.0: Normally she's awake at 9:00, not now because of the school camp.
ID 271.0: Day trip.
ID 272.0: Holiday, it was very hot.
ID 273.0: Weekend.
ID 286.0: Holiday, visit of twins of 3 years old.
ID 288.0: Much more quiet then normal.
ID 292.0: Late to bed because of a party.
ID 298.0: Yesterday we arrived home from holiday at 22:30.
ID 300.0: 

In [9]:
# For the first sheet, list the notes of every row flagged with 1 in
# medication_last_seven_d, then of every row flagged with 1 in
# sick_last_seven_d.
df = df_list[0]
checks = (
    ("\n\n\nMedication", "medication_last_seven_d", "medication_notes"),
    ("\n\n\nSickness", "sick_last_seven_d", "sick_notes"),
)
for heading, flag_col, notes_col in checks:
    print(heading)
    reported = df.loc[df[flag_col] == 1, ["ID", notes_col]]
    for participant, note in zip(reported["ID"], reported[notes_col]):
        print(f"ID {participant}: {note}")

# 354 Risperidon 0,5 mg 1 dd. 1 (neuroleptikum)
# For some I am not sure whether they are considered systemic.
# E.g. antihistamines, I know, can have immune modulatory properties.




Medication
ID 227: 1 x per day Xyzal 0,5 mg / 1 x per day 1 inhaler seretide 125.
ID 237: Every morning at breakfast: aerius 2,5 mg (antihistamine) 1 d.d./ Rhinocort nasalspray 1 d.d. inhaler
ID 238: nan
ID 244: Bisolvon (chroomhexodine)
ID 251: Triamcinolon (ointment for eczema) vaselinelanettecreme, Forlax (stool), fluticason (nasalpray).
ID 254: Paracetamol, 500 mg headache because of the heat.
ID 268: Obybutynine tlcl accord 5 mg mo+lunch+forlax, 10 gr in the morning
ID 271: paracetamol  120 mg 3 
ID 276: Omega 3 fish oil 1.000 mg p/dag , magnesium citrate 200 mg p/dag (only on workdays)
ID 278: Cozaar, 40 mg
ID 295: Vitamin B12
ID 297: 250 mg paracetamol 17-8-2016
ID 298: Macrogolum 4000 every day 30 gram
ID 317: Affusine ointment and Purol eczema, Ventolin 100 inhaler during sports), 1x per day. 
ID 334: 2 days 5 mg methylphenidate, not during samples.
ID 338: Ointment: cetamacogrol 2x/day, 0,5 paracetamol 2x (Sunday, Monday).
ID 340: Melatonin, 2x 3-4 tablets 0,1 mg (drugstor

## Exclude: 
- 254 was excluded anyways due to systemic medication
- ask Roseriet: 285 has three disease symptoms but no mentioning on the saliva sheets of that.


In [10]:
# check abx: print the abx_notes of every row with abx_last_seven_d == 1
df = df_list[0]
abx_rows = df.loc[df["abx_last_seven_d"] == 1, ["ID", "abx_notes"]]
for participant, note in zip(abx_rows["ID"], abx_rows["abx_notes"]):
    print(f'ID {participant}: {note}')

# ID 359: the notes do not say how long abx was taken.
# I checked: no mention on paper either. Thus, abx was indicated as yes
# but there is no more info.

ID 359: nan


## Check times

1. Note the time between C1 and *time_wakeup*, which should be within 15 min; the sample is not excluded (but noted) if it is not within 15 min.
2. C1 must be $\geq$ 06:00:00 $\leq$ 10:00:00  
3. C2 must be $\geq$ 10:00:00 $\leq$ 12:00:00  
4. C3 must be $\geq$ 14:00:00 $\leq$ 16:00:00  
5. C4 must be $\geq$ 18:00:00 $\leq$ 21:00:00 

I need to ask Roseriet what I should do if one value is too late: just exclude that one sample, or exclude all values of the day? I can imagine that for circadian cortisol, e.g., the first sample is quite critical.

In the following, I print the values out of range:

In [15]:
# Create the note column for timing, initialised to "" in every row.
# A scalar is broadcast to all rows. A pd.Series built from a list would get
# a fresh 0..n-1 index, and assign() aligns Series values on index labels;
# because the first row of each frame was dropped during cleaning, the frame
# labels start at 1 and the last row would receive NaN instead of "".
df_list[1] = df_list[1].assign(notes_timing="")
df_list[2] = df_list[2].assign(notes_timing="")

def diff_min(end, begin):
    """Return the number of whole minutes from *begin* to *end*.

    Both arguments must expose ``.hour`` and ``.minute`` (e.g.
    ``datetime.time``); seconds are ignored. The result is negative when
    *end* is earlier on the clock than *begin*. Objects without these
    attributes (such as NaN) raise ``AttributeError``.
    """
    end_minutes = 60 * end.hour + end.minute
    begin_minutes = 60 * begin.hour + begin.minute
    return end_minutes - begin_minutes

def calc_hours(minutes):
    """Format an integer number of minutes as ``h:mm``.

    The minute part is zero-padded to two digits so that, for example,
    60 becomes "1:00" and 5 becomes "0:05" (unpadded "1:0" / "0:5" are
    ambiguous). Negative durations are formatted from their absolute
    value with a leading minus sign.
    """
    sign = "-" if minutes < 0 else ""
    hours, remainder = divmod(abs(minutes), 60)
    return f"{sign}{hours}:{remainder:02d}"

        

# C1 - wakeup: note every row where C1 was taken more than 15 minutes after
# waking up; rows within 15 minutes get an empty note.
for day_df in df_list[1:]:
    for label in day_df.index:
        try:
            gap = diff_min(day_df.loc[label, "time_c1"], day_df.loc[label, "time_wakeup"])
        except AttributeError:
            # at least one of the two values is not a time object (e.g. NaN);
            # only complete rows get a note about that
            if day_df.loc[label, "complete"] == 1:
                day_df.loc[label, "notes_timing"] = "Could not calculate time between wakeup and C1; "
            continue
        day_df.loc[label, "notes_timing"] = (
            f"Time between wakeup and C1 was {calc_hours(gap)} h; " if gap > 15 else ""
        )

# Flag and print every sample taken outside its allowed time window
# (bounds inclusive). One table-driven loop replaces four copy-pasted loops;
# the iteration order (sample -> day -> row) keeps the printed output and the
# order of the appended notes the same as before.
sample_windows = [
    ("C1", "time_c1", datetime.time(6, 0), datetime.time(10, 0)),
    ("C2", "time_c2", datetime.time(10, 0), datetime.time(12, 0)),
    ("C3", "time_c3", datetime.time(14, 0), datetime.time(16, 0)),
    ("C4", "time_c4", datetime.time(18, 0), datetime.time(21, 0)),
]
for label, column, earliest, latest in sample_windows:
    for dfi, df in enumerate(df_list[1:]):
        for row in df.index:
            sampled = df.loc[row, column]
            # only real clock times can be compared; NaN or free text is skipped
            if not isinstance(sampled, datetime.time):
                continue
            if earliest <= sampled <= latest:
                continue
            # a missing note (NaN) must not break the string concatenation
            previous = df.loc[row, "notes_timing"]
            if not isinstance(previous, str):
                previous = ""
            df.loc[row, "notes_timing"] = previous + f"{label} out of range; "
            print(f'{label}, day {dfi+1}: {df.loc[row, "ID"]}, {sampled}')

C1, day 1: 213.0, 10:30:00
C1, day 1: 264.0, 10:45:00
C1, day 1: 302.0, 10:27:00
C1, day 1: 312.0, 10:10:00
C1, day 1: 318.0, 10:21:00
C1, day 1: 338.0, 10:15:00
C1, day 1: 362.0, 11:15:00
C1, day 1: 403.0, 19:15:00
C1, day 2: 252, 10:15:00
C1, day 2: 271, 10:25:00
C1, day 2: 274, 10:30:00
C1, day 2: 312, 10:15:00
C1, day 2: 318, 10:11:00
C1, day 2: 359, 10:15:00
C1, day 2: 383, 10:15:00
C1, day 2: 454, 10:15:00
C2, day 1: 262.0, 12:10:00
C2, day 1: 278.0, 13:30:00
C2, day 1: 284.0, 12:18:00
C2, day 1: 288.0, 12:25:00
C2, day 1: 309.0, 12:05:00
C2, day 1: 311.0, 12:55:00
C2, day 1: 355.0, 12:10:00
C2, day 1: 367.0, 12:10:00
C2, day 1: 385.0, 13:40:00
C2, day 1: 435.0, 12:15:00
C2, day 1: 453.0, 12:32:00
C2, day 2: 205, 12:30:00
C2, day 2: 240, 12:47:00
C2, day 2: 309, 12:06:00
C2, day 2: 343, 13:15:00
C2, day 2: 355, 12:12:00
C2, day 2: 362, 08:00:00
C2, day 2: 385, 13:00:00
C2, day 2: 435, 12:30:00
C2, day 2: 444, 12:05:00
C2, day 2: 451, 12:40:00
C3, day 1: 258.0, 16:30:00
C3, day 1:

In [20]:
# show the timing notes collected for the first per-day sheet
df_list[1]["notes_timing"]

1                                                       
2                 Time between wakeup and C1 was 1:0 h; 
3                                                       
4                                                       
5                Time between wakeup and C1 was 0:20 h; 
6                                                       
7                                                       
8                                                       
9                                                       
10                                                      
11                                                      
12                                                      
13     Time between wakeup and C1 was 3:30 h; C1 out ...
14                                                      
15                                                      
16                                                      
17                                                      
18                             

## Check food
Check whether there was food intake before the samples. If substantial food intake present, exclude.

## Result:
See print and:  
- check 207 with Roseriet because mealtime == sampling time


In [21]:
def calc_min(time):
    """Return the minutes since midnight for a ``datetime.time`` object.

    Only the hour and minute fields are used; seconds are ignored.
    """
    return time.hour * 60 + time.minute

# Print every sample that was taken less than max_time minutes after a
# recorded lunch or snack (a meal at exactly the sampling time counts too).
# Change max_time to specify the time difference allowed (in minutes).
max_time = 15
# NOTE(review): "time_dinner" is not part of this list - confirm that dinner
# is meant to be left out of the check.
snack_times = ["time_lunch", "time_snacks_1", "time_snacks_2", "time_snacks_3"]
sample_times = ["time_c1", "time_c2", "time_c3", "time_c4"]
for dfi, df in enumerate(df_list[1:]):
    for row in df.index:
        # can only check if a time was provided
        meal_minutes = [
            calc_min(df.loc[row, column])
            for column in snack_times
            if isinstance(df.loc[row, column], datetime.time)
        ]
        for t in sample_times:
            sampled = df.loc[row, t]
            if not isinstance(sampled, datetime.time):
                continue
            sample_minute = calc_min(sampled)
            # Test each meal on its own. Taking min() over the signed
            # differences would let a meal eaten AFTER the sample (negative
            # difference) hide a meal eaten shortly BEFORE it.
            if any(0 <= sample_minute - meal < max_time for meal in meal_minutes):
                print(f'{t} too close to a meal for day {dfi+1}, ID {df.loc[row, "ID"]}')


time_c3 too close to a meal for day 1, ID 207.0
time_c3 too close to a meal for day 1, ID 210.0
time_c3 too close to a meal for day 1, ID 239.0
time_c3 too close to a meal for day 1, ID 260.0
time_c3 too close to a meal for day 1, ID 301.0
time_c3 too close to a meal for day 1, ID 393.0
time_c4 too close to a meal for day 1, ID 448.0
time_c4 too close to a meal for day 2, ID 207
time_c3 too close to a meal for day 2, ID 302
time_c4 too close to a meal for day 2, ID 433
time_c4 too close to a meal for day 2, ID 437
time_c3 too close to a meal for day 2, ID 441
time_c4 too close to a meal for day 2, ID 446
time_c3 too close to a meal for day 2, ID 458
