In [63]:
import json
import logging
import os
import sys
from datetime import datetime
from io import StringIO as stringio
from pathlib import Path

import numpy as np
import pandas as pd
import requests

# This notebook prototypes the algorithm that builds the sleep logs required as input for GGIR-3.2.6

## required format:
Example of a basic sleeplog:

| ID  | onset_N1  | wakeup_N1 | onset_N2  | wakeup_N2 | onset_N3  | wakeup_N3 | onset_N4  |
|-----|-----------|-----------|-----------|-----------|-----------|-----------|-----------|
| 345 | 21:55:00  | 08:47:00  |           |           | 23:45:00  | 06:30:00  | 00:00:00  |

One column holds the participant ID; it does not have to be the first column. Specify which column it is with argument `colid`.

The remaining columns alternate: one column for onset time, then one column for waking time, for each night. Specify which column holds the first night's onset with argument `coln1`; in the above example coln1=2.

Timestamps are to be stored without date as in hh:mm:ss with hour values ranging between 0 and 23 (not 24). If onset corresponds to lights out or intention to fall asleep, then specify sleepwindowType = "TimeInBed".


In [64]:
# Directory that holds the exported per-participant sleep CSV files.
SLEEP_DIR = "/Volumes/VossLab/Repositories/Accelerometer_Data/Sleep"
# Cutoff date as MM-DD-YYYY; only files dated after it are picked up.
FROM_DATE = "08-05-2024"

# Send INFO-and-above records both to a log file in the working directory
# and to the notebook's stdout, using one shared timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler("sleep_data_processing.log"), logging.StreamHandler(sys.stdout)],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [65]:
# function to return list of lab-subject ID pairs
def _return_report(token):
    """
    pulls the id report from the rdss via redcap api.
    reads the report as a dataframe.
    checks for boost_ids that are associated with multiple lab_ids, logs a critical error,
    and removes these rows from the dataframe.
    separates duplicate rows (based on any column) from the cleaned data.
    
    returns:
        df_cleaned: dataframe with duplicates removed and problematic boost_ids excluded
        duplicate_rows: dataframe of duplicate rows
    """
    url = 'https://redcap.icts.uiowa.edu/redcap/api/'
    data = {
        'token': token,
        'content': 'report',
        'report_id': 43327,
        'format': 'csv'
    }
    r = requests.post(url, data=data)
    if r.status_code != 200:
        print(f"error! status code is {r.status_code}")
        sys.exit(1)
    
    df = pd.read_csv(stringio(r.text))
    
    # identify boost_ids associated with multiple lab_ids.
    boost_id_counts = df.groupby('boost_id')['lab_id'].nunique()
    problematic_boost_ids = boost_id_counts[boost_id_counts > 1].index.tolist()
    
    if problematic_boost_ids:
        logging.critical(f"found boost_id(s) with multiple lab_ids: {', '.join(map(str, problematic_boost_ids))}. "
                        "these entries will be removed from processing.")
        df = df[~df['boost_id'].isin(problematic_boost_ids)]
    
    # identify and separate duplicate rows based on any column.
    duplicate_rows = df[df.duplicated(keep=False)]
    df_cleaned = df.drop_duplicates(keep=False)
    
    if not duplicate_rows.empty:
        logging.info(f"duplicate rows found:\n{duplicate_rows}")
    
    return df_cleaned, duplicate_rows


In [66]:
# The REDCap API token is a secret, so it is read from the environment rather
# than stored in the notebook. Any token that has been saved in a notebook or
# repository should be regenerated in REDCap.
TOKEN = os.environ.get("REDCAP_API_TOKEN", "")
if not TOKEN:
    raise RuntimeError(
        "REDCAP_API_TOKEN is not set; export it in the environment before starting Jupyter."
    )

df_cleaned, duplicate_rows = _return_report(TOKEN)
df_cleaned


2025-05-22 08:16:40,438 - CRITICAL - found boost_id(s) with multiple lab_ids: 7023. these entries will be removed from processing.


Unnamed: 0,lab_id,boost_id
0,1023,8022
1,1043,7062
2,1051,7146
3,1093,6011
4,1097,6012
...,...,...
208,1343,8051
209,1344,7157
210,1345,8052
211,1346,8053


In [67]:
# return all csv 

def get_sleep_csvs_after_date(SLEEP_DIR, FROM_DATE):
    """
    List the sleep CSV exports in a directory whose filename date is later
    than a cutoff.

    Filenames are expected to look like "1234_3-29-2025_Sleep.csv": the
    second underscore-separated field of the stem is parsed as M-D-YYYY.
    Files that do not follow this pattern are skipped silently.

    Args:
        SLEEP_DIR: directory to scan (non-recursively) for ``*.csv`` files.
        FROM_DATE: cutoff date string in MM-DD-YYYY form; only files dated
            strictly after it are returned.

    Returns:
        DataFrame with columns ``path`` (str) and ``date`` (datetime), one row
        per matching file, in directory-listing order (not sorted). Both
        columns are present even when no file matches, so callers can always
        index ``['path']`` / ``['date']``.

    Raises:
        ValueError: if FROM_DATE is not in MM-DD-YYYY form.
    """
    cutoff = datetime.strptime(FROM_DATE, "%m-%d-%Y")

    records = []
    for csv_file in Path(SLEEP_DIR).glob("*.csv"):
        try:
            date_field = csv_file.stem.split("_")[1]  # e.g. "3-29-2025"
            file_date = datetime.strptime(date_field, "%m-%d-%Y")
        except (IndexError, ValueError):
            continue  # filename does not match the expected pattern
        if file_date > cutoff:
            records.append({"path": str(csv_file), "date": file_date})

    # Explicit columns keep the schema stable for an empty result.
    return pd.DataFrame(records, columns=["path", "date"])

# Collect every sleep export dated after FROM_DATE, then display the table.
sleep_paths = get_sleep_csvs_after_date(
    SLEEP_DIR,
    FROM_DATE,
)
sleep_paths

Unnamed: 0,path,date
0,/Volumes/VossLab/Repositories/Accelerometer_Da...,2024-08-07
1,/Volumes/VossLab/Repositories/Accelerometer_Da...,2024-08-06
2,/Volumes/VossLab/Repositories/Accelerometer_Da...,2024-08-08
3,/Volumes/VossLab/Repositories/Accelerometer_Da...,2024-08-07
4,/Volumes/VossLab/Repositories/Accelerometer_Da...,2024-08-09
...,...,...
176,/Volumes/VossLab/Repositories/Accelerometer_Da...,2025-04-26
177,/Volumes/VossLab/Repositories/Accelerometer_Da...,2025-05-01
178,/Volumes/VossLab/Repositories/Accelerometer_Da...,2025-05-05
179,/Volumes/VossLab/Repositories/Accelerometer_Da...,2025-04-30


In [68]:
# Load the first discovered export for inspection. skiprows=5 drops the five
# lines that precede the column header row (presumably export metadata —
# TODO confirm against a raw file).
test_df = pd.read_csv(sleep_paths['path'].iloc[0], skiprows=5)
test_df.head()

Unnamed: 0,Sleep Algorithm,In Bed Date,In Bed Time,Out Bed Date,Out Bed Time,Onset Date,Onset Time,Latency,Total Counts,Efficiency,Total Minutes in Bed,Total Sleep Time (TST),Wake After Sleep Onset (WASO),Number of Awakenings,Average Awakening Length,Movement Index,Fragmentation Index,Sleep Fragmentation Index
0,Cole-Kripke,8/7/2024,12:00 AM,8/7/2024,4:25 AM,8/7/2024,12:00 AM,0,26017,81.13,265,215,50,12,4.17,11.321,0.0,11.321
1,Cole-Kripke,8/7/2024,10:00 PM,8/8/2024,4:15 AM,8/7/2024,10:09 PM,9,50730,83.73,375,314,52,18,2.89,16.267,22.222,38.489
2,Cole-Kripke,8/8/2024,9:39 PM,8/9/2024,4:20 AM,8/8/2024,9:55 PM,16,41112,84.54,401,339,46,18,2.56,10.723,16.667,27.39
3,Cole-Kripke,8/9/2024,9:49 PM,8/10/2024,5:35 AM,8/9/2024,10:05 PM,16,70375,82.83,466,386,64,19,3.37,16.738,21.053,37.791
4,Cole-Kripke,8/10/2024,10:02 PM,8/11/2024,5:26 AM,8/10/2024,10:29 PM,27,93392,83.56,444,371,46,14,3.29,17.342,7.143,24.485


In [69]:
# Spot-check the lab_id -> boost_id lookup using the file at position 175.
# The lab_id is the text before the first underscore of the file name.
# NOTE(review): test_df was loaded from position 0, not 175, so the ID found
# here does not belong to the data held in test_df — confirm whether this
# mismatch is intentional before trusting any output that combines them.
test_df_labid = sleep_paths['path'].iloc[175].rsplit("/", 1)[-1].split("_", 1)[0]
print(f"lab_id: {test_df_labid}")
match = df_cleaned.loc[df_cleaned['lab_id'].astype(str) == str(test_df_labid)]
if match.empty:
    matched_boost_id = None
else:
    matched_boost_id = match['boost_id'].iloc[0]
matched_boost_id

lab_id: 1315


'7137'

In [70]:

def reshape_sleep_events(df: pd.DataFrame, matched_boost_id) -> pd.DataFrame:
    """
    Collapse a per-event sleep table into a single wide sleeplog row.

    Args:
        df: table with at least the columns 'In Bed Date', 'In Bed Time' and
            'Out Bed Time'; the two time columns must be 12-hour strings such
            as "9:39 PM". The input frame is not modified.
        matched_boost_id: value stored in the 'ID' column of the result.

    Returns:
        One-row DataFrame with columns ID, onset_N1, wakeup_N1, onset_N2,
        wakeup_N2, ... where onset is the in-bed time and wakeup the
        out-of-bed time, both formatted as HH:MM:SS (24-hour clock).

    NOTE(review): N1, N2, ... number the events in chronological order of
    going to bed; they are not aligned to calendar nights, so a missing night
    or an extra sleep period shifts the later columns — confirm this matches
    what the consumer of the sleeplog expects.
    """
    events = df.copy()

    # Convert both clock columns from 12-hour text to 24-hour HH:MM:SS text.
    for time_col in ('In Bed Time', 'Out Bed Time'):
        parsed = pd.to_datetime(events[time_col], format='%I:%M %p')
        events[time_col] = parsed.dt.strftime('%H:%M:%S')

    # Order events by the full in-bed timestamp so numbering is chronological.
    in_bed_stamp = events['In Bed Date'] + ' ' + events['In Bed Time']
    events['In Bed Datetime'] = pd.to_datetime(in_bed_stamp)
    events = events.sort_values('In Bed Datetime').reset_index(drop=True)

    # Spread the ordered events across numbered onset/wakeup columns.
    onsets = events['In Bed Time'].tolist()
    wakeups = events['Out Bed Time'].tolist()
    record = {'ID': matched_boost_id}
    for position in range(len(onsets)):
        night = position + 1
        record[f'onset_N{night}'] = onsets[position]
        record[f'wakeup_N{night}'] = wakeups[position]

    return pd.DataFrame([record])

In [71]:
# Reshape the sample export (test_df) into one wide sleeplog row labelled with
# the boost_id that was looked up in an earlier cell.
wide = reshape_sleep_events(df=test_df, matched_boost_id=matched_boost_id)

In [81]:
# Display the reshaped one-row sleeplog.
wide

Unnamed: 0,ID,onset_N1,wakeup_N1,onset_N2,wakeup_N2,onset_N3,wakeup_N3,onset_N4,wakeup_N4,onset_N5,wakeup_N5,onset_N6,wakeup_N6,onset_N7,wakeup_N7
0,7137,00:00:00,04:25:00,22:00:00,04:15:00,21:39:00,04:20:00,21:49:00,05:35:00,22:02:00,05:26:00,22:32:00,04:20:00,21:45:00,04:20:00


In [84]:
import pandas as pd
import os
from collections import defaultdict

def process_and_save_sleep_data(sleep_paths, df_cleaned, output_path, session):
    """
    Build a sleeplog with one row per participant for a single recording
    session and write it to ``output_path`` as CSV.

    For every file in ``sleep_paths`` the lab_id is taken from the file name
    (text before the first underscore). Files of the same lab_id are counted
    1, 2, 3, ...; only the file whose count equals ``session`` is processed.
    Its lab_id is translated to a boost_id through ``df_cleaned`` and its
    contents are reshaped with ``reshape_sleep_events``.

    Args:
        sleep_paths: DataFrame with a ``path`` column and, optionally, a
            ``date`` column. When ``date`` is present files are visited in
            chronological order so that session numbers follow recording
            order instead of the arbitrary directory-listing order.
        df_cleaned: DataFrame with ``lab_id`` and ``boost_id`` columns; if a
            lab_id occurs more than once its first boost_id is used.
        output_path: destination CSV path (written without the index).
        session: 1-based session number to export.

    Returns:
        None. Progress and per-file problems are printed; a file that fails
        to load or reshape is reported and skipped, not raised.

    Raises:
        KeyError: if ``df_cleaned`` lacks ``lab_id``/``boost_id`` or a
            non-empty ``sleep_paths`` lacks ``path``.
    """
    all_rows = []
    session_counts = defaultdict(int)

    # Session numbers are only meaningful if files are counted oldest-first.
    # mergesort is stable, so files with the same date keep their given order.
    if len(sleep_paths) and 'date' in sleep_paths.columns:
        ordered_paths = sleep_paths.sort_values('date', kind='mergesort')
    else:
        ordered_paths = sleep_paths

    # Build the lab_id -> boost_id lookup once instead of filtering
    # df_cleaned for every file; setdefault keeps the first match per lab_id.
    boost_by_lab = {}
    for lab, boost in zip(df_cleaned['lab_id'].astype(str), df_cleaned['boost_id']):
        boost_by_lab.setdefault(lab, boost)

    file_items = ordered_paths['path'].items() if len(ordered_paths) else ()
    for i, file_path in file_items:
        try:
            lab_id = os.path.basename(file_path).split("_")[0]

            # Count every file of this lab_id, matched or not.
            session_counts[lab_id] += 1
            if session_counts[lab_id] != session:
                continue

            matched_boost_id = boost_by_lab.get(lab_id)
            # Test for "missing" explicitly: NaN is truthy and would be
            # written out as the ID "nan", while a legitimate 0 is falsy.
            has_boost_id = (
                matched_boost_id is not None
                and not pd.isna(matched_boost_id)
                and str(matched_boost_id).strip() != ""
            )
            if not has_boost_id:
                print(f"No match found for lab_id: {lab_id}")
                continue

            df = pd.read_csv(file_path, skiprows=5)
            all_rows.append(reshape_sleep_events(df, f"{matched_boost_id}"))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole export.
            print(f"Error processing index {i}, file {file_path}: {e}")

    if all_rows:
        # Rows may have different numbers of nights; concat aligns the
        # columns and leaves the missing nights empty.
        final_df = pd.concat(all_rows, ignore_index=True)
        final_df.to_csv(output_path, index=False)
        print(f"Saved reshaped sleep data to {output_path}")
    else:
        print("No data matched the specified session.")

In [85]:
# Export the sleeplog for session number 1 of every participant to sleep.csv.
process_and_save_sleep_data(
    sleep_paths,
    df_cleaned,
    output_path="sleep.csv",
    session=1,
)

No match found for lab_id: 1088
No match found for lab_id: 1079
No match found for lab_id: 781
No match found for lab_id: 920
No match found for lab_id: 868
No match found for lab_id: 890
No match found for lab_id: 1135
No match found for lab_id: 931
No match found for lab_id: 902
No match found for lab_id: 205
No match found for lab_id: 901
No match found for lab_id: 1152
No match found for lab_id: 588
No match found for lab_id: 877
No match found for lab_id: 1157
No match found for lab_id: 1153
No match found for lab_id: 926
No match found for lab_id: 1156
No match found for lab_id: 399
No match found for lab_id: 1278
No match found for lab_id: 1281
No match found for lab_id: 944
No match found for lab_id: 1280
No match found for lab_id: 893
No match found for lab_id: 1287
No match found for lab_id: 1284
No match found for lab_id: 1282
No match found for lab_id: 1285
No match found for lab_id: 1274
No match found for lab_id: 1292
No match found for lab_id: 1310
No match found for lab

/Volumes/VossLab/Repositories/Accelerometer_Data/Sleep/1088_8-7-2024_Sleep.csv