In [1]:
import pandas as pd
import numpy as np
import os

In [4]:
# Load the intervention sleep-log CSV into a DataFrame and preview its first rows.
int_sleep = pd.read_csv(
    '/mnt/lss/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test/sleep_log_intervention.csv'
)
int_sleep.head()

Unnamed: 0,ID,D1_date,D1_inbed,D1_wakeup,D2_date,D2_inbed,D2_wakeup,D3_date,D3_inbed,D3_wakeup,...,D7_wakeup,D8_date,D8_inbed,D8_wakeup,D9_date,D9_inbed,D9_wakeup,D10_date,D10_inbed,D10_wakeup
0,sub-8022_ses-1_accel,2025-02-07,19:00:00,08:30:00,2025-02-08,23:45:00,07:30:00,2025-02-09,23:55:00,07:00:00,...,07:00:00,2025-02-15,01:00:00,09:00:00,,,,,,
1,sub-8022_ses-2_accel,2025-04-17,00:00:00,06:30:00,2025-04-18,23:10:00,07:40:00,2025-04-20,02:45:00,07:30:00,...,08:00:00,,,,,,,,,
2,sub-8001_ses-1_accel,2024-09-21,00:30:00,08:30:00,2024-09-22,00:15:00,07:40:00,2024-09-23,05:20:00,09:30:00,...,09:00:00,2024-09-28,00:30:00,08:45:00,2024-09-29,00:30:00,08:15:00,2024-09-30,00:20:00,08:30:00
3,sub-8002_ses-1_accel,2024-09-14,22:15:00,06:45:00,2024-09-15,23:00:00,06:45:00,2024-09-16,22:51:00,06:45:00,...,07:00:00,2024-09-21,22:30:00,06:45:00,2024-09-22,22:45:00,06:45:00,,,
4,sub-8002_ses-2_accel,2024-11-07,23:00:00,03:30:00,2024-11-08,21:30:00,05:30:00,2024-11-09,21:30:00,05:30:00,...,06:15:00,2024-11-14,22:45:00,06:15:00,2024-11-15,22:30:00,07:00:00,,,


In [11]:
import os
import sys
import logging
import pandas as pd
import requests
from datetime import datetime, timedelta
from io import StringIO

class ID_COMPARISONS:
    """Match accelerometer CSV files found on RDSS against the REDCap ID report.

    The REDCap report supplies ``boost_id`` / ``lab_id`` pairs; file names in the
    RDSS directory supply a lab ID and a recording date.  ``compare_ids`` joins the
    two and reports both the normal matches and the files that belong to report
    rows which appear more than once.
    """

    # REDCap API endpoint and the report that holds the boost_id/lab_id pairs.
    REDCAP_URL = 'https://redcap.icts.uiowa.edu/redcap/api/'
    REPORT_ID = 43327
    # Directory listed by `_rdss_file_list` when no explicit `rdss_dir` is given.
    RDSS_DIR = '/mnt/rdss/VossLab/Repositories/Accelerometer_Data'
    # Files dated before this day are ignored when `daysago` is not set.
    DEFAULT_CUTOFF = '2024-08-05'
    # Seconds to wait for REDCap before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 60

    def __init__(self, token, daysago=None) -> None:
        """Store the REDCap API token and the optional look-back window.

        Args:
            token: REDCap API token used by `_return_report`.
            daysago: if truthy, only files dated within the last `daysago`
                days are considered; otherwise `DEFAULT_CUTOFF` is used.
        """
        self.token = token
        # Absolute path: the leading '/' was missing, which made this a path
        # relative to the current working directory (unlike OBS_DIR).
        self.INT_DIR = '/mnt/lss/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test'
        self.OBS_DIR = '/mnt/lss/Projects/BOOST/ObservationalStudy/3-experiment/data/act-obs-test'
        self.daysago = daysago
        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    @staticmethod
    def _normalize_id(value):
        """Return `value` as the string form used for ID comparison.

        Integral floats (e.g. ``1023.0``, which pandas produces when a numeric
        column contains missing values or when `iterrows` upcasts a row) are
        rendered without the trailing ``.0`` so they compare equal to the IDs
        parsed from file names.  Everything else is ``str(value)`` stripped of
        surrounding whitespace.
        """
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value).strip()

    def compare_ids(self):
        """
        Pulls all files from RDSS
        Pulls the list from RedCap
        Compares IDs and returns a dictionary with two keys:
          - 'matches': normal matches mapping boost_id to a list of dicts (filename, labID, date)
          - 'duplicates': a list of dictionaries each with lab_id, boost_id, filenames (list), and dates (list)
        """
        # Retrieve the RedCap report and duplicates from report
        report, report_duplicates = self._return_report()
        # Retrieve the full RDSS file list and duplicate files merged with duplicates from report
        rdss, file_duplicates = self._rdss_file_list(report_duplicates, self.daysago)

        # Normal (non-duplicate) matches, keyed by boost_id.
        matches = {}
        for _, row in report.iterrows():
            # Normalise both IDs: iterrows may hand back floats such as 1023.0,
            # which would never equal the '1023' parsed from a file name.
            boost_id = self._normalize_id(row['boost_id'])
            lab_id = self._normalize_id(row['lab_id'])

            rdss_matches = rdss[rdss['ID'] == lab_id]
            if rdss_matches.empty:
                continue
            matches.setdefault(boost_id, []).extend(
                {'filename': filename, 'labID': lab_id, 'date': date}
                for filename, date in zip(rdss_matches['filename'], rdss_matches['Date'])
            )

        # Each (lab_id, boost_id) group represents one duplicated report entry.
        duplicates_list = []
        if file_duplicates.empty:
            logging.info("Found no duplicates.")
        else:
            for (lab_id, boost_id), group in file_duplicates.groupby(['lab_id', 'boost_id']):
                duplicates_list.append({
                    'lab_id': lab_id,
                    'boost_id': boost_id,
                    'filenames': group['filename'].tolist(),
                    'dates': group['Date'].tolist()
                })

        return {'matches': matches, 'duplicates': duplicates_list}

    def _return_report(self):
        """
        Pulls the ID report through the REDCap API and reads it as a dataframe.
        Checks for boost_ids that are associated with multiple lab_ids, logs a critical error,
        and removes these rows from the dataframe.
        Separates duplicate rows (rows identical in every column) from the cleaned data.

        Returns:
            df_cleaned: dataframe with duplicates removed and problematic boost_ids excluded
            duplicate_rows: dataframe of duplicate rows

        Raises:
            SystemExit: if REDCap answers with a non-200 status code.
            requests.exceptions.RequestException: on connection failure or when
                the request exceeds `REQUEST_TIMEOUT` seconds.
        """
        payload = {
            'token': self.token,
            'content': 'report',
            'report_id': self.REPORT_ID,
            'format': 'csv'
        }
        # Without a timeout, requests waits indefinitely on an unresponsive server.
        r = requests.post(self.REDCAP_URL, data=payload, timeout=self.REQUEST_TIMEOUT)
        if r.status_code != 200:
            logging.error(f"error! status code is {r.status_code}")
            sys.exit(1)

        df = pd.read_csv(StringIO(r.text))

        # identify boost_ids associated with multiple lab_ids.
        boost_id_counts = df.groupby('boost_id')['lab_id'].nunique()
        problematic_boost_ids = boost_id_counts[boost_id_counts > 1].index.tolist()

        if problematic_boost_ids:
            logging.critical(f"found boost_id(s) with multiple lab_ids: {', '.join(map(str, problematic_boost_ids))}. "
                            "these entries will be removed from processing.")
            df = df[~df['boost_id'].isin(problematic_boost_ids)]

        # keep=False: every copy of a repeated row goes to duplicate_rows and
        # none of them stays in the cleaned frame.
        duplicate_rows = df[df.duplicated(keep=False)]
        df_cleaned = df.drop_duplicates(keep=False)

        if not duplicate_rows.empty:
            logging.info(f"duplicate rows found:\n{duplicate_rows}")

        return df_cleaned, duplicate_rows

    def _rdss_file_list(self, duplicates, daysago=None, rdss_dir=None):
        """
        Extracts the text before the first space (the lab ID) and the date in
        parentheses from every filename ending with .csv in `rdss_dir`, and
        stores them in a dataframe with columns 'ID', 'Date' and 'filename'.

        Also merges the file list with duplicate report entries based on lab_id.

        Args:
            duplicates: dataframe of duplicated report rows (needs a 'lab_id'
                column unless it is empty).
            daysago: if truthy, keep only files dated within the last `daysago`
                days; otherwise keep files dated on or after `DEFAULT_CUTOFF`.
            rdss_dir: directory to list; defaults to `RDSS_DIR`.

        Returns:
            df: DataFrame of all file entries that pass the date filter
            merged_df: DataFrame of file entries that match duplicate lab_ids from the report
                (an empty DataFrame when there are no duplicates)
        """
        if rdss_dir is None:
            rdss_dir = self.RDSS_DIR

        extracted_data = []
        # Sorted so the resulting row order does not depend on the filesystem.
        for filename in sorted(os.listdir(rdss_dir)):
            if not filename.endswith('.csv'):
                continue
            try:
                base_name = filename.split(' ')[0]  # Extract lab_id
                date_part = filename.split('(')[1].split(')')[0]  # Extract date
            except IndexError:
                logging.warning(f"Skipping file with unexpected format: {filename}")
                continue
            extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})

        # Explicit columns keep the schema intact even when no file was found,
        # so callers can index df['ID'] without a KeyError.
        df = pd.DataFrame(extracted_data, columns=['ID', 'Date', 'filename'])
        # Unparseable dates become NaT and are dropped by the comparison below.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

        if daysago:
            cutoff_date = datetime.today() - timedelta(days=daysago)
            df = df[df['Date'] >= cutoff_date]  # Filter files within the last `daysago` days
        else:
            df = df[df['Date'] >= self.DEFAULT_CUTOFF]  # Filter out rows before the threshold date

        if duplicates.empty:
            return df, pd.DataFrame()

        # Collapse identical report rows so each file is listed once per entry,
        # and compare lab_ids as strings: IDs parsed from file names are strings,
        # while the report column may have been read as int or float.
        report_entries = duplicates.drop_duplicates()
        report_entries = report_entries.assign(
            lab_id=report_entries['lab_id'].map(self._normalize_id)
        )
        # The inner merge keeps only files whose ID appears among the duplicates
        # and brings in the boost_id information from the report.
        merged_df = df.merge(report_entries, left_on='ID', right_on='lab_id')

        return df, merged_df


In [18]:
# The REDCap API token is a credential and must not be stored in the notebook.
# Read it from the environment instead; the token that was previously written
# here should be revoked and regenerated in REDCap.
token = os.environ.get("REDCAP_API_TOKEN")
if not token:
    raise RuntimeError("Set the REDCAP_API_TOKEN environment variable before running this cell.")

# NOTE(review): `id` shadows the Python builtin; the name is kept only in case
# other cells refer to it — consider renaming it.
id = ID_COMPARISONS(token=token, daysago=400)
results = id.compare_ids()

2025-07-29 10:33:46,042 - CRITICAL - found boost_id(s) with multiple lab_ids: 7023. these entries will be removed from processing.
2025-07-29 10:33:46,616 - INFO - Found no duplicates.


In [24]:
# Display the dictionary returned by compare_ids(): its 'matches' and 'duplicates' entries.
results


{'matches': {'8022': [{'filename': '1023 (2025-02-07)RAW.csv',
    'labID': '1023',
    'date': Timestamp('2025-02-07 00:00:00')},
   {'filename': '1023 (2025-04-17)RAW.csv',
    'labID': '1023',
    'date': Timestamp('2025-04-17 00:00:00')}],
  '7062': [{'filename': '1043 (2025-02-22)RAW.csv',
    'labID': '1043',
    'date': Timestamp('2025-02-22 00:00:00')}],
  '7146': [{'filename': '1051 (2025-05-05)RAW.csv',
    'labID': '1051',
    'date': Timestamp('2025-05-05 00:00:00')}],
  '6011': [{'filename': '1093 (2024-08-09)RAW.csv',
    'labID': '1093',
    'date': Timestamp('2024-08-09 00:00:00')}],
  '6013': [{'filename': '1098 (2024-08-14)RAW.csv',
    'labID': '1098',
    'date': Timestamp('2024-08-14 00:00:00')}],
  '6022, 7143': [{'filename': '1108 (2025-05-08)RAW.csv',
    'labID': '1108',
    'date': Timestamp('2025-05-08 00:00:00')}],
  '7001': [{'filename': '1111 (2024-09-05)RAW.csv',
    'labID': '1111',
    'date': Timestamp('2024-09-05 00:00:00')}],
  '7002': [{'filename': 