In [47]:
import os
import sys
import logging
import pandas as pd
import requests
from datetime import datetime, timedelta
from io import StringIO

class ID_COMPARISONS:
    """
    Cross-references accelerometer CSV files found in the RDSS folder with the
    lab_id / boost_id report pulled from REDCap.

    Parameters
    ----------
    daysago : int, optional
        When truthy, only files dated within the last `daysago` days are kept.
        A falsy value (None or 0) falls back to the fixed DEFAULT_START_DATE.
    token : str, optional
        REDCap API token. When omitted it is read from the environment variable
        named by TOKEN_ENV_VAR. The token is deliberately NOT stored in the
        source: a token that was ever committed in a notebook should be treated
        as compromised and rotated.
    rdss_dir : str, optional
        Folder that is scanned for '<lab_id> (<date>)....csv' files. Defaults to
        DEFAULT_RDSS_DIR.
    """

    REDCAP_URL = 'https://redcap.icts.uiowa.edu/redcap/api/'
    REDCAP_REPORT_ID = 43327
    TOKEN_ENV_VAR = 'REDCAP_API_TOKEN'
    DEFAULT_RDSS_DIR = '/mnt/rdss/VossLab/Repositories/Accelerometer_Data'
    DEFAULT_START_DATE = '2024-08-05'  # files dated before this are ignored when daysago is falsy
    REQUEST_TIMEOUT = 60  # seconds; prevents the notebook from hanging on a stalled connection

    def __init__(self, daysago=None, token=None, rdss_dir=None) -> None:
        # Credentials come from the caller or the environment, never from source.
        self.token = token or os.environ.get(self.TOKEN_ENV_VAR)
        self.INT_DIR = '/Volumes/vosslabhpc/Projects/BOOST/InterventionStudy/3-experiment/data/act-int-test'
        self.OBS_DIR = '/Volumes/vosslabhpc/Projects/BOOST/ObservationalStudy/3-experiment/data/act-obs-test'
        self.daysago = daysago
        self.rdss_dir = rdss_dir or self.DEFAULT_RDSS_DIR
        logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

    def compare_ids(self):
        """
        Pulls the REDCap report and the RDSS file list, then matches them on lab ID.

        Returns:
            A tuple ``(comparison, report)`` where ``comparison`` is a dictionary with two keys:
              - 'matches': maps boost_id (str) to a list of dicts (filename, labID, date)
              - 'duplicates': a list of dictionaries each with lab_id, boost_id,
                filenames (list), and dates (list)
            and ``report`` is the cleaned REDCap report DataFrame.
        """
        # Retrieve the RedCap report and duplicates from report
        report, report_duplicates = self._return_report()
        # Retrieve the full RDSS file list and duplicate files merged with duplicates from report
        rdss, file_duplicates = self._rdss_file_list(report_duplicates, self.daysago)

        # Index the files by lab ID once instead of re-filtering the frame for every report row.
        files_by_lab_id = {lab_id: files for lab_id, files in rdss.groupby('ID', sort=False)}

        # Cast the ID columns as a whole: iterating rows of a mixed-dtype frame can
        # upcast integers to floats, which would turn '1023' into '1023.0'.
        boost_ids = report['boost_id'].astype(str)
        lab_ids = report['lab_id'].astype(str)

        result = {}
        for boost_id, lab_id in zip(boost_ids, lab_ids):
            files = files_by_lab_id.get(lab_id)
            if files is None:
                continue
            entries = result.setdefault(boost_id, [])
            for filename, date in zip(files['filename'], files['Date']):
                entries.append({'filename': filename, 'labID': lab_id, 'date': date})

        # Process duplicates into the desired structure.
        duplicates_dict = []
        if not file_duplicates.empty:
            # Group by lab_id and boost_id; each group represents one duplicate combination.
            for (lab_id, boost_id), group in file_duplicates.groupby(['lab_id', 'boost_id']):
                duplicates_dict.append({
                    'lab_id': lab_id,
                    'boost_id': boost_id,
                    'filenames': group['filename'].tolist(),
                    'dates': group['Date'].tolist()
                })
        else:
            logging.info("Found no duplicates.")

        return {'matches': result, 'duplicates': duplicates_dict}, report

    def _return_report(self):
        """
        Pulls the ID report from REDCap via its API and reads it as a dataframe.
        Checks for boost_ids that are associated with multiple lab_ids, logs a critical error,
        and removes these rows from the dataframe.
        Separates duplicate rows (rows identical in every column) from the cleaned data.

        Returns:
            df_cleaned: dataframe with duplicates removed and problematic boost_ids excluded
            duplicate_rows: dataframe of duplicate rows (every occurrence is kept)

        Raises:
            ValueError: if no API token was supplied and none is set in the environment.
            SystemExit: if REDCap answers with a status code other than 200.
        """
        if not self.token:
            raise ValueError(
                "No REDCap API token available: pass token=... or set the "
                f"{self.TOKEN_ENV_VAR} environment variable."
            )

        data = {
            'token': self.token,
            'content': 'report',
            'report_id': self.REDCAP_REPORT_ID,
            'format': 'csv'
        }
        r = requests.post(self.REDCAP_URL, data=data, timeout=self.REQUEST_TIMEOUT)
        if r.status_code != 200:
            logging.error(f"error! status code is {r.status_code}")
            # Kept as sys.exit for compatibility with existing callers.
            sys.exit(1)

        df = pd.read_csv(StringIO(r.text))

        # identify boost_ids associated with multiple lab_ids.
        boost_id_counts = df.groupby('boost_id')['lab_id'].nunique()
        problematic_boost_ids = boost_id_counts[boost_id_counts > 1].index.tolist()

        if problematic_boost_ids:
            logging.critical(f"found boost_id(s) with multiple lab_ids: {', '.join(map(str, problematic_boost_ids))}. "
                             "these entries will be removed from processing.")
            df = df[~df['boost_id'].isin(problematic_boost_ids)]

        # identify and separate duplicate rows based on any column.
        duplicate_rows = df[df.duplicated(keep=False)]
        df_cleaned = df.drop_duplicates(keep=False)

        if not duplicate_rows.empty:
            logging.info(f"duplicate rows found:\n{duplicate_rows}")

        return df_cleaned, duplicate_rows

    def _rdss_file_list(self, duplicates, daysago=None):
        """
        Extracts the text before the first space (the lab ID) and the date in parentheses
        from every filename ending with .csv in the RDSS folder and stores them in a dataframe.

        Also merges the file list with the duplicate report entries based on lab_id.

        Args:
            duplicates: DataFrame of duplicate report rows with 'lab_id' and 'boost_id' columns.
            daysago: when truthy, keep only files dated within the last `daysago` days;
                otherwise keep files dated on or after DEFAULT_START_DATE.

        Returns:
            df: DataFrame of all file entries (columns ID, Date, filename)
            merged_df: DataFrame of file entries that match duplicate lab_ids from the report
        """
        extracted_data = []

        for filename in os.listdir(self.rdss_dir):
            if not filename.endswith('.csv'):
                continue
            try:
                base_name = filename.split(' ')[0]  # Extract lab_id
                date_part = filename.split('(')[1].split(')')[0]  # Extract date
            except IndexError:
                logging.warning(f"Skipping file with unexpected format: {filename}")
                continue
            extracted_data.append({'ID': base_name, 'Date': date_part, 'filename': filename})

        # Explicit columns keep the frame usable (df['ID'], df['Date']) even when no file matched.
        df = pd.DataFrame(extracted_data, columns=['ID', 'Date', 'filename'])
        # Unparseable dates become NaT and are dropped by the comparisons below.
        df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

        if daysago:
            cutoff_date = datetime.today() - timedelta(days=daysago)
            df = df[df['Date'] >= cutoff_date]  # Filter files within the last `daysago` days
        else:
            df = df[df['Date'] >= self.DEFAULT_START_DATE]  # Filter out rows before the threshold date

        if not duplicates.empty:
            # File IDs are strings taken from filenames, while lab_id read from the CSV report
            # is typically numeric; without this cast the two never compare equal.
            # drop_duplicates() collapses the repeated report rows so each file is listed once.
            duplicate_lookup = (
                duplicates
                .assign(lab_id=duplicates['lab_id'].astype(str))
                .drop_duplicates()
            )
            # Inner merge keeps only files whose ID appears among the duplicate lab_ids
            # and brings in the boost_id information from the report.
            merged_df = df.merge(duplicate_lookup, left_on='ID', right_on='lab_id')
        else:
            merged_df = pd.DataFrame()

        return df, merged_df


# REPORT EXAMPLE
"""  lab_id  boost_id
0     1023      8022
1     1043      7062
2     1093      6011
3     1097      6012
4     1098      6013
..     ...       ...
90    1192      7058
91    1193      7059
"""


# RDSS EXAMPLE
"""        ID        Date                  filename
0     1005  2022-05-10  1005 (2022-05-10)RAW.csv
1     1023  2022-04-30  1023 (2022-04-30)RAW.csv
2     1027  2022-04-26  1027 (2022-04-26)RAW.csv
3     1016  2022-03-23  1016 (2022-03-23)RAW.csv
4      994  2022-05-13   994 (2022-05-13)RAW.csv
...    ...         ...                       ...
1152   584  2018-09-15   584 (2018-09-15)RAW.csv
1153   584  2018-10-16   584 (2018-10-16)RAW.csv"""

# NOTE: do not paste API tokens into this notebook; load them from the environment instead.





'        ID        Date                  filename\n0     1005  2022-05-10  1005 (2022-05-10)RAW.csv\n1     1023  2022-04-30  1023 (2022-04-30)RAW.csv\n2     1027  2022-04-26  1027 (2022-04-26)RAW.csv\n3     1016  2022-03-23  1016 (2022-03-23)RAW.csv\n4      994  2022-05-13   994 (2022-05-13)RAW.csv\n...    ...         ...                       ...\n1152   584  2018-09-15   584 (2018-09-15)RAW.csv\n1153   584  2018-10-16   584 (2018-10-16)RAW.csv'

In [65]:
# Build the comparison helper; daysago=499 restricts the RDSS file list to
# files dated within the last 499 days.
I = ID_COMPARISONS(daysago=499)

In [66]:
# compare_ids() returns (result dict, cleaned REDCap report). Note that `matches`
# holds the whole result dict with both the 'matches' and 'duplicates' keys.
matches, report = I.compare_ids()

2025-08-01 10:37:35,204 - CRITICAL - found boost_id(s) with multiple lab_ids: 7023. these entries will be removed from processing.
2025-08-01 10:37:49,049 - INFO - Found no duplicates.


In [67]:
# Number of boost_ids that have at least one matching RDSS file.
len(matches['matches'])

174

In [68]:
# Count the matched boost_ids that begin with '8'
# (presumably the intervention-study IDs — TODO confirm).
num_subjects = len([sid for sid in matches['matches'] if sid.startswith('8')])
print(num_subjects)


40


In [69]:
# Load the table stored in ./int.csv.
# NOTE(review): the name `int` shadows Python's builtin `int` for the rest of the
# kernel session, so any later use of `int` as a type (e.g. `.astype(int)`)
# receives this DataFrame instead. Renaming requires updating the cells that display it.
int = pd.read_csv('./int.csv')

In [70]:
# Display the table loaded from ./int.csv.
int

Unnamed: 0.1,Unnamed: 0,sub-8024,sub-8016,sub-8037,sub-8005,sub-8008,sub-8011,sub-8023,sub-8015,sub-8006,sub-8018,sub-8003,sub-8036,sub-8004,sub-8019,sub-8021,sub-8035,sub-8026,sub-8014
0,sessions,1,2,1,2,2,2,1,1,2,2,2,1,2,2,1,1,2,1
1,total_days,9,18,9,18,16,18,9,9,18,18,15,9,12,18,8,9,18,9
2,weekdays,5,14,6,14,13,12,7,7,11,14,9,7,6,14,7,7,13,7
3,weekends,4,4,3,4,3,6,2,2,7,4,6,2,6,4,1,2,5,2


In [71]:
# (rows, columns) of the table loaded from ./int.csv.
int.shape

(4, 19)

In [72]:
# Display the cleaned REDCap report returned by compare_ids().
report

Unnamed: 0,lab_id,boost_id
0,1023,8022
1,1040,7183
2,1043,7062
3,1051,7146
4,1093,6011
...,...,...
266,1402,7205
267,1403,7206
268,1404,7207
269,1405,7208


In [78]:
# Keep only the report rows whose boost_id starts with '8', then pull out that column.
# Note: this rebinds `report` to the filtered frame.
report = report.loc[lambda frame: frame['boost_id'].astype(str).str.startswith('8')]
boost_ids = report.loc[:, 'boost_id']
boost_ids

0      8022
29     8001
30     8002
31     8003
32     8004
33     8007
34     8006
39     8005
50     8008
53     8009
54     8010
55     8011
58     8012
59     8013
60     8014
62     8015
63     8016
65     8017
67     8018
68     8019
69     8020
70     8021
84     8023
85     8024
87     8025
93     8026
97     8028
98     8029
99     8030
100    8031
117    8032
138    8033
139    8034
140    8035
162    8036
165    8037
174    8038
180    8039
182    8043
183    8040
184    8045
185    8041
186    8042
196    8046
201    8047
202    8048
206    8050
208    8051
210    8052
211    8053
212    8054
234    8055
237    8056
238    8057
239    8058
240    8059
241    8060
259    8061
Name: boost_id, dtype: object

In [74]:
# Matched boost_ids (dict keys of the 'matches' entry) that begin with '8',
# in the order they were inserted.
int_matches = matches['matches']

subject_keys = list(filter(lambda key: key.startswith('8'), int_matches))
subject_keys

['8022',
 '8001',
 '8002',
 '8003',
 '8004',
 '8006',
 '8005',
 '8008',
 '8011',
 '8012',
 '8014',
 '8015',
 '8016',
 '8017',
 '8018',
 '8019',
 '8020',
 '8021',
 '8023',
 '8024',
 '8026',
 '8030',
 '8032',
 '8035',
 '8036',
 '8037',
 '8038',
 '8039',
 '8040',
 '8042',
 '8046',
 '8047',
 '8048',
 '8050',
 '8051',
 '8052',
 '8053',
 '8054',
 '8056',
 '8060']

In [79]:
# 1) Take the boost_id values to compare against (`boost_ids` is a pandas Series).
col1 = boost_ids

# 2) Inspect its dtype and contents
print("Column 1 dtype:", col1.dtype)
print("First 10 values:", col1.head(10).tolist())

# 3) Normalize to strings (and strip whitespace)
#    A float dtype (e.g. 6022.0) is cast to an integer dtype first so the text
#    becomes '6022' rather than '6022.0'. The dtype is spelled 'int64' instead of
#    the builtin `int` because an earlier cell rebinds the name `int` to a
#    DataFrame, which would make `.astype(int)` fail.
if pd.api.types.is_float_dtype(col1):
    col1 = col1.dropna().astype('int64').astype(str)
else:
    col1 = col1.astype(str)

df_ids = [s.strip() for s in col1.tolist()]

# 4) Also strip your subject_keys
subject_keys = [s.strip() for s in subject_keys]

# 5) Now do a set difference (sorted so the printed order is stable)
missing = sorted(set(subject_keys) - set(df_ids))
print(f"IDs in subject_keys but not in df col1 ({len(missing)}):", missing)


Column 1 dtype: object
First 10 values: ['8022', '8001', '8002', '8003', '8004', '8007', '8006', '8005', '8008', '8009']
IDs in subject_keys but not in df col1 (0): []


In [80]:
# 1) Normalize both to sets of strings
boost_set   = set(boost_ids.astype(str))
subject_set = set(subject_keys)

# Set iteration order for strings can change between kernel runs (hash
# randomization), so the differences are sorted to keep the output reproducible.

# 2a) IDs in the list but NOT in the Series
missing_in_series = sorted(subject_set - boost_set)
print("In list only:   ", missing_in_series)

# 2b) IDs in the Series but NOT in the list
missing_in_list   = sorted(boost_set - subject_set)
print("In series only: ", missing_in_list)


In list only:    []
In series only:  ['8007', '8031', '8055', '8025', '8041', '8013', '8029', '8058', '8010', '8033', '8043', '8034', '8045', '8028', '8057', '8009', '8059', '8061']
