In [72]:
import pandas as pd
import numpy as np
import ast

In [None]:
# Read the table of 7-day windows from CSV and preview its first rows.
interval_df = pd.read_csv(filepath_or_buffer="~/Downloads/sherlock/valid_7day_windows.csv")
interval_df.head()

In [75]:
import pandas as pd
import numpy as np
import ast
import re # Import regular expressions module


# Updated function to handle standard lists and space-separated number lists
def parse_string_list_safely(item):
    """Turn a raw cell value into a Python list.

    Accepted inputs:
      * a ``list`` -- returned unchanged;
      * a string holding a Python list literal, e.g. ``"[1, 2]"``;
      * a string holding whitespace-separated numbers in brackets, e.g.
        ``"[1. 2. 3.]"`` -- parsed into a list of floats.

    Missing scalars (``None``/``NaN``/``pd.NA``), empty or blank strings and
    anything that cannot be parsed yield ``np.nan``; unparseable or
    unexpectedly typed input additionally prints a warning.

    Args:
        item: The raw cell value.

    Returns:
        A list, or ``np.nan`` when no list could be obtained.
    """
    # Lists must be recognised BEFORE any missing-value test: pd.isna() on a
    # list returns an array, and using that array in an `if` raises
    # "truth value of an array is ambiguous".
    if isinstance(item, list):
        return item

    if isinstance(item, str):
        original_item_repr = repr(item)  # untouched text, used in warnings
        item = item.strip()
        if not item:
            return np.nan  # empty / whitespace-only string
        looks_like_list = item.startswith('[') and item.endswith(']')

        # Attempt 1: standard Python literal parsing.
        try:
            evaluated = ast.literal_eval(item)
        except (ValueError, SyntaxError, TypeError):
            pass  # not a valid literal; try the space-separated format below
        else:
            if isinstance(evaluated, list):
                return evaluated
            if looks_like_list:
                # Bracketed text that evaluated to a single value: wrap it.
                return [evaluated]
            print(f"Warning: Evaluated item {original_item_repr} is not a list, treating as NaN.")
            return np.nan

        # Attempt 2: whitespace-separated numbers within brackets.
        if looks_like_list:
            content = item[1:-1].strip()
            if not content:
                return []  # '[]' or '[   ]'
            try:
                # str.split() with no argument splits on runs of whitespace
                # and never yields empty tokens.
                return [float(num_str) for num_str in content.split()]
            except ValueError:
                print(f"Warning: Could not parse content of {original_item_repr} as space-separated numbers, treating as NaN.")
                return np.nan

        # Neither parsing method applied.
        print(f"Warning: Could not parse string {original_item_repr} as any known list format, treating as NaN.")
        return np.nan

    # Only scalars are tested for missingness, so tuples/arrays fall through
    # to the warning below instead of raising inside pd.isna().
    if pd.api.types.is_scalar(item) and pd.isna(item):
        return np.nan

    print(f"Warning: Unexpected type {type(item)} for item '{item}', treating as NaN.")
    return np.nan

In [None]:
# Read the combined survey export, parse the timestamps, turn the
# string-encoded `heart_disease` cells into lists and give every list
# element its own row.
label_df = pd.read_csv("~/Downloads/sherlock/combined_mhc_data.csv")
label_df["createdOn"] = pd.to_datetime(label_df["createdOn"], format='ISO8601')
label_df["heart_disease"] = label_df["heart_disease"].apply(parse_string_list_safely)
label_df = label_df.explode('heart_disease')
label_df.head()

In [82]:
# Columns of label_df that are used as labels in the cells below.
labels = ['sleep_diagnosis1', 'happiness', 'heart_disease', 'feel_worthwhile1',
          'feel_worthwhile2', 'feel_worthwhile3', 'Diabetes', 'Hypertension']

# Pass every present value through int() and keep missing values as NaN.
# pd.isna is used instead of np.isnan because np.isnan raises TypeError on
# None / pd.NA and other non-float missing markers, while pd.isna accepts
# any scalar.
for label in labels:
    label_df[label] = label_df[label].apply(lambda x: np.nan if pd.isna(x) else int(x))

In [121]:
# For every healthCode, attach to each of its interval rows the label entry
# whose date is closest to that interval (as decided by find_closest_dates).
# NOTE: find_closest_dates is defined in a later cell of this notebook; that
# cell has to be executed before this one.
hcs = list(interval_df.healthCode.unique())

global_records = []
for hc in hcs:
    i_df = interval_df[interval_df.healthCode == hc]
    if i_df.empty:
        continue

    # One plain dict per interval row; label entries are added to these.
    records = i_df.to_dict('records')

    # Filter the label table by healthCode once instead of once per label.
    hc_label_df = label_df[label_df.healthCode == hc]

    for label in labels:
        l_df = hc_label_df[hc_label_df[label].notna()]
        if l_df.empty:
            continue
        dates = l_df.createdOn.dt.date.astype(str)
        # Maps interval string -> positional index into l_df.
        matches = find_closest_dates(dates, i_df.time_range)
        for record in records:
            # Look the match up by the record's own interval. Pairing
            # records with the dict's keys by position misaligns as soon as
            # two rows share a time_range, because duplicate intervals
            # collapse into a single dict key.
            matched_row = l_df.iloc[matches[record['time_range']]]
            record[label] = {
                'label_value': matched_row[label],
                'label_date': matched_row['createdOn'],
            }
    global_records.extend(records)

In [None]:
import json
import os
from pathlib import Path

# Convert numpy values to Python native types for JSON serialization
def convert_numpy_to_native(obj):
    """Recursively replace numpy values with built-in Python equivalents.

    Dicts, lists and tuples are traversed (keys are left untouched) and
    rebuilt with converted values. numpy arrays become (nested) lists, and
    any other object exposing ``.item()`` (numpy scalars in particular) is
    replaced by the result of that call. Everything else is returned as is.

    Args:
        obj: Arbitrary, possibly nested, value.

    Returns:
        The converted structure; containers keep their type
        (dict / list / tuple).
    """
    if isinstance(obj, dict):
        return {k: convert_numpy_to_native(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [convert_numpy_to_native(element) for element in obj]
    if isinstance(obj, tuple):
        # Tuples used to be returned untouched, leaving numpy scalars inside.
        return tuple(convert_numpy_to_native(element) for element in obj)
    if isinstance(obj, np.ndarray):
        # ndarray also has .item(), but it raises ValueError for any array
        # that does not hold exactly one element, so arrays are handled
        # explicitly before the generic .item() check.
        return convert_numpy_to_native(obj.tolist())
    if hasattr(obj, 'item'):  # numpy scalar (or similar single-value wrapper)
        return obj.item()
    return obj

# Strip numpy types out of the records so the json module can encode them.
serializable_records = convert_numpy_to_native(global_records)

# Expand "~" to the current user's home directory.
output_path = os.path.expanduser("~/Downloads/global_records.json")

# default=str: any value json cannot encode natively is written as str(value).
with open(output_path, 'w') as f:
    json.dump(obj=serializable_records, fp=f, indent=2, default=str)

# Show the first record (or a notice when there is none) as confirmation.
"No records found" if not global_records else global_records[0]

In [None]:
# Create a denormalized pandas dataframe from global_records
import pandas as pd

# Flatten each record: a value that is a dict containing 'label_value' is
# split into two columns, "<key>_value" and "<key>_date"; every other value
# is copied over under its original key.
flattened_records = []
for record in global_records:
    flat_record = {}
    for key, value in record.items():
        is_label_entry = isinstance(value, dict) and 'label_value' in value
        if not is_label_entry:
            flat_record[key] = value
            continue
        flat_record[f"{key}_value"] = value['label_value']
        flat_record[f"{key}_date"] = value['label_date']
    flattened_records.append(flat_record)

# One row per flattened record.
denormalized_df = pd.DataFrame(flattened_records)

# Preview the result.
denormalized_df.head()

In [None]:
# Persist the flattened table as a parquet file.
denormalized_df.to_parquet(path="~/Downloads/global_records.parquet")

In [None]:
# Print the value counts of every column listed in `labels`, skipping the
# non-categorical columns should they ever appear in that list.
for column in labels:
    if column not in ('healthCode', 'createdOn', 'file_uris'):
        print(label_df[column].value_counts())

In [18]:
from datetime import datetime
import bisect

def find_closest_dates(dates_list, intervals_list):
    """
    Map every interval to the index of the date nearest to its midpoint.

    Distance is measured from the midpoint between the interval's start and
    end. When two dates are equally far from the midpoint, the earlier one
    is chosen.

    Args:
        dates_list: Iterable of date strings in format 'YYYY-MM-DD'
        intervals_list: Iterable of interval strings in format
            'YYYY-MM-DD_YYYY-MM-DD'

    Returns:
        Dictionary keyed by interval string whose values are positions into
        dates_list (in its original order). Identical interval strings share
        a single key.
    """
    date_format = "%Y-%m-%d"

    # Parse every date up front, remembering where it sat in the input.
    parsed = [datetime.strptime(text, date_format) for text in dates_list]

    # Chronological order (stable for equal dates) enables binary search.
    chronological = sorted(enumerate(parsed), key=lambda pair: pair[1])
    timeline = [moment for _, moment in chronological]
    positions = [position for position, _ in chronological]
    last = len(timeline) - 1

    closest_by_interval = {}
    for interval in intervals_list:
        start_text, end_text = interval.split('_')
        window_start = datetime.strptime(start_text, date_format)
        window_end = datetime.strptime(end_text, date_format)
        center = window_start + (window_end - window_start) / 2

        # First slot whose date is not earlier than the midpoint.
        slot = bisect.bisect_left(timeline, center)
        if slot == 0:
            # Midpoint is not after any date: the earliest date wins.
            pick = 0
        elif slot > last:
            # Midpoint lies after every date: the latest date wins.
            pick = last
        else:
            gap_before = abs(timeline[slot - 1] - center)
            gap_after = abs(timeline[slot] - center)
            # Strict comparison: a tie keeps the earlier neighbour.
            pick = slot if gap_after < gap_before else slot - 1

        closest_by_interval[interval] = positions[pick]

    return closest_by_interval

In [144]:
# Load the two exported record tables that are compared in the next cell.
df0, df1 = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "~/Downloads/global_records.parquet",
        "~/Downloads/global_records_1.parquet",
    )
)

In [None]:
# Compare the two dataframes
print("Shape comparison:")
print(f"df0 shape: {df0.shape}")
print(f"df1 shape: {df1.shape}")

# Check if they have the same columns
print("\nColumn comparison:")
df0_cols = set(df0.columns)
df1_cols = set(df1.columns)
print(f"Columns only in df0: {df0_cols - df1_cols}")
print(f"Columns only in df1: {df1_cols - df0_cols}")
print(f"Common columns: {len(df0_cols.intersection(df1_cols))}")

# Compare values for common columns.
# The shared columns are taken in df0's column order: iterating a set gives
# an order that can change between runs, which made "the first 5 columns"
# (and "the first numeric column" below) non-reproducible.
common_cols = [col for col in df0.columns if col in df1_cols]
print("\nValue comparison for common columns:")
for col in common_cols[:5]:  # Limit to first 5 columns to avoid excessive output
    if df0[col].equals(df1[col]):
        print(f"Column '{col}' has identical values")
    else:
        print(f"Column '{col}' has differences")
        # Restrict both columns to the index labels they share; comparing
        # Series with different labels/lengths with == raises ValueError.
        left, right = df0[col].align(df1[col], join='inner')
        # Two missing values count as equal (as .equals does); a plain ==
        # would report every NaN/NaN pair as a difference.
        same = (left == right).fillna(False).astype(bool) | (left.isna() & right.isna())
        mask = ~same
        if mask.any():
            # Show a sample of differences
            diff_sample = pd.DataFrame({
                'df0': left.loc[mask].head(3),
                'df1': right.loc[mask].head(3)
            })
            print(diff_sample)

# Check for NaN differences
print("\nNaN value comparison:")
nan_diff_cols = []
for col in common_cols:
    df0_nulls = df0[col].isna().sum()
    df1_nulls = df1[col].isna().sum()
    if df0_nulls != df1_nulls:
        nan_diff_cols.append((col, df0_nulls, df1_nulls))

if nan_diff_cols:
    print("Columns with different NaN counts:")
    for col, df0_nulls, df1_nulls in nan_diff_cols[:5]:  # Limit to first 5
        print(f"'{col}': df0={df0_nulls} nulls, df1={df1_nulls} nulls")
else:
    print("All columns have the same number of NaN values")

# Check for duplicate rows
print("\nDuplicate row comparison:")
print(f"df0 duplicated rows: {df0.duplicated().sum()}")
print(f"df1 duplicated rows: {df1.duplicated().sum()}")

# Summary statistics comparison for numeric columns
# (numeric-ness is decided from df0's dtype only).
print("\nSummary statistics comparison (first numeric column):")
numeric_cols = [col for col in common_cols if pd.api.types.is_numeric_dtype(df0[col])]
if numeric_cols:
    col = numeric_cols[0]
    print(f"Column: {col}")
    print("df0 stats:")
    print(df0[col].describe())
    print("df1 stats:")
    print(df1[col].describe())


In [None]:
# Load the global_records_1.json file.
# NOTE: this rebinds `global_records`, replacing whatever an earlier cell
# stored under that name.
import json
import os

# "~" is expanded to the current user's home directory, matching the
# "~/Downloads/..." paths used by the other cells, instead of hard-coding
# one particular user's absolute path.
with open(os.path.expanduser('~/Downloads/global_records_1.json'), 'r') as f:
    global_records = json.load(f)

# Display the loaded data
global_records

In [None]:
# Gather every record that carries a 'sleep_diagnosis1' key; the key exists
# in some record exactly when this list is non-empty.
records_with_sleep_diagnosis1 = [entry for entry in global_records if 'sleep_diagnosis1' in entry]
sleep_diagnosis1_exists = len(records_with_sleep_diagnosis1) > 0
print(f"Does 'sleep_diagnosis1' key exist in any record? {sleep_diagnosis1_exists}")

# Report how many such records exist and print up to three examples.
if not records_with_sleep_diagnosis1:
    print("No records contain the 'sleep_diagnosis1' key")
else:
    total_found = len(records_with_sleep_diagnosis1)
    print(f"\nFound {total_found} records with 'sleep_diagnosis1' key:")
    for number, entry in enumerate(records_with_sleep_diagnosis1[:3], start=1):
        print(f"Record {number}: {entry}")
    remaining = total_found - 3
    if remaining > 0:
        print(f"... and {remaining} more records")

# Display global_records itself as the cell output.
global_records

In [None]:
# Find healthCodes where there are no labels at all
# First identify all label columns (those ending with '_value')
label_columns = [col for col in df1.columns if col.endswith('_value')]

# A healthCode counts as labelled as soon as ANY of its rows has at least one
# non-missing label column. Testing rows one by one (the previous approach)
# also reported healthCodes that merely have some unlabelled rows.
row_has_label = df1[label_columns].notna().any(axis=1)
labelled_healthcodes = df1.loc[row_has_label, 'healthCode'].unique()

# Rows belonging to healthCodes for which no row carries a label.
mask = ~df1['healthCode'].isin(labelled_healthcodes)
healthcodes_without_labels = df1.loc[mask, 'healthCode'].unique()

print(f"Found {len(healthcodes_without_labels)} healthCodes with no labels:")
print(healthcodes_without_labels[:10])  # Show first 10 as example
if len(healthcodes_without_labels) > 10:
    print(f"... and {len(healthcodes_without_labels) - 10} more")


In [None]:
from torch_dataset import FilteredMhcDataset

# Build a FilteredMhcDataset from df1 with 'happiness' as the label of interest.
# NOTE(review): root_dir="blabla" looks like a placeholder value -- confirm
# which directory FilteredMhcDataset expects before relying on this dataset.
happiness_dataset = FilteredMhcDataset(
    dataframe=df1, 
    root_dir="blabla", 
    label_of_interest='happiness'
)