In [None]:

# Pipeline overview -- this script builds the analysis dataset in five steps:
#   1. Merge all core 2002 CDS files into a single cross-sectional dataset.
#   2. Merge the 2001 PSID Family File to enrich the 2002 baseline.
#   3. Merge the longitudinal Transition to Adulthood (TAS) waves.
#   4. Process all Time Diary data to create both aggregate and contextual variables.
#   5. Perform the final merge to combine all data sources.


import pandas as pd
import os

# Define all base paths
# Root folder that every other path in this script is derived from.
BASE_DATA_PATH = r'C:\Users\joshu\Aussie\Monash\Parental\Data'

# The CDS and TAS inputs share the same 'Supplemental Studies' parent folder.
_SUPPLEMENTAL_STUDIES_PATH = os.path.join(BASE_DATA_PATH, 'Supplemental Studies')
CDS_2002_PATH = os.path.join(
    _SUPPLEMENTAL_STUDIES_PATH, 'Child Development Survey', 'CDS2002', '2002'
)
TAS_PATH = os.path.join(
    _SUPPLEMENTAL_STUDIES_PATH, 'Transition into Adulthood Supplement'
)

# Folder containing the per-year family file sub-folders.
FAMILY_FILES_PATH = os.path.join(BASE_DATA_PATH, 'Main Study', 'Family Files')

# Output folder for the intermediate and final CSV files.
ANALYSIS_PATH = os.path.join(BASE_DATA_PATH, 'Processed Data')

# Load Data
def load_data(file_path, required=True):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Args:
        file_path: Location of the CSV file to read.
        required: When True (the default) a missing file is an error; when
            False a missing file only produces a warning.

    Returns:
        The loaded DataFrame, or None if an optional file does not exist.

    Raises:
        FileNotFoundError: if the file is missing and ``required`` is True.
    """
    try:
        frame = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        if not required:
            print(f"  - Warning: Optional file not found, skipping: {os.path.basename(file_path)}")
            return None
        # Report the full path, then let the original exception propagate.
        print(f"  - FATAL ERROR: Required file not found at {file_path}")
        raise
    else:
        print(f"  - Successfully loaded: {os.path.basename(file_path)} (Shape: {frame.shape})")
        return frame

# Merge Core CDS-II Data
def merge_core_cds_data():
    """
    Build one cross-sectional dataset from the raw 2002 CDS files.

    DEMOG is the base table. GEN_MAP, the child-level files (PCG_CHLD, CHILD,
    ASSESSMT, OCG_CHLD), IDMAP02 and finally PCG_HHLD are attached to it with
    successive left merges. CHILD, ASSESSMT and OCG_CHLD are optional and are
    skipped when the file is absent.

    Returns:
        The merged DataFrame with repeated column labels removed.

    Raises:
        FileNotFoundError: if one of the required files is missing.
    """
    print("\n--- Step 1: Merging Core CDS-II Data (2002 Wave) ---")

    child_keys = ['ID_2001', 'SN_2001']

    def read_cds(file_name, required=True):
        # Every raw input of this step lives in the same CDS 2002 folder.
        return load_data(os.path.join(CDS_2002_PATH, file_name), required=required)

    # Files are read in a fixed order so the progress log stays the same.
    demographics = read_cds('DEMOG.csv')
    generation_map = read_cds('GEN_MAP.csv')
    pcg_child = read_cds('PCG_CHLD.csv')
    child = read_cds('CHILD.csv', required=False)
    assessment = read_cds('ASSESSMT.csv', required=False)
    ocg_child = read_cds('OCG_CHLD.csv', required=False)
    id_map = read_cds('IDMAP02.csv')
    pcg_household = read_cds('PCG_HHLD.csv')

    # Give both starting tables the shared key names, then join them.
    demographics = demographics.rename(
        columns={'DEMID01': 'ID_2001', 'DEMSN01': 'SN_2001'}
    )
    generation_map = generation_map.rename(
        columns={'GENID01': 'ID_2001', 'GENSN01': 'SN_2001',
                 'CH_ID68': 'ER30001', 'CH_PN': 'ER30002'}
    )
    merged = demographics.merge(generation_map, on=child_keys, how='left')

    # (suffix label, frame, id column, sequence-number column) per child-level file.
    child_level_sources = [
        ('pcg_chld', pcg_child, 'PCHID01', 'PCHSN01'),
        ('child', child, 'CHLDID01', 'CHLDSN01'),
        ('assessmt', assessment, 'ASMTID01', 'ASMTSN01'),
        ('ocg_chld', ocg_child, 'OCGCID01', 'OCGCSN01'),
    ]
    for label, frame, id_col, sn_col in child_level_sources:
        if frame is None:
            continue
        frame = frame.rename(columns={id_col: 'ID_2001', sn_col: 'SN_2001'})
        # Clashing non-key columns from the incoming file get a file-specific suffix.
        merged = merged.merge(frame, on=child_keys, how='left', suffixes=('', f'_{label}'))

    # IDMAP supplies the PCGID02/PCGSN02 keys used to attach the household file.
    id_map = id_map.rename(columns={'CHLDID02': 'ID_2001', 'CHLDSN02': 'SN_2001'})
    pcg_household = pcg_household.rename(columns={'PHHID01': 'PCGID02', 'PHHSN01': 'PCGSN02'})
    merged = merged.merge(id_map, on=child_keys, how='left')
    merged = merged.merge(
        pcg_household, on=['PCGID02', 'PCGSN02'], how='left', suffixes=('', '_pcghhld')
    )

    # Keep only the first occurrence of any repeated column label.
    first_occurrence = ~merged.columns.duplicated()
    merged = merged.loc[:, first_occurrence]
    print("Core CDS merge complete.")
    return merged

# Merge 2001 PSID Family File
def merge_family_file(base_df):
    """
    Attach the 2001 PSID Family File to the base dataset.

    The family file's 'ER17002' column is renamed to 'ID_2001' and used as the
    key of a left merge. Family-file columns whose names clash with existing
    ones receive the '_fam2001' suffix.

    Args:
        base_df: DataFrame containing an 'ID_2001' column.

    Returns:
        A new DataFrame with the family-file columns added and repeated column
        labels removed.

    Raises:
        FileNotFoundError: if FAM2001ER.csv is missing.
    """
    print("\n--- Step 2: Merging 2001 PSID Family File ---")

    family_csv = os.path.join(FAMILY_FILES_PATH, 'fam2001er', 'FAM2001ER.csv')
    # Align the family file's key name with the base dataset before joining.
    family = load_data(family_csv).rename(columns={'ER17002': 'ID_2001'})

    combined = base_df.merge(family, on='ID_2001', how='left', suffixes=('', '_fam2001'))

    # Keep only the first occurrence of any repeated column label.
    first_occurrence = ~combined.columns.duplicated()
    combined = combined.loc[:, first_occurrence]
    print("Family file merge complete.")
    return combined

# Merge Longitudinal TAS Data 
def merge_longitudinal_tas_data(base_df):
    """
    Attach the Transition to Adulthood (TAS) waves to the base CDS dataset.

    Each wave file is optional. A wave that loads has its two identifier
    columns renamed to 'ER30001'/'ER30002' and is left-merged on that pair;
    clashing column names from the wave receive a '_tas<year>' suffix.

    Args:
        base_df: DataFrame containing 'ER30001' and 'ER30002' columns.

    Returns:
        A new DataFrame (the input is copied, not modified) with the available
        TAS waves added and repeated column labels removed.
    """
    print("\n--- Step 3: Merging Longitudinal TAS Data ---")

    person_keys = ['ER30001', 'ER30002']

    # (year label, CSV path, id column, person-number column) for each wave.
    waves = [
        ('05', os.path.join(TAS_PATH, 'ta2005', 'TA2005.csv'), 'TA050004', 'TA050005'),
        ('15', os.path.join(TAS_PATH, 'ta2015', 'TA2015.csv'), 'TA150004', 'TA150005'),
    ]

    panel = base_df.copy()
    for year, csv_path, id_col, pn_col in waves:
        wave = load_data(csv_path, required=False)
        if wave is None:
            # A missing wave is tolerated; the panel simply lacks its columns.
            continue
        wave = wave.rename(columns={id_col: 'ER30001', pn_col: 'ER30002'})
        panel = panel.merge(wave, on=person_keys, how='left', suffixes=('', f'_tas{year}'))

    # Keep only the first occurrence of any repeated column label.
    first_occurrence = ~panel.columns.duplicated()
    panel = panel.loc[:, first_occurrence]
    print("TAS merge complete.")
    return panel

# Process Time Diary Data 
def process_time_diaries():
    """
    Build a standalone DataFrame of time-use variables from the 2002 diaries.

    Two pieces are computed and joined on ('ID_2001', 'SN_2001'):
      * weekly hours per activity category, from TD02_ACT_AGG.csv;
      * weekly parent-interactive skill hours, from TD_ACTIVITY.csv.
    The aggregate file defines which children appear in the result. Any value
    still missing after the join is replaced by 0.

    Returns:
        The combined time-use DataFrame.

    Raises:
        FileNotFoundError: if either diary file is missing.
    """
    print("\n--- Step 4: Processing Time Diary Data ---")

    key_cols = ['ID_2001', 'SN_2001']

    aggregate_raw = load_data(os.path.join(CDS_2002_PATH, 'TD02_ACT_AGG.csv'))
    activity_raw = load_data(os.path.join(CDS_2002_PATH, 'TD_ACTIVITY.csv'))

    # Take the child identifiers from the aggregate file under the shared key names.
    id_frame = aggregate_raw[['AGGRID01', 'AGGRSN01']].copy()
    id_frame = id_frame.rename(columns={'AGGRID01': 'ID_2001', 'AGGRSN01': 'SN_2001'})

    # Part A: weekly hours per activity category.
    weekly_hours = calculate_aggregate_weekly_hours(id_frame.copy(), aggregate_raw)

    # Part B: weekly hours of parent-interactive skill activities.
    parenting_hours = calculate_intensive_parenting_time(activity_raw)

    combined = weekly_hours.merge(parenting_hours, on=key_cols, how='left')
    # Rows without a match in Part B come back as NaN; they are recorded as 0.
    combined.fillna(0, inplace=True)
    print("Time Diary processing complete.")
    return combined

def calculate_aggregate_weekly_hours(base_df, td_agg_df):
    """Calculate weekly hours for the activity categories '3901'..'3939'.

    For each category code the weekday column ``WD02<code>`` and the weekend
    column ``WE02<code>`` are combined as ``(weekday * 5 + weekend * 2) / 3600``
    and stored as ``weekly_avg_hrs_cat_<code>``. Missing values in either
    input column count as 0.

    Args:
        base_df: DataFrame with 'ID_2001' and 'SN_2001' columns; its rows
            define the rows of the result.
        td_agg_df: Aggregate time-diary data keyed by 'AGGRID01'/'AGGRSN01'
            (or already by 'ID_2001'/'SN_2001'). It is not modified.

    Returns:
        A DataFrame with 'ID_2001', 'SN_2001' and one weekly-hours column for
        every category whose weekday and weekend columns are both present.
        Categories lacking either input column are left out rather than
        raising KeyError.
    """
    key_cols = ['ID_2001', 'SN_2001']

    # Rename on a copy so the caller's frame keeps its original column names.
    agg_df = td_agg_df.rename(columns={'AGGRID01': 'ID_2001', 'AGGRSN01': 'SN_2001'})
    panel = pd.merge(base_df, agg_df, on=key_cols, how='left')

    weekly_hours = {}
    for code in (f'39{i:02d}' for i in range(1, 40)):
        wd_col, we_col = f'WD02{code}', f'WE02{code}'
        if wd_col not in panel.columns or we_col not in panel.columns:
            # No data for this category in the input; do not invent a column.
            continue
        wd_sec = panel[wd_col].fillna(0)
        we_sec = panel[we_col].fillna(0)
        weekly_hours[f'weekly_avg_hrs_cat_{code}'] = ((wd_sec * 5) + (we_sec * 2)) / 3600

    # Assemble all new columns at once instead of inserting them one by one.
    hours_df = pd.DataFrame(weekly_hours, index=panel.index)
    return pd.concat([panel[key_cols], hours_df], axis=1)

def calculate_intensive_parenting_time(td_activity_df):
    """Compute weekly hours of parent-interactive skill activities per child.

    Only rows whose 'COLA_02' code is in the skill-code list are used. Rows
    with 'DIARY_02' == 0 are treated as the weekday diary and rows with
    'DIARY_02' == 1 as the weekend diary. Within each diary, 'DUR_02' is
    summed separately over rows flagged 1 in 'COLGB_02' (mother) and rows
    flagged 1 in 'COLGC_02' (father). The weekly figure is
    ``((mother_wd + father_wd) * 5 + (mother_we + father_we) * 2) / 3600``.

    Args:
        td_activity_df: Activity-level diary data with the columns 'TDID01',
            'TDSN01', 'COLA_02', 'DIARY_02', 'COLGB_02', 'COLGC_02', 'DUR_02'.

    Returns:
        A DataFrame with one row per distinct ('TDID01', 'TDSN01') pair in the
        input, exposed as 'ID_2001'/'SN_2001', plus
        'parent_interactive_skill_hrs_wk'. Children without qualifying rows
        get 0.
    """
    skill_codes = [5490, 5491, 5492, 5493, 5494, 8010, 8011, 8012, 5040, 8020, 8030, 8040, 8090, 8510, 8520, 8211, 8212, 8213, 8214, 8215, 8221, 8222, 8223]
    key_renames = {'TDID01': 'ID_2001', 'TDSN01': 'SN_2001'}
    parent_flags = {'mother': 'COLGB_02', 'father': 'COLGC_02'}

    is_skill = td_activity_df['COLA_02'].isin(skill_codes)
    skill_rows = td_activity_df[is_skill].copy()
    rows_by_day = {
        'wd': skill_rows[skill_rows['DIARY_02'] == 0],
        'we': skill_rows[skill_rows['DIARY_02'] == 1],
    }

    # Every child present in the diary file gets a row, with or without skill time.
    result = td_activity_df[['TDID01', 'TDSN01']].drop_duplicates().rename(columns=key_renames)

    for day_label, day_rows in rows_by_day.items():
        for parent_label, flag_col in parent_flags.items():
            with_parent = day_rows[day_rows[flag_col] == 1]
            totals = with_parent.groupby(['TDID01', 'TDSN01'])['DUR_02'].sum().reset_index()
            totals = totals.rename(
                columns={'DUR_02': f'{parent_label}_interactive_{day_label}_sec', **key_renames}
            )
            result = result.merge(totals, on=['ID_2001', 'SN_2001'], how='left')

    # No qualifying activity means zero seconds, not a missing value.
    for seconds_col in (f'{p}_interactive_{d}_sec' for p in ['mother', 'father'] for d in ['wd', 'we']):
        if seconds_col in result.columns:
            result[seconds_col] = result[seconds_col].fillna(0)
        else:
            result[seconds_col] = 0

    weekday_seconds = result['mother_interactive_wd_sec'] + result['father_interactive_wd_sec']
    weekend_seconds = result['mother_interactive_we_sec'] + result['father_interactive_we_sec']
    result['parent_interactive_skill_hrs_wk'] = ((weekday_seconds * 5) + (weekend_seconds * 2)) / 3600

    return result[['ID_2001', 'SN_2001', 'parent_interactive_skill_hrs_wk']]

# Main Execution Block
if __name__ == '__main__':
    def _save_csv(frame, destination, announcement):
        """Write *frame* to *destination* without its index, then print *announcement*."""
        frame.to_csv(destination, index=False)
        print(announcement)

    # The output folder must exist before any step writes to it.
    if not os.path.exists(ANALYSIS_PATH):
        os.makedirs(ANALYSIS_PATH)

    # Step 1: core CDS merge.
    core_cds_df = merge_core_cds_data()
    path_step1 = os.path.join(ANALYSIS_PATH, '01_cds_merged.csv')
    _save_csv(core_cds_df, path_step1, f"Step 1 intermediate file saved to: {path_step1}")

    # Step 2: add the 2001 family file.
    cds_family_df = merge_family_file(core_cds_df)
    path_step2 = os.path.join(ANALYSIS_PATH, '02_cds_with_family_data.csv')
    _save_csv(cds_family_df, path_step2, f"Step 2 intermediate file saved to: {path_step2}")

    # Step 3: add the TAS waves.
    cds_tas_panel = merge_longitudinal_tas_data(cds_family_df)
    path_step3 = os.path.join(ANALYSIS_PATH, '03_cds_tas_panel.csv')
    _save_csv(cds_tas_panel, path_step3, f"Step 3 intermediate file saved to: {path_step3}")

    # Step 4: time-use variables, built independently of steps 1-3.
    time_use_variables = process_time_diaries()
    path_step4 = os.path.join(ANALYSIS_PATH, '04_time_use_variables.csv')
    _save_csv(time_use_variables, path_step4, f"Step 4 intermediate file saved to: {path_step4}")

    # Step 5: join the panel and the time-use variables on the child keys.
    print("\n--- Step 5: Final Merge ---")
    final_dataset = cds_tas_panel.merge(time_use_variables, on=['ID_2001', 'SN_2001'], how='left')
    unique_labels = ~final_dataset.columns.duplicated()
    final_dataset = final_dataset.loc[:, unique_labels]
    print("All data sources successfully merged.")

    # Save the full result.
    final_path = os.path.join(ANALYSIS_PATH, 'final_analysis_dataset.csv')
    _save_csv(final_dataset, final_path, f"Final dataset saved to: {final_path}")

    # A fixed-seed 1000-row sample is written only when enough rows exist.
    if final_dataset.shape[0] >= 1000:
        sample_df = final_dataset.sample(n=1000, random_state=42)
        sample_path = os.path.join(ANALYSIS_PATH, 'sample_final_analysis_dataset.csv')
        _save_csv(sample_df, sample_path, f"Sample dataset saved to: {sample_path}")


--- Step 1: Merging Core CDS-II Data (2002 Wave) ---
  - Successfully loaded: DEMOG.csv (Shape: (2907, 22))
  - Successfully loaded: GEN_MAP.csv (Shape: (2907, 9))
  - Successfully loaded: PCG_CHLD.csv (Shape: (2907, 921))
  - Successfully loaded: CHILD.csv (Shape: (2182, 508))
  - Successfully loaded: ASSESSMT.csv (Shape: (2644, 256))
  - Successfully loaded: OCG_CHLD.csv (Shape: (1686, 79))
  - Successfully loaded: IDMAP02.csv (Shape: (2891, 8))
  - Successfully loaded: PCG_HHLD.csv (Shape: (2009, 218))
Core CDS merge complete.
Step 1 intermediate file saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Processed Data\01_cds_merged.csv

--- Step 2: Merging 2001 PSID Family File ---
  - Successfully loaded: FAM2001ER.csv (Shape: (7406, 3559))
Family file merge complete.
Step 2 intermediate file saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Processed Data\02_cds_with_family_data.csv

--- Step 3: Merging Longitudinal TAS Data ---
  - Successfully loaded: TA2005.csv (Shape: (745