In [None]:
# build_master_dataset_2007.py
#
# Purpose:
# This script serves as the single, definitive program for constructing the complete
# longitudinal analysis dataset from the raw 2007 PSID-CDS files. It is
# a direct adaptation of the 2002 master script, updated for the 2007 file names
# and variable structures. It saves intermediate files at each major step,
# making it ideal for use in a Jupyter Notebook.
#
# The workflow is as follows:
#   1. Merge all core 2007 CDS files into a single cross-sectional dataset.
#   2. Merge the 2007 PSID Family File to enrich the 2007 baseline.
#   3. Merge the longitudinal Transition into Adulthood Supplement (TAS) waves.
#   4. Process all 2007 Time Diary data to create both aggregate and contextual variables.
#   5. Perform the final merge to combine all data sources.
#
# Author: Gemini
# Date: October 1, 2025

import pandas as pd
import os

# --- Configuration: Define all base paths ---
BASE_DATA_PATH = r'C:\Users\joshu\Aussie\Monash\Parental\Data'
CDS_2007_PATH = os.path.join(BASE_DATA_PATH, 'Supplemental Studies', 'Child Development Survey', 'CDS2007', '2007')
TAS_PATH = os.path.join(BASE_DATA_PATH, 'Supplemental Studies', 'Transition into Adulthood Supplement')
FAMILY_FILES_PATH = os.path.join(BASE_DATA_PATH, 'Main Study', 'Family Files')
ANALYSIS_PATH = os.path.join(BASE_DATA_PATH, 'Processed Data 2007')

# --- Helper Function to Load Data ---
def load_data(file_path, required=True):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Args:
        file_path: Location of the CSV file to read.
        required: When true (the default), a missing file is reported as
            fatal and the ``FileNotFoundError`` is re-raised. When false, a
            warning is printed and ``None`` is returned instead.

    Returns:
        The loaded DataFrame, or ``None`` when an optional file is absent.

    Raises:
        FileNotFoundError: If the file does not exist and ``required`` is true.
    """
    try:
        frame = pd.read_csv(file_path, low_memory=False)
    except FileNotFoundError:
        # Optional inputs degrade to a warning; required ones stop the run.
        if not required:
            print(f"  - Warning: Optional file not found, skipping: {os.path.basename(file_path)}")
            return None
        print(f"  - FATAL ERROR: Required file not found at {file_path}")
        raise
    else:
        print(f"  - Successfully loaded: {os.path.basename(file_path)} (Shape: {frame.shape})")
        return frame

# --- STEP 1: Merge Core CDS-III Data (2007 Wave) ---
def merge_core_cds_data_2007():
    """Build the 2007 cross-sectional CDS dataset from the raw CSV files.

    Eight files are read from ``CDS_2007_PATH``. DEMOG07, GENMAP07,
    PCG_CHILD07, IDMAP07 and PCG_HH07 are required; CHILD07, ASSESS07 and
    OCG_CHILD07 are optional and skipped when absent. All joins are left
    joins starting from DEMOG07, so every DEMOG07 row is retained.

    Returns:
        A DataFrame keyed by ``ID_2007``/``SN_2007`` that also carries the
        ``ER30001``/``ER30002`` columns taken from GENMAP07.

    Raises:
        FileNotFoundError: If any required file is missing.
        KeyError: If GENMAP07 lacks one of the four selected columns, or a
            merge key is absent from one of the frames.
    """
    print("\n--- Step 1: Merging Core CDS-III Data (2007 Wave) ---")

    def _read(file_name, required=True):
        # Every input of this step lives directly under the 2007 CDS folder.
        return load_data(os.path.join(CDS_2007_PATH, file_name), required=required)

    # Files are read in a fixed order so the progress log stays stable.
    demographics = _read('DEMOG07.csv')
    generation_map = _read('GENMAP07.csv')
    pcg_child = _read('PCG_CHILD07.csv')
    child = _read('CHILD07.csv', required=False)
    assessment = _read('ASSESS07.csv', required=False)
    ocg_child = _read('OCG_CHILD07.csv', required=False)
    id_map = _read('IDMAP07.csv')
    pcg_household = _read('PCG_HH07.csv')

    child_keys = ['ID_2007', 'SN_2007']
    demographics = demographics.rename(columns={'DEMID07': 'ID_2007', 'DEMSN07': 'SN_2007'})

    # Selecting the four columns explicitly makes a missing permanent-ID
    # column fail here rather than silently dropping out of the merge.
    permanent_ids = generation_map[['GENID07', 'GENSN07', 'CH_ID68', 'CH_PN']].rename(columns={
        'GENID07': 'ID_2007', 'GENSN07': 'SN_2007',
        'CH_ID68': 'ER30001', 'CH_PN': 'ER30002'
    })

    combined = demographics.merge(permanent_ids, on=child_keys, how='left')

    # (label used as column suffix, frame, ID column, sequence-number column)
    child_level_sources = [
        ('pcg_child07', pcg_child, 'PCHID07', 'PCHSN07'),
        ('child07', child, 'CHLDID07', 'CHLDSN07'),
        ('assess07', assessment, 'ASMID07', 'ASMSN07'),
        ('ocg_child07', ocg_child, 'OCHID07', 'OCHSN07'),
    ]
    for label, frame, id_col, sn_col in child_level_sources:
        if frame is None:
            # Optional file was not found; nothing to merge for this source.
            continue
        frame = frame.rename(columns={id_col: 'ID_2007', sn_col: 'SN_2007'})
        combined = combined.merge(frame, on=child_keys, how='left', suffixes=('', f'_{label}'))

    # IDMAP07 supplies the PCGID07/PCGSN07 pair used to attach PCG_HH07.
    id_map = id_map.rename(columns={'CHILDID07': 'ID_2007', 'CHILDSN07': 'SN_2007'})
    pcg_household = pcg_household.rename(columns={'PHHID07': 'PCGID07', 'PHHSN07': 'PCGSN07'})
    combined = combined.merge(id_map, on=child_keys, how='left')
    combined = combined.merge(pcg_household, on=['PCGID07', 'PCGSN07'], how='left', suffixes=('', '_pcghh07'))

    # Keep only the first occurrence of any repeated column label.
    combined = combined.loc[:, ~combined.columns.duplicated()]
    print("Core 2007 CDS merge complete.")
    return combined

# --- STEP 2: Merge 2007 PSID Family File ---
def merge_family_file_2007(base_df):
    """Left-join the 2007 PSID Family File onto the core CDS dataset.

    The family file's ``ER36002`` column is renamed to ``ID_2007`` and used as
    the single merge key.

    Args:
        base_df: DataFrame containing an ``ID_2007`` column.

    Returns:
        A DataFrame with every row of ``base_df`` plus the family-file
        columns. Family columns whose names already exist in ``base_df`` get
        the ``_fam2007`` suffix.

    Raises:
        FileNotFoundError: If ``FAM2007ER.csv`` is missing.
    """
    print("\n--- Step 2: Merging 2007 PSID Family File ---")
    family_path = os.path.join(FAMILY_FILES_PATH, 'fam2007er', 'FAM2007ER.csv')
    family = load_data(family_path).rename(columns={'ER36002': 'ID_2007'})

    with_family = base_df.merge(family, on='ID_2007', how='left', suffixes=('', '_fam2007'))

    # Keep only the first occurrence of any repeated column label.
    first_occurrence = ~with_family.columns.duplicated()
    with_family = with_family.loc[:, first_occurrence]
    print("2007 Family file merge complete.")
    return with_family

# --- STEP 3: Merge Longitudinal TAS Data ---
def merge_longitudinal_tas_data(base_df):
    """Left-join the available TAS waves onto the 2007 CDS dataset.

    Each TAS file is optional: a missing file is skipped with a warning from
    ``load_data``. A file that is present has its two configured identifier
    columns renamed to ``ER30001``/``ER30002`` and is merged on that pair.

    Args:
        base_df: DataFrame holding the 2007 CDS data, including the
            ``ER30001`` and ``ER30002`` link columns. It is not modified.

    Returns:
        A new DataFrame with every row of ``base_df`` plus the columns of each
        TAS wave that was found. TAS columns whose names collide with existing
        ones receive a ``_tas<year>`` suffix.

    Raises:
        KeyError: If a loaded TAS file lacks one of its configured identifier
            columns, or ``base_df`` lacks ``ER30001``/``ER30002``.
    """
    print("\n--- Step 3: Merging Longitudinal TAS Data ---")

    link_keys = ['ER30001', 'ER30002']

    # wave label -> (file path, column renamed to ER30001, column renamed to ER30002)
    # NOTE(review): the identifier column names are used as configured here;
    # confirm them against the codebook of each TAS wave.
    tas_files = {
        '05': (os.path.join(TAS_PATH, 'ta2005', 'TA2005.csv'), 'TA050004', 'TA050005'),
        '15': (os.path.join(TAS_PATH, 'ta2015', 'TA2015.csv'), 'TA150004', 'TA150005')
    }

    longitudinal_df = base_df.copy()
    for year, (path, id_col, pn_col) in tas_files.items():
        tas_df = load_data(path, required=False)
        if tas_df is None:
            continue

        # DataFrame.rename ignores labels that are absent, so without these
        # checks a wrong column name only surfaces later as an opaque
        # "KeyError: 'ER30001'" raised from inside the merge.
        missing_in_tas = [col for col in (id_col, pn_col) if col not in tas_df.columns]
        if missing_in_tas:
            raise KeyError(
                f"TAS wave '{year}' file {os.path.basename(path)} is missing the expected "
                f"identifier column(s) {missing_in_tas}; cannot link it on {link_keys}. "
                f"First columns found: {list(tas_df.columns[:10])}"
            )
        missing_in_base = [key for key in link_keys if key not in longitudinal_df.columns]
        if missing_in_base:
            raise KeyError(
                f"Base dataset is missing link column(s) {missing_in_base}; "
                f"cannot merge TAS wave '{year}'."
            )

        tas_df = tas_df.rename(columns={id_col: 'ER30001', pn_col: 'ER30002'})
        longitudinal_df = pd.merge(
            longitudinal_df, tas_df, on=link_keys, how='left', suffixes=('', f'_tas{year}')
        )

    # Keep only the first occurrence of any repeated column label.
    longitudinal_df = longitudinal_df.loc[:, ~longitudinal_df.columns.duplicated()]
    print("TAS merge complete.")
    return longitudinal_df

# --- STEP 4: Process 2007 Time Diary Data ---
def process_time_diaries_2007():
    """Assemble the 2007 time-use variables into one standalone DataFrame.

    Reads ``TD07_ACT_AGG.csv`` and ``TD_ACTIVITY07.csv`` (both required),
    derives the aggregate weekly hours and the intensive-parenting measure,
    and joins the two on ``ID_2007``/``SN_2007``.

    Returns:
        A DataFrame with one row per row of the aggregate diary file; every
        missing value in the result is replaced by 0.

    Raises:
        FileNotFoundError: If either time diary file is missing.
    """
    print("\n--- Step 4: Processing 2007 Time Diary Data ---")

    aggregate_diary = load_data(os.path.join(CDS_2007_PATH, 'TD07_ACT_AGG.csv'))
    activity_diary = load_data(os.path.join(CDS_2007_PATH, 'TD_ACTIVITY07.csv'))

    # The identifier pairs of the aggregate file define the rows of the output.
    id_rename = {'AGGRID07': 'ID_2007', 'AGGRSN07': 'SN_2007'}
    child_keys = aggregate_diary[['AGGRID07', 'AGGRSN07']].rename(columns=id_rename)

    weekly_hours = calculate_aggregate_weekly_hours_2007(child_keys.copy(), aggregate_diary)
    parenting_hours = calculate_intensive_parenting_time_2007(activity_diary)

    combined = weekly_hours.merge(parenting_hours, on=['ID_2007', 'SN_2007'], how='left')
    # Rows without a match in the activity-level measure count as zero.
    combined = combined.fillna(0)
    print("2007 Time Diary processing complete.")
    return combined

def calculate_aggregate_weekly_hours_2007(base_df, td_agg_df):
    """Compute weekly hours per activity category from the 2007 aggregate diary.

    For each category code ``3901`` .. ``3939`` that has both a ``WD07<code>``
    and a ``WE07<code>`` column, the weekly total is
    ``(WD * 5 + WE * 2) / 3600`` with missing values treated as 0.
    Categories lacking either input column are skipped instead of raising.

    Args:
        base_df: DataFrame with ``ID_2007``/``SN_2007`` columns defining the
            rows of the result.
        td_agg_df: Aggregate diary data keyed by ``AGGRID07``/``AGGRSN07``.
            It is not modified.

    Returns:
        A DataFrame with ``ID_2007``, ``SN_2007`` and one
        ``weekly_avg_hrs_cat_<code>_07`` column per category that could be
        computed, in ascending code order.
    """
    id_cols = ['ID_2007', 'SN_2007']

    # Rename on a copy so the caller's frame keeps its original column names.
    agg_renamed = td_agg_df.rename(columns={'AGGRID07': 'ID_2007', 'AGGRSN07': 'SN_2007'})
    panel_with_agg = pd.merge(base_df, agg_renamed, on=id_cols, how='left')

    weekly_hours = {}
    for i in range(1, 40):
        code = f'39{i:02d}'
        wd_col, we_col = f'WD07{code}', f'WE07{code}'
        if wd_col not in panel_with_agg.columns or we_col not in panel_with_agg.columns:
            # Only categories actually present can appear in the output;
            # listing absent ones in the final selection would raise KeyError.
            continue
        wd_sec = panel_with_agg[wd_col].fillna(0)
        we_sec = panel_with_agg[we_col].fillna(0)
        # Division by 3600 treats the inputs as seconds
        # (presumably per diary day - TODO confirm against the codebook).
        weekly_hours[f'weekly_avg_hrs_cat_{code}_07'] = ((wd_sec * 5) + (we_sec * 2)) / 3600

    return panel_with_agg[id_cols].assign(**weekly_hours)

def calculate_intensive_parenting_time_2007(td_activity_df):
    """Derive the weekly 'intensive parenting' hours per child for 2007.

    Only diary rows whose ``COLA_07`` code is in the skill-activity list are
    used. ``DUR_07`` is summed separately for rows flagged ``COLGB_07 == 1``
    (labelled mother) and ``COLGC_07 == 1`` (labelled father), and for
    ``DIARY_07 == 0`` (labelled weekday) and ``DIARY_07 == 1`` (labelled
    weekend). The weekly figure is
    ``((mother_wd + father_wd) * 5 + (mother_we + father_we) * 2) / 3600``.

    Args:
        td_activity_df: Activity-level diary data with the columns
            ``TDID07``, ``TDSN07``, ``COLA_07``, ``DIARY_07``, ``COLGB_07``,
            ``COLGC_07`` and ``DUR_07``.

    Returns:
        A DataFrame with ``ID_2007``, ``SN_2007`` and
        ``parent_interactive_skill_hrs_wk_07``, one row per distinct
        ``TDID07``/``TDSN07`` pair; children with no qualifying rows get 0.
    """
    # NOTE(review): the meaning of these activity codes and of the
    # COLGB/COLGC flags comes from the codebook, not from this file - confirm.
    skill_codes = [5490, 5491, 5492, 5493, 5494, 8010, 8011, 8012, 5040, 8020, 8030, 8040, 8090, 8510, 8520, 8211, 8212, 8213, 8214, 8215, 8221, 8222, 8223]
    diary_keys = ['TDID07', 'TDSN07']
    key_rename = {'TDID07': 'ID_2007', 'TDSN07': 'SN_2007'}
    merge_keys = ['ID_2007', 'SN_2007']

    skill_rows = td_activity_df.loc[td_activity_df['COLA_07'].isin(skill_codes)].copy()
    rows_by_day = {
        'wd': skill_rows[skill_rows['DIARY_07'] == 0],
        'we': skill_rows[skill_rows['DIARY_07'] == 1],
    }
    participant_flags = {'mother': 'COLGB_07', 'father': 'COLGC_07'}

    # Start from every child present in the diary, not only those with
    # skill activities, so children without any still appear with zero hours.
    per_child = td_activity_df[diary_keys].drop_duplicates().rename(columns=key_rename)

    for day_type, day_rows in rows_by_day.items():
        for parent, flag_col in participant_flags.items():
            seconds_col = f'{parent}_interactive_{day_type}_sec_07'
            totals = (
                day_rows[day_rows[flag_col] == 1]
                .groupby(diary_keys)['DUR_07']
                .sum()
                .reset_index()
                .rename(columns={**key_rename, 'DUR_07': seconds_col})
            )
            per_child = per_child.merge(totals, on=merge_keys, how='left')
            # No matching rows means no recorded time for this combination.
            per_child[seconds_col] = per_child[seconds_col].fillna(0)

    weekday_sec = per_child['mother_interactive_wd_sec_07'] + per_child['father_interactive_wd_sec_07']
    weekend_sec = per_child['mother_interactive_we_sec_07'] + per_child['father_interactive_we_sec_07']
    per_child['parent_interactive_skill_hrs_wk_07'] = ((weekday_sec * 5) + (weekend_sec * 2)) / 3600

    return per_child[['ID_2007', 'SN_2007', 'parent_interactive_skill_hrs_wk_07']]

# --- Main Execution Block ---
if __name__ == '__main__':
    # Create the output folder on first run; an existing one is left as is.
    if not os.path.exists(ANALYSIS_PATH):
        os.makedirs(ANALYSIS_PATH)

    # Output locations of the four intermediate files and the final dataset.
    path_step1 = os.path.join(ANALYSIS_PATH, '01_cds_merged_2007.csv')
    path_step2 = os.path.join(ANALYSIS_PATH, '02_cds_with_family_data_2007.csv')
    path_step3 = os.path.join(ANALYSIS_PATH, '03_cds_tas_panel_2007.csv')
    path_step4 = os.path.join(ANALYSIS_PATH, '04_time_use_variables_2007.csv')
    final_path = os.path.join(ANALYSIS_PATH, 'final_analysis_dataset_2007.csv')

    # Step 1: core CDS cross-section.
    core_cds_df = merge_core_cds_data_2007()
    core_cds_df.to_csv(path_step1, index=False)
    print(f"Step 1 intermediate file saved to: {path_step1}")

    # Step 2: add the family file to the step 1 result.
    cds_family_df = merge_family_file_2007(core_cds_df)
    cds_family_df.to_csv(path_step2, index=False)
    print(f"Step 2 intermediate file saved to: {path_step2}")

    # Step 3: add the TAS waves to the step 2 result.
    cds_tas_panel = merge_longitudinal_tas_data(cds_family_df)
    cds_tas_panel.to_csv(path_step3, index=False)
    print(f"Step 3 intermediate file saved to: {path_step3}")

    # Step 4: time-use variables, built independently of steps 1-3.
    time_use_variables = process_time_diaries_2007()
    time_use_variables.to_csv(path_step4, index=False)
    print(f"Step 4 intermediate file saved to: {path_step4}")

    # Step 5: attach the time-use variables to the panel from step 3.
    print("\n--- Step 5: Final Merge ---")
    final_dataset = cds_tas_panel.merge(time_use_variables, on=['ID_2007', 'SN_2007'], how='left')
    unique_labels = ~final_dataset.columns.duplicated()
    final_dataset = final_dataset.loc[:, unique_labels]
    print("All 2007 data sources successfully merged.")

    # Save final outputs
    final_dataset.to_csv(final_path, index=False)
    print(f"Final dataset saved to: {final_path}")

    # A fixed-seed 1000-row sample is written only when enough rows exist.
    if final_dataset.shape[0] >= 1000:
        sample_df = final_dataset.sample(n=1000, random_state=42)
        sample_path = os.path.join(ANALYSIS_PATH, 'sample_final_analysis_dataset_2007.csv')
        sample_df.to_csv(sample_path, index=False)
        print(f"Sample dataset saved to: {sample_path}")




--- Step 1: Merging Core CDS-III Data (2007 Wave) ---
  - Successfully loaded: DEMOG07.csv (Shape: (1623, 18))
  - Successfully loaded: GENMAP07.csv (Shape: (1623, 9))
  - Successfully loaded: PCG_CHILD07.csv (Shape: (1608, 616))
  - Successfully loaded: CHILD07.csv (Shape: (1506, 557))
  - Successfully loaded: ASSESS07.csv (Shape: (1506, 255))
  - Successfully loaded: OCG_CHILD07.csv (Shape: (890, 60))
  - Successfully loaded: IDMAP07.csv (Shape: (1608, 8))
  - Successfully loaded: PCG_HH07.csv (Shape: (1250, 255))
Core 2007 CDS merge complete.
Step 1 intermediate file saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Processed Data 2007\01_cds_merged_2007.csv

--- Step 2: Merging 2007 PSID Family File ---
  - Successfully loaded: FAM2007ER.csv (Shape: (8289, 5240))
2007 Family file merge complete.
Step 2 intermediate file saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Processed Data 2007\02_cds_with_family_data_2007.csv

--- Step 3: Merging Longitudinal TAS Data ---
  - Suc

KeyError: 'ER30001'