In [1]:
# merge_cds_data.py
#
# Purpose:
# This script performs a sequential merge of multiple data files from the 2002 wave
# of the Panel Study of Income Dynamics - Child Development Supplement (PSID-CDS).
# The primary objective is to create a single, clean, child-level dataset that serves
# as a foundational file for analyzing parenting dynamics. The final output is
# structurally clean, with original variable names and values preserved, and includes
# all necessary identifiers for future merges with other PSID waves or supplemental files.
#
# (Future prompts to modify this code will also be documented in this comprehensive manner)
#
# Author: Gemini
# Date: September 23, 2025

import pandas as pd
import numpy as np
import os

# Set a random seed to ensure that the random sample generated is reproducible.
# Anyone running this script will get the exact same sample file.
np.random.seed(42)

def load_and_prepare_data():
    """
    Loads all necessary CSV files from the '2002' subfolder.

    This function is designed to be the single point of data ingestion. It uses a
    dictionary, `file_info`, to manage filenames and to standardize the names of
    the key identifier columns across different files. This standardization is
    critical for ensuring the subsequent merge operations are straightforward and accurate.

    Returns:
        dict: A dictionary where keys are short names for the dataframes (e.g., 'demog')
              and values are the loaded pandas DataFrames. Returns None if files are missing.
    """
    data_files = {}
    folder_path = '2002'
    
    # This dictionary is the control center for the loading process.
    # It maps a nickname for each dataset to:
    #   1. The actual filename (e.g., 'DEMOG.csv').
    #   2. A dictionary for renaming the key columns to a consistent standard.
    #      For example, 'DEMID01' and 'CHLDID02' both refer to the 2001 wave Family ID,
    #      so we rename them both to 'ID_2001' to use as a consistent merge key.
    file_info = {
        'demog': ('DEMOG.csv', {'DEMID01': 'ID_2001', 'DEMSN01': 'SN_2001'}),
        'idmap': ('IDMAP02.csv', {'CHLDID02': 'ID_2001', 'CHLDSN02': 'SN_2001'}),
        'gen_map': ('GEN_MAP.csv', {'CH_ID68': 'ER30001', 'CH_PN': 'ER30002', 'GENID01': 'ID_2001', 'GENSN01': 'SN_2001'}),
        'pcg_chld': ('PCG_CHLD.csv', {'PCHID01': 'ID_2001', 'PCHSN01': 'SN_2001'}),
        'child': ('CHILD.csv', {'CHLDID01': 'ID_2001', 'CHLDSN01': 'SN_2001'}),
        'assessmt': ('ASSESSMT.csv', {'ASMTID01': 'ID_2001', 'ASMTSN01': 'SN_2001'}),
        'ocg_chld': ('OCG_CHLD.csv', {'OCGCID01': 'ID_2001', 'OCGCSN01': 'SN_2001'}),
        'pcg_hhld': ('PCG_HHLD.csv', {'PHHID01': 'PCGID02', 'PHHSN01': 'PCGSN02'}),
    }

    try:
        for key, (filename, rename_map) in file_info.items():
            full_path = os.path.join(folder_path, filename)
            data_files[key] = pd.read_csv(full_path).rename(columns=rename_map)
        
        print("All data files loaded and prepared successfully.")
        return data_files
    except FileNotFoundError as e:
        print(f"Error: Missing file - {e}. Please ensure all CSV files are in the '{folder_path}' directory.")
        return None

def merge_data_clean(data_files):
    """
    Performs a sequential merge of the prepared dataframes to create a single,
    child-level dataset.

    The merge strategy is crucial. We begin with a base file containing all children
    and then layer on additional data. The 'how="left"' parameter is used in all
    merges to ensure that we keep every child from the original demographic file,
    even if they don't have corresponding data in other files (e.g., younger children
    without a child interview). This preserves the complete sample.

    Args:
        data_files (dict): A dictionary of the loaded and prepared DataFrames.

    Returns:
        pandas.DataFrame: A single, merged DataFrame.
    """
    if data_files is None:
        return None

    # Step 1: Establish the Base DataFrame
    # We start with the demographic file ('demog'). This file serves as the master list
    # of all 2,907 children who participated in the 2002 CDS wave.
    merged_df = data_files['demog']
    print(f"Step 1: Base data established from DEMOG.csv. Shape: {merged_df.shape}")

    # Step 2: Merge the Generational Map (GEN_MAP)
    # This step is performed early to bring the permanent 1968 identifiers ('ER30001', 'ER30002')
    # into the main dataframe. These are the most important long-term identifiers.
    # The merge is performed on the 2001 wave IDs, which are common to both files.
    merged_df = pd.merge(merged_df, data_files['gen_map'], on=['ID_2001', 'SN_2001'], how='left', suffixes=('', '_genmap'))
    print(f"Step 2: Generational Map data merged. Shape: {merged_df.shape}")

    # Step 3: Merge Child-Level Data Files
    # These files contain information specific to each child (e.g., their own interview,
    # assessments). They can be directly merged onto the base frame using the
    # child's 2001 wave ID.
    child_level_dfs = {'pcg_chld': data_files['pcg_chld'], 'child': data_files['child'],
                       'assessmt': data_files['assessmt'], 'ocg_chld': data_files['ocg_chld']}
    for name, df in child_level_dfs.items():
        # The `suffixes` argument handles any columns with the same name in different files
        # by appending a suffix (e.g., '_child'), preventing data loss.
        merged_df = pd.merge(merged_df, df, on=['ID_2001', 'SN_2001'], how='left', suffixes=('', f'_{name}'))
    print(f"Step 3: All child-level data files merged. Shape: {merged_df.shape}")

    # Step 4: Merge the ID Map (IDMAP02)
    # This map is a crucial bridge. It links each CHILD's 2001 ID to their
    # PRIMARY CAREGIVER's (PCG) 2001 ID ('PCGID02', 'PCGSN02'). This allows us
    # to subsequently link household-level data.
    merged_df = pd.merge(merged_df, data_files['idmap'], on=['ID_2001', 'SN_2001'], how='left', suffixes=('', '_idmap'))
    print(f"Step 4: ID Map data merged, adding PCG identifiers. Shape: {merged_df.shape}")

    # Step 5: Merge the PCG Household Data
    # Now that we have the PCG's 2001 ID for each child, we can merge the household data,
    # which was collected from the PCG. This correctly assigns the same household
    # information to all children living in that household.
    merged_df = pd.merge(merged_df, data_files['pcg_hhld'], on=['PCGID02', 'PCGSN02'], how='left', suffixes=('', '_pcghhld'))
    print(f"Step 5: PCG Household data merged. Shape: {merged_df.shape}")
    
    # Step 6: Final Structural Cleaning
    # The merge process, especially with `suffixes`, can create duplicate columns if they
    # existed in multiple files with the same name and values. This line removes them.
    merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]
    print(f"Step 6: Merged dataset structure cleaned. Final Shape: {merged_df.shape}")

    return merged_df


if __name__ == '__main__':
    # This is the main execution block that runs when the script is executed.
    
    # First, load and prepare all the raw data files.
    all_data = load_and_prepare_data()
    
    # Only proceed if the data loading was successful.
    if all_data:
        # Second, perform the clean merge of the prepared data.
        final_df = merge_data_clean(all_data)

        if final_df is not None:
            # --- Create Output Files ---
            print("\n--- Generating Clean Output Files ---")
            
            # Save the full, clean merged dataset. This is your primary analysis file.
            full_output_filename = 'full_merged_cds_data_clean.csv'
            final_df.to_csv(full_output_filename, index=False)
            print(f"Full clean merged dataset with {final_df.shape[0]} rows and {final_df.shape[1]} columns saved to '{full_output_filename}'")

            # Create and save a random sample of 1000 for exploratory work.
            if len(final_df) >= 1000:
                sample_df = final_df.sample(n=1000)
                sample_output_filename = 'sample_merged_cds_data_clean.csv'
                sample_df.to_csv(sample_output_filename, index=False)
                print(f"Random sample of 1000 observations saved to '{sample_output_filename}'")



All data files loaded and prepared successfully.
Step 1: Base data established from DEMOG.csv. Shape: (2907, 22)
Step 2: Generational Map data merged. Shape: (2907, 29)
Step 3: All child-level data files merged. Shape: (2907, 1785)
Step 4: ID Map data merged, adding PCG identifiers. Shape: (2907, 1791)
Step 5: PCG Household data merged. Shape: (3411, 2007)
Step 6: Merged dataset structure cleaned. Final Shape: (3411, 2007)

--- Generating Clean Output Files ---
Full clean merged dataset with 3411 rows and 2007 columns saved to 'full_merged_cds_data_clean.csv'
Random sample of 1000 observations saved to 'sample_merged_cds_data_clean.csv'


In [7]:
# process_all_time_diaries.py
#
# Purpose:
# This script serves as a standalone module for processing all 2002 Time Diary data.
# It is designed to be independent of other merged files. It reads the raw
# time diary files and produces a clean dataset containing only child identifiers
# and a comprehensive set of constructed time-use variables.
#
# This output file can then be easily merged with the main longitudinal panel.
#
# The script integrates two analytical approaches:
#   1. Aggregate Analysis: Calculates "composite week" average hours for 39 broad
#      activity categories.
#   2. Contextual Analysis: Creates nuanced measures of "intensive parenting,"
#      inspired by Doepke and Zilibotti (2017), by quantifying "interactive skill time."
#
# (Future prompts to modify this code will also be documented in this comprehensive manner)
#
# Author: Gemini
# Date: September 23, 2025

import pandas as pd
import os

def create_comprehensive_time_use_variables():
    """
    Main function to load the time diary files and orchestrate the creation of
    both aggregate and contextual time use variables.
    """
    # --- Configuration: Define file paths ---
    base_path = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Supplemental Studies\Child Development Survey\CDS2002\2002'
    agg_td_path = os.path.join(base_path, 'TD02_ACT_AGG.csv')
    activity_td_path = os.path.join(base_path, 'TD_ACTIVITY.csv')

    try:
        print("--- Loading Time Diary Files ---")
        td_agg_df = pd.read_csv(agg_td_path, low_memory=False)
        td_activity_df = pd.read_csv(activity_td_path, low_memory=False)
        print(f"Loaded aggregate TD data (Shape: {td_agg_df.shape})")
        print(f"Loaded raw activity TD data (Shape: {td_activity_df.shape})")
    except FileNotFoundError as e:
        print(f"Error: A required input file was not found. {e}")
        return None

    # Create a base DataFrame with just the unique child identifiers from the aggregate file.
    # These identifiers will be used to link all newly created variables.
    child_identifiers = td_agg_df[['AGGRID01', 'AGGRSN01']].copy()
    child_identifiers.rename(columns={'AGGRID01': 'ID_2001', 'AGGRSN01': 'SN_2001'}, inplace=True)

    # --- Part A: Calculate the 39 broad weekly averages ---
    time_use_df_part_a = calculate_aggregate_weekly_hours(child_identifiers, td_agg_df)

    # --- Part B: Calculate the intensive parenting measures ---
    time_use_df_part_b = calculate_intensive_parenting_time(td_activity_df)
    
    # --- Part C: Combine all time use variables into one file ---
    # Merge the intensive parenting measures onto the aggregate measures.
    final_time_use_df = pd.merge(time_use_df_part_a, time_use_df_part_b, on=['ID_2001', 'SN_2001'], how='left')
    
    # Fill any NaNs that result from the merge (e.g., a child has aggregate data but no
    # interactive skill time) with 0.
    final_time_use_df.fillna(0, inplace=True)

    return final_time_use_df

def calculate_aggregate_weekly_hours(base_df, td_agg_df):
    """
    Calculates weekly average hours for 39 broad activity categories.
    """
    print("\n--- Part A: Calculating Broad Weekly Time Use Averages ---")
    
    td_agg_df.rename(columns={'AGGRID01': 'ID_2001', 'AGGRSN01': 'SN_2001'}, inplace=True)
    
    # Merge the aggregate data onto the base identifier frame.
    panel_with_agg = pd.merge(base_df, td_agg_df, on=['ID_2001', 'SN_2001'], how='left')

    activity_codes = [f'39{i:02d}' for i in range(1, 40)]
    for code in activity_codes:
        wd_col = f'WD02{code}'
        we_col = f'WE02{code}'
        new_col_name = f'weekly_avg_hrs_cat_{code}'

        if wd_col in panel_with_agg.columns and we_col in panel_with_agg.columns:
            wd_seconds = panel_with_agg[wd_col].fillna(0)
            we_seconds = panel_with_agg[we_col].fillna(0)
            total_weekly_seconds = (wd_seconds * 5) + (we_seconds * 2)
            panel_with_agg[new_col_name] = total_weekly_seconds / 3600
    
    # Keep only the identifiers and the newly created weekly average columns.
    new_cols = ['ID_2001', 'SN_2001'] + [f'weekly_avg_hrs_cat_{code}' for code in activity_codes]
    result_df = panel_with_agg[new_cols]
            
    print("Aggregate weekly average calculations complete.")
    return result_df

def calculate_intensive_parenting_time(td_activity_df):
    """
    Processes the raw activity file to calculate measures of "intensive parenting."
    """
    print("\n--- Part B: Calculating Intensive Parenting Measures ---")

    skill_building_codes = [
        5490, 5491, 5492, 5493, 5494, 8010, 8011, 8012, 5040, 8020, 8030,
        8040, 8090, 8510, 8520, 8211, 8212, 8213, 8214, 8215, 8221, 8222, 8223
    ]
    
    skill_activities_df = td_activity_df[td_activity_df['COLA_02'].isin(skill_building_codes)].copy()
    wd_skill_df = skill_activities_df[skill_activities_df['DIARY_02'] == 0]
    we_skill_df = skill_activities_df[skill_activities_df['DIARY_02'] == 1]
    
    # Base dataframe for this part is all unique children in the activity file
    child_ids_in_activity = td_activity_df[['TDID01', 'TDSN01']].drop_duplicates()
    child_ids_in_activity.rename(columns={'TDID01': 'ID_2001', 'TDSN01': 'SN_2001'}, inplace=True)


    for day_type, df in [('wd', wd_skill_df), ('we', we_skill_df)]:
        # Mother
        mother_mask = df['COLGB_02'] == 1
        mother_time = df[mother_mask].groupby(['TDID01', 'TDSN01'])['DUR_02'].sum().reset_index()
        mother_time.rename(columns={'DUR_02': f'mother_interactive_{day_type}_sec', 'TDID01': 'ID_2001', 'TDSN01': 'SN_2001'}, inplace=True)
        child_ids_in_activity = pd.merge(child_ids_in_activity, mother_time, on=['ID_2001', 'SN_2001'], how='left')
        
        # Father
        father_mask = df['COLGC_02'] == 1
        father_time = df[father_mask].groupby(['TDID01', 'TDSN01'])['DUR_02'].sum().reset_index()
        father_time.rename(columns={'DUR_02': f'father_interactive_{day_type}_sec', 'TDID01': 'ID_2001', 'TDSN01': 'SN_2001'}, inplace=True)
        child_ids_in_activity = pd.merge(child_ids_in_activity, father_time, on=['ID_2001', 'SN_2001'], how='left')

    cols_to_fill = ['mother_interactive_wd_sec', 'mother_interactive_we_sec', 'father_interactive_wd_sec', 'father_interactive_we_sec']
    for col in cols_to_fill:
        if col not in child_ids_in_activity.columns:
            child_ids_in_activity[col] = 0
        else:
            # Revised method to avoid the FutureWarning
            child_ids_in_activity[col] = child_ids_in_activity[col].fillna(0)
            
    child_ids_in_activity['parent_interactive_skill_hrs_wk'] = \
        (((child_ids_in_activity['mother_interactive_wd_sec'] + child_ids_in_activity['father_interactive_wd_sec']) * 5) +
         ((child_ids_in_activity['mother_interactive_we_sec'] + child_ids_in_activity['father_interactive_we_sec']) * 2)) / 3600

    print("Intensive parenting variable calculation complete.")
    # Return only the identifiers and the final calculated variable
    return child_ids_in_activity[['ID_2001', 'SN_2001', 'parent_interactive_skill_hrs_wk']]


if __name__ == '__main__':
    final_time_use_dataset = create_comprehensive_time_use_variables()

    if final_time_use_dataset is not None:
        output_dir = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files'
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
            
        print(f"\n--- Standalone Time Use Dataset Complete ---")
        
        output_filename = os.path.join(output_dir, 'cds_2002_time_use_variables.csv')
        final_time_use_dataset.to_csv(output_filename, index=False)
        print(f"Final dataset saved to: {output_filename}")

        # Save a new random sample
        if len(final_time_use_dataset) >= 1000:
            sample_df = final_time_use_dataset.sample(n=1000)
            sample_output_filename = os.path.join(output_dir, 'sample_cds_2002_time_use_variables.csv')
            sample_df.to_csv(sample_output_filename, index=False)
            print(f"Random sample of 1000 observations saved to: {sample_output_filename}")



--- Loading Time Diary Files ---
Loaded aggregate TD data (Shape: (2569, 1311))
Loaded raw activity TD data (Shape: (99467, 34))

--- Part A: Calculating Broad Weekly Time Use Averages ---
Aggregate weekly average calculations complete.

--- Part B: Calculating Intensive Parenting Measures ---
Intensive parenting variable calculation complete.

--- Standalone Time Use Dataset Complete ---
Final dataset saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files\cds_2002_time_use_variables.csv
Random sample of 1000 observations saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files\sample_cds_2002_time_use_variables.csv
