In [None]:
# merge_family_data.py
#
# Purpose:
# This script performs the second major step in the data assembly process. It takes the
# clean, pre-compiled CDS dataset (created by 'create_cds_dataset.py') and enriches it
# by merging the 2001 PSID Main Family File. This adds a comprehensive set of
# household-level economic and social variables to each child's record. The final
# output is a complete dataset ready for analysis, along with a random sample.
#
# (Future prompts to modify this code will also be documented in this comprehensive manner)
#
# Author: Gemini
# Date: September 23, 2025

import pandas as pd
import numpy as np
import os

# Seed NumPy's global random number generator so that any sampling which relies
# on that global state (e.g. DataFrame.sample() called without an explicit
# random_state) yields the same rows on every run.
np.random.seed(42)

def merge_psid_family_data():
    """
    Load the clean CDS dataset and the 2001 PSID Family File and left-merge them.

    Every CDS record is kept exactly once. Family-level columns are attached by
    matching on the 2001 Family Interview ID ('ID_2001'); Family File columns
    whose names collide with existing CDS columns receive the '_fam2001' suffix.

    Family File rows with a missing or repeated 'ID_2001' are ignored (with a
    printed warning) before merging, because they would otherwise match several
    times and duplicate CDS records in a merge that is meant to be one-row-in,
    one-row-out.

    Returns:
        pandas.DataFrame: The enriched CDS dataset, or None if either input
        file cannot be found.

    Raises:
        KeyError: If 'ID_2001' is absent from either input after loading.
    """
    # --- Configuration: Set file paths ---
    cds_clean_path = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Supplemental Studies\Child Development Survey\CDS2002\full_merged_cds_data_clean.csv'
    family_file_path = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Main Study\Family Files\fam2001er\FAM2001ER.csv'
    merge_key = 'ID_2001'

    try:
        print("--- Loading Input Files ---")
        # Load the clean, merged CDS data created by the first script.
        # low_memory=False makes pandas infer each column's dtype from the whole
        # file, consistent with how the other wide PSID files are read.
        cds_df = pd.read_csv(cds_clean_path, low_memory=False)
        print(f"Loaded clean CDS data. Shape: {cds_df.shape}")

        # Load the 2001 PSID Family File
        # The codebook confirms 'ER17002' is the 2001 Family Interview ID. We rename it for the merge.
        family_df = pd.read_csv(family_file_path, low_memory=False).rename(columns={'ER17002': merge_key})
        print(f"Loaded 2001 Family File. Shape: {family_df.shape}")

    except FileNotFoundError as e:
        print(f"Error: Could not find a required file. {e}")
        print("Please ensure you have run 'create_cds_dataset.py' first and that both file paths are correct.")
        return None

    # Fail with an explicit message (same exception type pandas would raise)
    # when the merge key is missing from one of the inputs.
    for label, frame in (('clean CDS data', cds_df), ('2001 Family File', family_df)):
        if merge_key not in frame.columns:
            raise KeyError(f"Merge key '{merge_key}' not found in the {label}.")

    # --- Perform the Merge ---
    print("\n--- Merging Family Data ---")

    # pandas matches null keys to each other and fans out on repeated right-hand
    # keys; either would duplicate CDS rows. Keep one Family File row per valid ID.
    n_family_rows = len(family_df)
    family_df = (
        family_df
        .dropna(subset=[merge_key])
        .drop_duplicates(subset=[merge_key], keep='first')
    )
    n_dropped = n_family_rows - len(family_df)
    if n_dropped:
        print(f"Warning: ignored {n_dropped} Family File row(s) with a missing or duplicated '{merge_key}'.")

    # We merge the family data onto the CDS data using the 2001 Family ID ('ID_2001'),
    # which is common to both files. validate= asserts the many-children-to-one-family
    # relationship so the row count cannot silently grow.
    final_df = pd.merge(
        cds_df,
        family_df,
        on=merge_key,
        how='left',
        suffixes=('', '_fam2001'),
        validate='many_to_one',
    )

    # Final structural cleaning to remove any new duplicate columns
    final_df = final_df.loc[:, ~final_df.columns.duplicated()]
    print(f"Merge complete. Final dataset shape: {final_df.shape}")

    return final_df


if __name__ == '__main__':
    # Build the enriched dataset; None signals that an input file was missing.
    final_enriched_df = merge_psid_family_data()

    if final_enriched_df is not None:
        # Define the output directory. exist_ok=True creates it only when needed,
        # without a separate check-then-create step.
        output_dir = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files'
        os.makedirs(output_dir, exist_ok=True)

        print("\n--- Generating Final Output Files ---")

        # Save the full, enriched dataset
        full_output_filename = os.path.join(output_dir, 'full_merged_cds_with_family_data.csv')
        final_enriched_df.to_csv(full_output_filename, index=False)
        print(f"Full enriched dataset saved to: {full_output_filename}")

        # Save the random sample. The seed is passed explicitly (same value as the
        # module-level np.random.seed) so the sample does not depend on whatever
        # else has consumed the global random state.
        sample_size = 1000
        if len(final_enriched_df) >= sample_size:
            sample_df = final_enriched_df.sample(n=sample_size, random_state=42)
            sample_output_filename = os.path.join(output_dir, 'sample_merged_cds_with_family_data.csv')
            sample_df.to_csv(sample_output_filename, index=False)
            print(f"Random sample of {sample_size} observations saved to: {sample_output_filename}")
        else:
            # Make the skipped step visible instead of silently writing nothing.
            print(f"Dataset has only {len(final_enriched_df)} rows; skipping the {sample_size}-observation random sample.")


--- Loading Base Dataset ---
Loaded base CDS-Family data. Shape: (3411, 5565)

--- Merging Longitudinal TAS Data ---
-> Merging TAS 2005 data...
   Merge complete. New shape: (3418, 6524)
-> Merging TAS 2015 data...
   Merge complete. New shape: (3425, 7828)

--- Generating Final Longitudinal Output Files ---
Full longitudinal panel dataset saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files\cds_longitudinal_tas_panel.csv
Random sample of 1000 observations saved to: C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files\sample_longitudinal_tas_panel.csv


In [None]:
# merge_family_data.py
#
# Purpose:
# This script builds a longitudinal panel dataset by merging the Transition to Adulthood
# Supplement (TAS) data onto the clean, compiled CDS-Family dataset. It starts with the
# 2002 childhood baseline and adds data from subsequent TAS waves, creating a wide-format
# dataset that tracks individuals over time.
#
# (Future prompts to modify this code will also be documented in this comprehensive manner)
#
# Author: Gemini
# Date: September 23, 2025

import pandas as pd
import numpy as np
import os

# Seed NumPy's global random number generator so that any sampling which relies
# on that global state (e.g. DataFrame.sample() called without an explicit
# random_state) yields the same rows on every run.
np.random.seed(42)

def merge_longitudinal_tas_data():
    """
    Load the CDS-Family base dataset and sequentially left-merge TAS data waves.

    Each wave is matched on the pair ('ER30001', 'ER30002'); the wave's own ID
    columns are renamed to those names first. TAS columns whose names collide
    with columns already in the panel receive a '_tas<YY>' suffix.

    TAS rows with a missing or repeated key pair are ignored (with a printed
    warning) before merging, so that each base record stays a single row
    instead of being duplicated by multiple matches.

    Merging is best-effort per wave: a wave whose file is missing or whose merge
    fails is reported and skipped, and the remaining waves are still processed.
    The function is designed to be easily extendable to include more TAS years.

    Returns:
        pandas.DataFrame: A single, wide-format longitudinal DataFrame, or None
        if the base file cannot be found.
    """
    # --- Configuration: Define all file paths ---
    # The base file written by the family-merge step that runs after 'create_cds_dataset.py'.
    base_file_path = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files\full_merged_cds_with_family_data.csv'

    # Key columns in the base dataset that identify an individual across files.
    merge_keys = ['ER30001', 'ER30002']

    # This dictionary controls the merging of TAS files.
    # To add more years (e.g., 2007, 2009), simply add a new entry.
    # The keys are the year suffixes, and the values are tuples containing:
    #   1. The full path to the data file.
    #   2. The name of the 1968 Family ID column in that file.
    #   3. The name of the Person Number column in that file.
    tas_files_to_merge = {
        '05': (
            r'C:\Users\joshu\Aussie\Monash\Parental\Data\Supplemental Studies\Transition into Adulthood Supplement\ta2005\TA2005.csv',
            'TA050004',
            'TA050005'
        ),
        '15': (
            r'C:\Users\joshu\Aussie\Monash\Parental\Data\Supplemental Studies\Transition into Adulthood Supplement\ta2015\TA2015.csv',
            'TA150004',
            'TA150005'
        )
    }

    try:
        print("--- Loading Base Dataset ---")
        # Load the clean, merged CDS-Family data created previously
        longitudinal_df = pd.read_csv(base_file_path, low_memory=False)
        print(f"Loaded base CDS-Family data. Shape: {longitudinal_df.shape}")
    except FileNotFoundError:
        print(f"Error: Base file not found at '{base_file_path}'")
        print("Please ensure you have run 'create_cds_dataset.py' and the 'merge_family_data.py' (original version) first.")
        return None

    # --- Sequentially Merge Each TAS Wave ---
    print("\n--- Merging Longitudinal TAS Data ---")
    for year_suffix, (filepath, id_col, pn_col) in tas_files_to_merge.items():
        try:
            print(f"-> Merging TAS 20{year_suffix} data...")
            tas_df = pd.read_csv(filepath, low_memory=False)

            # Report a wrongly configured ID column by name rather than letting
            # the merge fail later on the renamed key.
            missing_cols = [col for col in (id_col, pn_col) if col not in tas_df.columns]
            if missing_cols:
                raise KeyError(f"expected ID column(s) {missing_cols} not found in TAS file")

            # Rename TAS key columns to match the base dataframe for a clean merge
            tas_df = tas_df.rename(columns={id_col: merge_keys[0], pn_col: merge_keys[1]})

            # pandas matches null keys to each other and fans out on repeated
            # right-hand keys; either would duplicate base records. Keep one TAS
            # row per valid key pair.
            n_tas_rows = len(tas_df)
            tas_df = (
                tas_df
                .dropna(subset=merge_keys)
                .drop_duplicates(subset=merge_keys, keep='first')
            )
            n_dropped = n_tas_rows - len(tas_df)
            if n_dropped:
                print(f"   Warning: ignored {n_dropped} TAS 20{year_suffix} row(s) with missing or duplicated ID keys.")

            # Perform a left merge to keep all original CDS participants.
            # validate= asserts that each base row matches at most one TAS row,
            # so the row count cannot silently grow.
            longitudinal_df = pd.merge(
                longitudinal_df,
                tas_df,
                on=merge_keys,
                how='left',
                suffixes=('', f'_tas{year_suffix}'),  # Add suffix to new columns
                validate='many_to_one'
            )
            print(f"   Merge complete. New shape: {longitudinal_df.shape}")

        except FileNotFoundError:
            print(f"   Warning: TAS file not found at '{filepath}'. Skipping this year.")
        except Exception as e:
            # Deliberately broad: one bad wave must not abort the remaining waves.
            print(f"   An error occurred while merging {filepath}: {e}")

    # Final structural cleaning
    longitudinal_df = longitudinal_df.loc[:, ~longitudinal_df.columns.duplicated()]

    return longitudinal_df


if __name__ == '__main__':
    # Build the longitudinal panel; None signals that the base file was missing.
    final_panel_df = merge_longitudinal_tas_data()

    if final_panel_df is not None:
        # Define the output directory. exist_ok=True creates it only when needed,
        # without a separate check-then-create step.
        output_dir = r'C:\Users\joshu\Aussie\Monash\Parental\Data\Analysis Files'
        os.makedirs(output_dir, exist_ok=True)

        print("\n--- Generating Final Longitudinal Output Files ---")

        # Save the full, enriched panel dataset
        full_output_filename = os.path.join(output_dir, 'cds_longitudinal_tas_panel.csv')
        final_panel_df.to_csv(full_output_filename, index=False)
        print(f"Full longitudinal panel dataset saved to: {full_output_filename}")

        # Save the random sample. The seed is passed explicitly (same value as the
        # module-level np.random.seed) so the sample does not depend on whatever
        # else has consumed the global random state.
        sample_size = 1000
        if len(final_panel_df) >= sample_size:
            sample_df = final_panel_df.sample(n=sample_size, random_state=42)
            sample_output_filename = os.path.join(output_dir, 'sample_longitudinal_tas_panel.csv')
            sample_df.to_csv(sample_output_filename, index=False)
            print(f"Random sample of {sample_size} observations saved to: {sample_output_filename}")
        else:
            # Make the skipped step visible instead of silently writing nothing.
            print(f"Dataset has only {len(final_panel_df)} rows; skipping the {sample_size}-observation random sample.")

