In [None]:
#### Preamble ####
# Purpose: Generate a simulated IPUMS USA dataset with similar
#          characteristics to the real-world IPUMS USA data. This dataset
#          will be used for testing, analysis, and development of data analysis
#          methods in academic and research projects.
# Author: Jiazhou(Justin) Bi and Weiyang Li
# Date: 17 November 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
# Pre-requisites:
#  - Required Python libraries: pandas, numpy
#  - Ensure the '../data/00-simulated_data' folder (relative to the working
#    directory) is available for saving the data
#  - Familiarity with IPUMS USA data structures and variables for context
# Additional Information:
#  - The dataset will include simulated fields such as state, ownership type,
#    mortgage status, group quarters, gender, age, marital status, education,
#    school type, occupation, veteran status, industry, and total income.
#  - The data is purely simulated based on assumed distributions and ranges,
#    mimicking the structure of real IPUMS USA data.
#  - The dataset can be used to test analysis pipelines without access
#    to sensitive real-world data.

In [None]:
#### Workspace setup ####

from pathlib import Path

import numpy as np
import pandas as pd

# Seed NumPy's legacy global generator so every run yields the same dataset.
# The columns below are drawn in a fixed order from that single stream, so
# reordering or inserting draws changes every value that follows.
np.random.seed(42)

# Number of records to simulate
n_records = 1000

# Destination of the simulated CSV, relative to the working directory
OUTPUT_PATH = Path('../data/00-simulated_data/simulated_ipums_data.csv')

#### Simulate data ####

# Simulate the dataset. Every column is drawn independently of the others:
# np.random.choice is uniform over the listed codes unless `p` supplies
# explicit weights (each `p` list sums to 1).
data = pd.DataFrame({
    'STATEICP': np.random.choice(range(1, 57), size=n_records),  # Codes 1-56; assuming 56 state codes including territories
    'OWNERSHP': np.random.choice([1, 2], size=n_records, p=[0.65, 0.35]),
    'MORTGAGE': np.random.choice([1, 2, 3, 4], size=n_records, p=[0.2, 0.3, 0.4, 0.1]),
    'GQ': np.random.choice([1, 2, 3, 5], size=n_records, p=[0.7, 0.1, 0.15, 0.05]),
    'SEX': np.random.choice([1, 2, 9], size=n_records, p=[0.48, 0.48, 0.04]),
    'AGE': np.random.randint(0, 100, size=n_records),  # Upper bound is exclusive: ages 0-99
    'MARST': np.random.choice([1, 2, 3, 4, 5, 6], size=n_records, p=[0.25, 0.25, 0.2, 0.15, 0.1, 0.05]),
    'EDUC': np.random.choice(range(1, 12), size=n_records),  # Assuming education levels are 1-11
    'SCHLTYPE': np.random.choice([0, 1, 2], size=n_records, p=[0.2, 0.7, 0.1]),
    'OCC2010': np.random.choice(range(10, 1000), size=n_records),  # Occupational codes in range 10-999
    'VETSTAT': np.random.choice([0, 1, 2, 9], size=n_records, p=[0.7, 0.2, 0.05, 0.05]),
    'IND1990': np.random.choice(range(10, 1000), size=n_records),  # Industry codes in range 10-999
    'INCTOT': np.random.normal(50000, 20000, size=n_records)  # Simulated income: normal, mean 50000, sd 20000
})

# Ensuring no negative incomes: negative normal draws are raised to 0
data['INCTOT'] = data['INCTOT'].clip(lower=0)

# Display the first few rows of the simulated dataset
print(data.head())

# Save to a CSV file. The output folder is created first (no-op if it already
# exists) so the write does not fail on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(OUTPUT_PATH, index=False)


   STATEICP  OWNERSHP  MORTGAGE  GQ  SEX  AGE  MARST  EDUC  SCHLTYPE  OCC2010  \
0        39         1         4   3    1   69      5    10         1      275   
1        52         1         1   1    2   66      1     9         1      703   
2        29         1         1   1    2   57      2     5         0      114   
3        15         2         3   1    2   98      2     2         1      946   
4        43         1         3   2    2   81      3     7         1      442   

   VETSTAT  IND1990        INCTOT  
0        2      129  23598.456245  
1        2      965  45224.947959  
2        1      262  75711.644687  
3        0       69  48858.774171  
4        1      275  52831.915735  
