In [None]:
#### Preamble ####
# Purpose: Tests the simulated IPUMS USA dataset for structure, valid ranges,
#          logical consistency, and completeness.
# Author: Jiazhou (Justin) Bi and Weiyang Li
# Date: 17 November 2024
# Contact: justin.bi@mail.utoronto.ca or weiyang.li@mail.utoronto.ca
# License: MIT
#
# Pre-requisites:
#  - Installed Python libraries: pandas, pytest, numpy
#  - Simulated dataset saved as 'simulated_ipums_data.csv' in the 
#    'data/00-simulated_data/' directory.
#  - Familiarity with IPUMS USA data structure and types.
#
# Additional Information:
#  - Tests include checking for valid ranges, logical relationships, and
#    the presence of missing or invalid values in key fields.
#  - Ensure that the simulated data conforms to expected structures for 
#    downstream analysis.

In [9]:
#### Workspace setup ####

import pandas as pd
import numpy as np

#### Load data ####

# Load the simulated dataset into the module-level `data` frame, which every
# test function below reads directly. The path is relative ('../data/...'),
# so it is resolved against the current working directory at run time.
file_path = '../data/00-simulated_data/simulated_ipums_data.csv'
data = pd.read_csv(file_path)

#### Test data ####

# Test 1: Column presence
def test_column_presence():
    """Check if all required columns are present."""
    required_columns = ['STATEICP', 'OWNERSHP', 'MORTGAGE', 'GQ', 'SEX', 'AGE', 
                        'MARST', 'EDUC', 'SCHLTYPE', 'OCC2010', 'VETSTAT', 
                        'IND1990', 'INCTOT']
    assert all(column in data.columns for column in required_columns), "Missing columns in the dataset."

# Test 2: Valid ranges
def test_valid_ranges():
    """Check if numerical columns fall within valid ranges."""
    assert data['STATEICP'].between(1, 56).all(), "Invalid state codes."
    assert data['OWNERSHP'].isin([1, 2]).all(), "Invalid ownership values."
    assert data['MORTGAGE'].isin([0, 1, 2, 3, 4]).all(), "Invalid mortgage values."
    assert data['GQ'].isin([1, 2, 3, 5]).all(), "Invalid group quarters values."
    assert data['SEX'].isin([1, 2, 9]).all(), "Invalid sex values."
    assert data['AGE'].between(0, 100).all(), "Invalid age values."
    assert data['MARST'].isin([1, 2, 3, 4, 5, 6]).all(), "Invalid marital status values."
    assert data['SCHLTYPE'].isin([0, 1, 2]).all(), "Invalid school type values."
    assert data['INCTOT'].ge(0).all(), "Income cannot be negative."

# Test 3: Logical consistency
def test_logical_consistency():
    """Check for logical consistency between columns."""
    # If OWNERSHP is 2 (rented), then MORTGAGE should be 0 (not applicable)
    rented_units = data[data['OWNERSHP'] == 2]
    inconsistent_rows = rented_units[rented_units['MORTGAGE'] != 0]
    assert len(inconsistent_rows) == 0, f"Inconsistent mortgage data found for rented units: {inconsistent_rows}"

# Test 4: Missing values
def test_missing_values():
    """Check for missing values in key columns."""
    key_columns = ['STATEICP', 'SEX', 'AGE', 'INCTOT']
    assert data[key_columns].notnull().all().all(), "Missing values in key columns."

# Run all tests
if __name__ == "__main__":
    # Run every check on its own so that one failure does not hide the
    # results of the checks that come after it. Failures are printed rather
    # than raised, so the cell/script still runs to completion.
    _all_passed = True
    for _check in (test_column_presence, test_valid_ranges,
                   test_logical_consistency, test_missing_values):
        try:
            _check()
        except AssertionError as e:
            _all_passed = False
            print(f"Test failed: {e}")
        except KeyError as e:
            # The later checks index columns of `data` directly; when such a
            # column is absent the lookup raises KeyError. Report it like any
            # other failure instead of aborting the remaining checks.
            _all_passed = False
            print(f"Test failed: {_check.__name__} could not run: {e}")
    if _all_passed:
        print("All tests passed!")

All tests passed!
