# In [1]:
# Re-importing necessary packages since execution context reset
import pandas as pd
import numpy as np
import random
from faker import Faker
import os


# Reproducibility: seed every random source used by this script.
# Faker draws from its own generator, which np.random.seed() and
# random.seed() do not touch, so it is seeded explicitly; without this the
# Faker-built columns (dates, postal codes) differ on every run.
# The date columns are sampled relative to "today", so they still depend on
# the day the script is run.
# NOTE(review): Faker() uses its default locale here; if Canadian-format
# postal codes are expected, a locale such as "en_CA" is presumably needed -
# confirm before changing.
fake = Faker()
Faker.seed(42)
np.random.seed(42)
random.seed(42)

# Constants
# N is the number of rows generated for each dataset; the lists are the
# category vocabularies the random columns are sampled from.
N = 500

provinces = [
    "ON", "QC", "BC", "AB", "MB",
    "SK", "NS", "NB", "NL", "PE",
]
indigenous_groups = [
    "First Nations",
    "Métis",
    "Inuit",
    "Non-status",
]
service_types = [
    "Health",
    "Education",
    "Infrastructure",
    "Child & Family Support",
]
program_types = [
    "CERB",
    "CRB",
    "CCB",
    "GIS",
    "GST Credit",
]
income_brackets = [
    "Low",
    "Medium",
    "High",
]
housing_conditions = [
    "Adequate",
    "Needs Major Repairs",
]
employment_statuses = [
    "Employed",
    "Unemployed",
    "Retired",
    "Student",
]
education_levels = [
    "None",
    "High School",
    "College",
    "University",
]
urban_rural = [
    "Urban",
    "Suburban",
    "Rural",
    "Remote",
]
# Order matters: random_age_group() pairs its weights with these positions.
age_groups = [
    "0-17",
    "18-34",
    "35-54",
    "55+",
]
genders = [
    "Male",
    "Female",
    "Other",
]

# Output directory setup
# The workbooks are written to a "df" sub-folder of the chosen root; an empty
# root makes that path relative to the current working directory.
output_root_directory = ""
output_folder = os.path.join(output_root_directory, "df")

# Create the folder up front; exist_ok=True keeps re-runs from failing when
# it is already there.
os.makedirs(name=output_folder, exist_ok=True)


# Helper functions
def random_age_group():
    """Draw one age-group label from ``age_groups`` using fixed weights.

    The weights are matched to ``age_groups`` by position
    (0.2, 0.35, 0.3, 0.15).
    """
    weights = [0.2, 0.35, 0.3, 0.15]
    return np.random.choice(age_groups, p=weights)

def generate_person_id(index):
    """Return the person ID for a zero-based row index, e.g. 0 -> "P0001"."""
    return "P{:04d}".format(index + 1)
def generate_household_id(index):
    """Return a household ID that cycles through H001..H200 as index grows."""
    slot = index % 200 + 1
    return "H{:03d}".format(slot)
def generate_community_id(index):
    """Return a community ID that cycles through C001..C100 as index grows."""
    slot = index % 100 + 1
    return "C{:03d}".format(slot)

# Dataset 1: Indigenous Services
# One row per generated person. The columns are listed in the order they are
# drawn, because every random column advances the shared random state.
indigenous_df = pd.DataFrame(dict(
    PersonID=list(map(generate_person_id, range(N))),
    CommunityID=list(map(generate_community_id, range(N))),
    IndigenousGroup=np.random.choice(indigenous_groups, size=N),
    ServiceType=np.random.choice(service_types, size=N),
    # Dates fall within the three years leading up to the run date.
    ServiceReceivedDate=[
        fake.date_between(start_date="-3y", end_date="today")
        for _row in range(N)
    ],
    # Whole numbers from 1000 up to, but not including, 20000.
    AnnualServiceValue=np.random.randint(1000, 20000, size=N),
    AgeGroup=[random_age_group() for _row in range(N)],
    Gender=np.random.choice(genders, size=N),
    PostalCode=[fake.postalcode() for _row in range(N)],
    # True is drawn with probability 0.6, False with 0.4.
    HasReceivedFinancialAid=np.random.choice(
        [True, False], size=N, p=[0.6, 0.4]
    ),
))

# Dataset 2: CRA Income/Taxfiler
# One tax-filer row per person in indigenous_df, linked through PersonID.
# Total income per row: whole numbers from 10000 up to, but not including,
# 120000.
incomes = np.random.randint(10000, 120000, N)

cra_df = pd.DataFrame({
    "TaxFilerID": [f"T{n:04d}" for n in range(1, N + 1)],
    "PersonID": indigenous_df["PersonID"],
    "HouseholdID": [generate_household_id(i) for i in range(N)],
    "FilingYear": np.random.choice([2020, 2021, 2022, 2023], N),
    "TotalIncome": incomes,
    # Each derived amount is TotalIncome times a per-row random share,
    # truncated to a whole number. The shares are drawn in one vectorized
    # call per column instead of one Python-level call per row.
    # NOTE(review): the shares are drawn independently, so EmploymentIncome
    # plus GovernmentTransfers can exceed TotalIncome (up to 0.9 + 0.3 of
    # it) - confirm whether that is acceptable for this synthetic data.
    "EmploymentIncome": (incomes * np.random.uniform(0.5, 0.9, N)).astype("int64"),
    "GovernmentTransfers": (incomes * np.random.uniform(0.1, 0.3, N)).astype("int64"),
    "DeductionsClaimed": np.random.randint(1000, 10000, N),
    "NetTaxPayable": (incomes * np.random.uniform(0.05, 0.2, N)).astype("int64"),
    "Province": np.random.choice(provinces, N),
    "AgeGroup": indigenous_df["AgeGroup"],
    # Membership test against the known group labels.
    # NOTE(review): IndigenousGroup is itself sampled from indigenous_groups,
    # so this column is True for every row - confirm that is intended.
    "IndigenousStatus": indigenous_df["IndigenousGroup"].isin(indigenous_groups),
})

# Dataset 3: Financial Assistance Programs
# One aid record per person. PersonID, Province, AgeGroup and the
# indigenous-services flag are copied from the earlier datasets so the rows
# line up; the remaining columns are drawn at random in the order listed.
financial_df = pd.DataFrame(dict(
    AidID=["A{:04d}".format(row + 1) for row in range(N)],
    PersonID=indigenous_df["PersonID"],
    ProgramType=np.random.choice(program_types, size=N),
    # Dates fall within the three years leading up to the run date.
    DisbursementDate=[
        fake.date_between(start_date="-3y", end_date="today")
        for _row in range(N)
    ],
    # Whole numbers from 500 up to, but not including, 3000.
    Amount=np.random.randint(500, 3000, size=N),
    EligibilityStatus=np.random.choice(
        ["Eligible", "Ineligible", "Pending"], size=N, p=[0.75, 0.15, 0.1]
    ),
    PaymentMethod=np.random.choice(
        ["Direct Deposit", "Cheque"], size=N, p=[0.8, 0.2]
    ),
    Province=cra_df["Province"],
    IncomeBracket=np.random.choice(income_brackets, size=N, p=[0.4, 0.4, 0.2]),
    AgeGroup=indigenous_df["AgeGroup"],
    # Mirrors the HasReceivedFinancialAid flag of the services dataset.
    ReceivedThroughIndigenousServices=indigenous_df["HasReceivedFinancialAid"],
))

# Dataset 4: Demographic & Census
# One census row per person. Identifier and demographic columns are reused
# from the earlier datasets; the household and socio-economic attributes are
# drawn at random in the order listed.
census_df = pd.DataFrame(dict(
    CommunityID=indigenous_df["CommunityID"],
    HouseholdID=cra_df["HouseholdID"],
    PersonID=indigenous_df["PersonID"],
    Province=cra_df["Province"],
    UrbanRural=np.random.choice(urban_rural, size=N),
    # Whole numbers from 1 to 6 (the upper bound 7 is excluded).
    HouseholdSize=np.random.randint(1, 7, size=N),
    HousingCondition=np.random.choice(housing_conditions, size=N, p=[0.8, 0.2]),
    AccessToInternet=np.random.choice([True, False], size=N, p=[0.9, 0.1]),
    EducationLevel=np.random.choice(education_levels, size=N),
    EmploymentStatus=np.random.choice(employment_statuses, size=N),
    IndigenousStatus=cra_df["IndigenousStatus"],
    AgeGroup=indigenous_df["AgeGroup"],
    Gender=indigenous_df["Gender"],
))

# Save files
# Build one workbook path per dataset inside output_folder.
indigenous_path, cra_path, financial_path, census_path = (
    os.path.join(output_folder, workbook_name)
    for workbook_name in (
        "indigenous_services.xlsx",
        "income_taxfiler_data.xlsx",
        "financial_assistance_programs.xlsx",
        "demographic_census_data.xlsx",
    )
)

# Write each DataFrame to its workbook; index=False leaves the row index
# out of the sheet.
for dataset, target_path in (
    (indigenous_df, indigenous_path),
    (cra_df, cra_path),
    (financial_df, financial_path),
    (census_df, census_path),
):
    dataset.to_excel(target_path, index=False)

# Last expression of the cell: a notebook displays the four output paths.
indigenous_path, cra_path, financial_path, census_path


# Notebook output (paths of the four saved workbooks):
# ('df/indigenous_services.xlsx',
#  'df/income_taxfiler_data.xlsx',
#  'df/financial_assistance_programs.xlsx',
#  'df/demographic_census_data.xlsx')