In [None]:
import pandas as pd
import numpy as np

# URL to the raw CSV file in the GitHub release
csv_url = 'https://github.com/Joeyvdelft/SemesterProject/releases/download/File/CPS.Demographic.Data.Semester.Project.csv'

# Cities kept in the analysis
cities = ['New York', 'Chicago', 'Minneapolis', 'Atlanta', 'Houston', 'Seattle', 'Phoenix', 'Miami']

# Define the variables
numeric_vars = ['PRTAGE', 'PESEX']  # Numeric variables to average
categorical_vars = ['PTDTRACE', 'PEEDUCA', 'PEMARITL', 'PEMLR', 'HEFAMINC']  # Categorical variables to count

# Every summary below is computed once per month and per city
group_keys = ['Date', 'City']


def prepare_city_data(raw: pd.DataFrame, cities: list) -> pd.DataFrame:
    """Return the rows of *raw* for *cities*, with a monthly ``Date`` column.

    ``Date`` is parsed from ``HRYEAR4`` and ``HRMONTH`` using the ``%Y%m``
    format, the frame is sorted by that column, and only rows whose ``City``
    is in *cities* are kept. *raw* itself is not modified.
    """
    month_stamp = raw['HRYEAR4'].astype(str) + raw['HRMONTH'].astype(str).str.zfill(2)
    dated = raw.assign(Date=pd.to_datetime(month_stamp, format='%Y%m'))
    dated = dated.sort_values(by='Date')
    # Explicit copy: the result gets new columns assigned later, and writing
    # into a boolean-filtered slice triggers SettingWithCopyWarning.
    return dated[dated['City'].isin(cities)].copy()


def add_group_means(data: pd.DataFrame, numeric_vars: list) -> pd.DataFrame:
    """Return a copy of *data* with a ``<var>_mean`` column per numeric variable.

    Each variable is first coerced to numeric (unparseable entries become
    NaN); its mean is then computed per (Date, City) group and broadcast back
    onto every row of that group.
    """
    out = data.copy()
    for var in numeric_vars:
        out[var] = pd.to_numeric(out[var], errors='coerce')
        out[f'{var}_mean'] = out.groupby(group_keys)[var].transform('mean')
    return out


def count_categories(data: pd.DataFrame, categorical_vars: list) -> pd.DataFrame:
    """Count the occurrences of every category per (Date, City) group.

    Returns an integer frame indexed by (Date, City) with one
    ``<var>_<category>`` column per observed category. Every group present in
    *data* gets a row, and a category that never occurs in a group counts as 0.
    """
    groups = pd.MultiIndex.from_frame(data[group_keys].drop_duplicates()).sort_values()
    pivots = []
    for var in categorical_vars:
        # Pivot table with counts of each category per city per month
        pivot = data.pivot_table(index=group_keys, columns=var, aggfunc='size', fill_value=0)
        pivot.columns = [f'{var}_{col}' for col in pivot.columns]
        # A group whose values for `var` are all missing is absent from the
        # pivot; aligning on the full group index makes that gap explicit.
        pivots.append(pivot.reindex(groups))
    summary = pd.concat(pivots, axis=1) if pivots else pd.DataFrame(index=groups)
    # The gaps introduced by the alignment are genuine zero counts.
    return summary.fillna(0).astype(int)


def combine_summaries(data: pd.DataFrame, numeric_vars: list,
                      categorical_summary: pd.DataFrame) -> pd.DataFrame:
    """Merge the per-group numeric means with the per-group category counts.

    *data* must already carry the ``<var>_mean`` columns and
    *categorical_summary* must be indexed by (Date, City). The result has one
    row per group, with ``Date`` and ``City`` as ordinary columns.
    """
    mean_cols = [f'{var}_mean' for var in numeric_vars]
    # The means are constant within a group, so de-duplicating leaves one row per group.
    means = data[group_keys + mean_cols].drop_duplicates()
    return means.merge(categorical_summary.reset_index(), on=group_keys, how='outer')


# The steps stay at top level (rather than inside a main() function) so that
# `data`, `categorical_summary` and `final_data` remain available to later
# notebook cells; the guard only keeps the download and the file write from
# running when this code is imported.
if __name__ == "__main__":
    # Read the CSV file into a DataFrame
    data = pd.read_csv(csv_url)

    # Build the monthly date, sort, and keep only the cities in the list
    data = prepare_city_data(data, cities)

    # Process numeric variables
    data = add_group_means(data, numeric_vars)

    # Process categorical variables
    categorical_summary = count_categories(data, categorical_vars)

    # Combine numeric averages with categorical counts
    final_data = combine_summaries(data, numeric_vars, categorical_summary)

    # Save the processed data to a new CSV file
    final_data.to_csv('Processed.Demographic.Data.csv', index=False)
    print("Data processing complete. The processed dataset is saved as 'Processed.Demographic.Data.csv'.")

    # A bare expression inside an `if` block is not auto-displayed by a
    # notebook, so the preview is printed explicitly.
    print(final_data.head())  # Display the first few rows of the processed data