In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, explained_variance_score

# Load the datasets
census_data = pd.read_csv('Census total.csv')
district_data = pd.read_csv('updated_districts_cleaned.csv')

# Process 'Census Tract' column and ensure 'year' is an integer
census_data['Census Tract'] = census_data['Census Tract'].str.extract('(\d+\.?\d*)')[0].astype(str)
census_data['year'] = pd.to_numeric(census_data['year'], errors='coerce').astype('Int64')
district_data['Census Tract'] = district_data['Census Tract'].astype(str)

# Merge the data
combined_data = pd.merge(census_data, district_data, on='Census Tract', how='inner')

# Convert 'Median income (dollars)' to numeric and handle missing values
combined_data['Median income (dollars)'] = pd.to_numeric(combined_data['Median income (dollars)'], errors='coerce').fillna(0)

# Define and process all race columns
race_columns = ['White alone', 'Black or African American alone', 'Asian alone', 
                'American Indian and Alaska Native alone', 'Native Hawaiian and Other Pacific Islander alone', 
                'Some other race alone', 'Two or more races']
for col in race_columns:
    combined_data[col] = pd.to_numeric(combined_data[col], errors='coerce').fillna(0)

# Define age midpoints and calculate average age
age_midpoints = {
    'Under 5 years': 2.5, '5 to 9 years': 7.5, '10 to 14 years': 12.5, '15 to 19 years': 17.5, 
    '20 to 24 years': 22.5, '25 to 29 years': 27.5, '30 to 34 years': 32.5, '35 to 39 years': 37.5, 
    '40 to 44 years': 42.5, '45 to 49 years': 47.5, '50 to 54 years': 52.5, '55 to 59 years': 57.5, 
    '60 to 64 years': 62.5, '65 to 69 years': 67.5, '70 to 74 years': 72.5, '75 to 79 years': 77.5, 
    '80 to 84 years': 82.5
}

for age_group in age_midpoints:
    if age_group in combined_data.columns:
        combined_data[age_group] = pd.to_numeric(combined_data[age_group], errors='coerce').fillna(0)

combined_data['Total years'] = sum(combined_data[col] * age_midpoints[col] for col in age_midpoints if col in combined_data.columns)
combined_data['Total Population'] = combined_data[[col for col in age_midpoints if col in combined_data.columns]].sum(axis=1)
combined_data['Average Age'] = combined_data['Total years'] / combined_data['Total Population']

# Select features and target variable
features = combined_data[['Median income (dollars)', 'Average Age'] + race_columns]  
target = combined_data['Average Age']  
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Using a simplified base estimator for AdaBoost
base_estimator = DecisionTreeRegressor(max_depth=3)

# Create an AdaBoost model with the simplified base estimator
ada_model = AdaBoostRegressor(base_estimator=base_estimator, n_estimators=30, random_state=42)

# Train the model
ada_model.fit(X_train, y_train)

# Predictions
y_pred = ada_model.predict(X_test)

# Metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
explained_variance = explained_variance_score(y_test, y_pred)

# Print metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared: {r2}")
print(f"Explained Variance Score: {explained_variance}")



Mean Squared Error (MSE): 0.151278038375316
Root Mean Squared Error (RMSE): 0.3889447754827361
Mean Absolute Error (MAE): 0.3217795709120849
R-squared: 0.9864408868411487
Explained Variance Score: 0.9864422439758719
