# Urban-Level Model Comparison

We perform two urban-level modeling experiments, comparing a baseline OLS model to a baseline XGBoost model with 100 estimators and a maximum depth of 10 (a random forest with 150 estimators and a maximum depth of 15 is also fit for reference). To maximize the policy relevance of this assessment, we examine the 20 most populous urban areas.

1. We fit both baseline models separately to each urban area's full data and score them on that same data (in-sample fit). This experiment is relevant to urban planners who are curious about whether machine learning models may be useful for assessing heat mitigation policy.

2. We fit both baseline models to the full data except for one held-out urban area. That is, we examine how well an XGBoost model generalizes compared with an OLS model: can a model accurately predict on data from a previously unseen city? This experiment is relevant from a scientific perspective.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr

from sklearnex import patch_sklearn
patch_sklearn()

import os
os.environ["SKLEARNEX_VERBOSE"] = "INFO"

from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Data

In [2]:
# Load the full dataset from the project's data folder.
data = pd.read_csv(
    'data/data.csv',
    low_memory=False,
)

In [3]:
# Sum of 'Total Population' over every row of the dataset.
data['Total Population'].sum()

256946061.0

In [4]:
# Encode the coastal flag as 0/1; a missing flag is treated as "not coastal".
data['Coastal?'] = data['Coastal?'].fillna(False).astype(int)

# Integer-encode the climate zone. The dtype guard keeps this cell safe to
# re-run: mapping the already-encoded integers a second time would match no
# key in the dict and silently turn the whole column into NaN.
# NOTE(review): these codes are arbitrary labels for a nominal category, yet
# the models receive them as an ordered numeric feature -- consider one-hot
# encoding if that ordering is not intended.
mapping_dict = {'Arid': 0, 'Snow': 1, 'Temperate': 2, 'Tropical': 3}
if not pd.api.types.is_numeric_dtype(data['Climate Zone']):
    data['Climate Zone'] = data['Climate Zone'].map(mapping_dict)

In [5]:
# Predictor columns passed to every model, in a fixed order.
features = [
    # land-cover fraction differences
    '$\\Delta$Built Fraction',
    '$\\Delta$Grass Fraction',
    '$\\Delta$Tree Fraction',
    # albedo differences
    '$\\Delta$Built Albedo',
    '$\\Delta$Grass Albedo',
    '$\\Delta$Tree Albedo',
    # elevation difference and the encoded categorical columns
    '$\\Delta$Elevation',
    'Coastal?',
    'Climate Zone',
]

# Target column the models are fit to.
label = '$\\Delta$AT Day'

In [6]:
# Total population per urban area, keeping the 20 most populous ones.
pop_df = (
    data[['Urban_name', 'Total Population']]
    .groupby(['Urban_name'])['Total Population']
    .sum()
    .to_frame()
    .sort_values(by='Total Population', ascending=False)
    .head(20)
    .reset_index()
)
# Same totals expressed in millions, rounded to one decimal for display.
pop_df['Total Population (in millions)'] = pop_df['Total Population'].div(1000000).round(1)

In [7]:
# Combined population of the urban areas kept in pop_df.
pop_df['Total Population'].sum()

111729797.0

111,729,797

In [8]:
# Share of the dataset's total population that lives in the urban areas kept
# in pop_df. Computed from the frames themselves rather than from numbers
# copied out of earlier cell outputs, so it cannot go stale when the data changes.
round(float(pop_df['Total Population'].sum() / data['Total Population'].sum()), 2)

0.43

In [9]:
# Display the population table built above.
pop_df

Unnamed: 0,Urban_name,Total Population,Total Population (in millions)
0,"New York--Newark, NY--NJ--CT",19063826.0,19.1
1,"Los Angeles--Long Beach--Anaheim, CA",12717924.0,12.7
2,"Chicago, IL--IN",8836803.0,8.8
3,"Dallas--Fort Worth--Arlington, TX",6204081.0,6.2
4,"Miami, FL",6051748.0,6.1
5,"Houston, TX",5859524.0,5.9
6,"Philadelphia, PA--NJ--DE--MD",5846088.0,5.8
7,"Atlanta, GA",5482880.0,5.5
8,"Washington, DC--VA--MD",5263796.0,5.3
9,"Boston, MA--NH--RI",4773811.0,4.8


In [10]:
# Names of the urban areas in pop_df, as a plain Python list.
cities20 = pop_df['Urban_name'].tolist()

## Planning Relevant Models

In [11]:
def get_scores(group):
    """Fit XGBoost, random forest and OLS on one group and report in-sample fit.

    Rows with a missing value in any of the module-level ``features`` columns
    or in ``label`` are dropped first. Each model is fit and then scored on
    those same rows, so every reported metric is an in-sample figure.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to model; must contain the ``features`` and ``label`` columns.

    Returns
    -------
    pandas.Series
        Entries, in order: 'XGBoost' and 'Random Forest' (value of each
        estimator's ``.score``), 'XGBoost (RMSE)' and 'RF (RMSE)' (root mean
        squared error of the predictions), and 'OLS' (statsmodels ``rsquared``).
    """
    series_dict = {}
    df = group[features+[label]].dropna().copy()
    X  = df[features]
    y  = df[label]

    # XGBoost Model
    xgb = XGBRegressor(n_estimators=100, max_depth=10, importance_type='gain',
                       random_state=1, tree_method='hist', grow_policy = 'lossguide')
    xgb.fit(X, y)
    series_dict['XGBoost'] = xgb.score(X,y)
    # RMSE is taken as sqrt(MSE): the `squared=False` flag of
    # mean_squared_error is deprecated in scikit-learn 1.4 and removed in 1.6.
    series_dict['XGBoost (RMSE)'] = np.sqrt(mean_squared_error(y, xgb.predict(X)))

    # Random Forest Model
    rf  = RandomForestRegressor(n_estimators=150, max_depth=15, random_state=1)
    rf.fit(X, y)
    series_dict['Random Forest'] = rf.score(X,y)
    series_dict['RF (RMSE)'] = np.sqrt(mean_squared_error(y, rf.predict(X)))

    # OLS Model (separate name for the design matrix so X stays the raw features)
    X_ols = sm.add_constant(X)
    ols = sm.OLS(y, np.asarray(X_ols))
    results = ols.fit()
    series_dict['OLS'] = results.rsquared

    return pd.Series(series_dict)

In [12]:
# Score the three models separately for each urban area listed in cities20.
urban_df20 = (
    data.loc[data['Urban_name'].isin(cities20)]
    .groupby(['Urban_name'])
    .apply(get_scores)
    .reset_index()
)

In [13]:
# Attach each urban area's population columns to its model scores.
urban_df20 = pd.merge(urban_df20, pop_df, on='Urban_name', how='left')

In [14]:
# Round the OLS and random-forest metrics to two decimals; the XGBoost columns are left as computed.
for metric_col in ('OLS', 'Random Forest', 'RF (RMSE)'):
    urban_df20[metric_col] = urban_df20[metric_col].round(2)

In [15]:
# Order the table from most to least populous and renumber the rows from 0.
urban_df20 = (
    urban_df20
    .sort_values(by='Total Population', ascending=False)
    .reset_index(drop=True)
)
urban_df20

Unnamed: 0,Urban_name,XGBoost,XGBoost (RMSE),Random Forest,RF (RMSE),OLS,Total Population,Total Population (in millions)
0,"New York--Newark, NY--NJ--CT",0.999937,0.003488,0.97,0.08,0.74,19063826.0,19.1
1,"Los Angeles--Long Beach--Anaheim, CA",0.999995,0.003509,0.94,0.38,0.4,12717924.0,12.7
2,"Chicago, IL--IN",0.999983,0.001262,0.92,0.09,0.34,8836803.0,8.8
3,"Dallas--Fort Worth--Arlington, TX",0.999978,0.001222,0.93,0.07,0.35,6204081.0,6.2
4,"Miami, FL",0.999965,0.000823,0.92,0.04,0.19,6051748.0,6.1
5,"Houston, TX",0.999984,0.001069,0.97,0.04,0.67,5859524.0,5.9
6,"Philadelphia, PA--NJ--DE--MD",0.999992,0.00108,0.99,0.05,0.87,5846088.0,5.8
7,"Atlanta, GA",0.999988,0.001063,0.98,0.04,0.85,5482880.0,5.5
8,"Washington, DC--VA--MD",0.999991,0.001121,0.98,0.05,0.86,5263796.0,5.3
9,"Boston, MA--NH--RI",0.999993,0.00119,0.97,0.08,0.71,4773811.0,4.8


In [16]:
# Lowest and highest OLS value across the scored urban areas.
ols_r2 = urban_df20['OLS']
(ols_r2.min(), ols_r2.max())

(0.19, 0.95)

In [17]:
# Columns left out of the exported comparison table.
drop_cols = ['Total Population', 'XGBoost (RMSE)']
# Create the output folder first so the export also works when 'tables/'
# does not exist yet (to_csv does not create missing directories).
os.makedirs('tables', exist_ok=True)
urban_df20.drop(columns=drop_cols).to_csv('tables/urban_level_model_comparison.csv', index=False)