# Urban-Level Model Comparison

We perform two urban-level modeling experiments, comparing a baseline OLS model to a baseline XGBoost model with 100 estimators and a maximum depth of 10. The second experiment additionally includes a Random Forest with 150 estimators and a maximum depth of 15. To maximize the policy relevance of this assessment, we examine the top 20 urban areas by population.

1. We fit both baseline models separately to the full data of each urban area and report the in-sample $R^2$ (each model is scored on the same data it was fit to). This experiment is relevant to urban planners who are curious about whether machine learning models may be useful for assessing heat mitigation policy.

2. We fit the models to the full data except for one held-out urban area, and then evaluate them on that held-out area. That is, we examine how well XGBoost generalizes compared with OLS in our case: can a model accurately predict on data from a previously unseen city? This experiment is relevant from a scientific perspective.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Show every column and every row when a DataFrame is displayed (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Data

In [2]:
# Load the full dataset. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, so each column gets a single inferred dtype.
data = pd.read_csv('data/data.csv', low_memory=False)

In [3]:
# Encode the coastal flag as 0/1; missing values are treated as "not coastal".
data['Coastal?'] = data['Coastal?'].fillna(False).astype(int)

# Integer codes for the climate-zone labels.
mapping_dict = {'Arid': 0, 'Snow': 1, 'Temperate': 2, 'Tropical': 3}
# Series.map turns every value that is not a key of mapping_dict into NaN, so
# mapping an already-encoded column would silently wipe the feature out.
# Only encode while the column still holds the raw labels, which keeps this
# cell safe to re-run without reloading the data.
if not pd.api.types.is_numeric_dtype(data['Climate Zone']):
    data['Climate Zone'] = data['Climate Zone'].map(mapping_dict)

In [4]:
# Predictor columns used by every model in this notebook.
features = ['Built Fraction', 'Grass Fraction', 'Tree Fraction', 
            'Built Albedo', 'Grass Albedo', 'Tree Albedo', 
            'Elevation', 'Climate Zone', 'Coastal?']
# Target column that the models are fit to predict.
label = 'CUHI Day'

In [5]:
# Total population per urban area, keeping only the 20 most populous areas.
pop_df = (
    data.groupby(['Urban_name'])['Total Population']
    .sum()
    .to_frame()
    .sort_values(by='Total Population', ascending=False)
    .head(20)
    .reset_index()
)
# Same totals expressed in millions, rounded to one decimal for presentation.
pop_df['Total Population (in millions)'] = pop_df['Total Population'].div(1000000).round(1)

In [6]:
# Combined population of the 20 selected urban areas.
pop_df['Total Population'].sum()

111729797.0

111,729,797

In [7]:
# Display the 20 most populous urban areas with their totals.
pop_df

Unnamed: 0,Urban_name,Total Population,Total Population (in millions)
0,"New York--Newark, NY--NJ--CT",19063826.0,19.1
1,"Los Angeles--Long Beach--Anaheim, CA",12717924.0,12.7
2,"Chicago, IL--IN",8836803.0,8.8
3,"Dallas--Fort Worth--Arlington, TX",6204081.0,6.2
4,"Miami, FL",6051748.0,6.1
5,"Houston, TX",5859524.0,5.9
6,"Philadelphia, PA--NJ--DE--MD",5846088.0,5.8
7,"Atlanta, GA",5482880.0,5.5
8,"Washington, DC--VA--MD",5263796.0,5.3
9,"Boston, MA--NH--RI",4773811.0,4.8


In [8]:
# Names of the 20 selected urban areas, ordered from most to least populous.
cities20 = pop_df['Urban_name'].to_list()

## Planning Relevant Models

In [9]:
def get_scores(group):
    """Fit XGBoost and OLS on one group of rows and report each model's R^2.

    Both models are scored on the same rows they were fitted to, so the
    returned values are in-sample R^2, not held-out performance.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to model. Must contain the module-level ``features`` columns and
        the ``label`` column; rows with a missing value in any of them are
        dropped before fitting.

    Returns
    -------
    pandas.Series
        Entries ``'XGBoost'`` and ``'OLS'`` holding the respective R^2.
    """
    complete_rows = group[features + [label]].dropna()
    predictors = complete_rows[features]
    target = complete_rows[label]

    # XGBoost: fit, then score on the training rows themselves.
    booster = XGBRegressor(n_estimators=100, max_depth=10, importance_type='gain',
                           random_state=1, tree_method='hist', grow_policy = 'lossguide')
    booster.fit(predictors, target)
    xgb_r2 = booster.score(predictors, target)

    # OLS via statsmodels on a plain array of the predictors.
    # NOTE(review): add_constant is called with its defaults; per the
    # statsmodels docs the default has_constant='skip' leaves the data
    # unchanged when a constant column is already present -- confirm that is
    # the intended handling for groups where a feature does not vary.
    design_matrix = np.asarray(sm.add_constant(predictors))
    ols_r2 = sm.OLS(target, design_matrix).fit().rsquared

    return pd.Series({'XGBoost': xgb_r2, 'OLS': ols_r2})

In [10]:
# Score each of the 20 selected urban areas separately: get_scores is applied
# once per city, giving one row of in-sample R^2 values per urban area.
urban_df20 = (
    data[data['Urban_name'].isin(cities20)]
    .groupby(['Urban_name'])
    .apply(get_scores)
    .reset_index()
)

In [12]:
# Attach each city's population columns to its model scores.
urban_df20 = urban_df20.merge(pop_df, how='left', on='Urban_name')

In [13]:
# Round the OLS R^2 to two decimals (the XGBoost column is left unrounded).
urban_df20['OLS'] = urban_df20['OLS'].round(2)

In [14]:
# Order cities from most to least populous and renumber the rows from 0.
urban_df20 = (
    urban_df20
    .sort_values(by='Total Population', ascending=False)
    .reset_index(drop=True)
)
urban_df20

Unnamed: 0,Urban_name,XGBoost,OLS,Total Population,Total Population (in millions)
0,"New York--Newark, NY--NJ--CT",0.999938,0.74,19063826.0,19.1
1,"Los Angeles--Long Beach--Anaheim, CA",0.999992,0.4,12717924.0,12.7
2,"Chicago, IL--IN",0.999985,0.34,8836803.0,8.8
3,"Dallas--Fort Worth--Arlington, TX",0.999982,0.35,6204081.0,6.2
4,"Miami, FL",0.999942,0.19,6051748.0,6.1
5,"Houston, TX",0.999984,0.67,5859524.0,5.9
6,"Philadelphia, PA--NJ--DE--MD",0.999992,0.87,5846088.0,5.8
7,"Atlanta, GA",0.999989,0.85,5482880.0,5.5
8,"Washington, DC--VA--MD",0.999993,0.86,5263796.0,5.3
9,"Boston, MA--NH--RI",0.999993,0.71,4773811.0,4.8


In [20]:
# Smallest and largest per-city OLS R^2 across the selected urban areas.
urban_df20['OLS'].min(), urban_df20['OLS'].max()

(0.19, 0.95)

In [15]:
# Save the comparison table without the raw population column (the
# in-millions column is kept) and without the row index.
urban_df20.drop(columns='Total Population').to_csv('tables/urban_level_model_comparison.csv', index=False)

## Scientifically Relevant Models

In [9]:
def get_score(city_name):
    """Train on every city except ``city_name`` and evaluate on that city.

    XGBoost, Random Forest and linear-regression models are fitted on all
    complete rows of the module-level ``data`` whose ``'Urban_name'`` differs
    from ``city_name``, then used to predict the complete rows of that city.

    Parameters
    ----------
    city_name : str
        Value of ``'Urban_name'`` identifying the held-out city.

    Returns
    -------
    tuple
        ``(xgb_r2, rf_r2, ols_r2, xgb_r, rf_r, ols_r)``. The ``*_r2`` entries
        are ``r2_score`` values on the held-out city; the ``*_r`` entries are
        the unmodified results of ``scipy.stats.pearsonr`` between the true
        and predicted values (correlation together with its p-value).
    """
    model_columns = features + [label]

    # Training rows: every city except the held-out one, complete cases only.
    train = data.loc[data['Urban_name'] != city_name, model_columns].dropna()
    # Test rows: only the held-out city, complete cases only.
    test = data.loc[data['Urban_name'] == city_name, model_columns].dropna()

    train_X, train_y = train[features], train[label]
    X_test, y_test = test[features], test[label]

    # Order matters for the return value: XGBoost, Random Forest, OLS.
    models = (
        XGBRegressor(n_estimators=100, max_depth=10, importance_type='gain',
                     random_state=1, tree_method='hist', grow_policy = 'lossguide'),
        RandomForestRegressor(n_estimators=150, max_depth=15, random_state=1),
        LinearRegression(),
    )
    predictions = [model.fit(train_X, train_y).predict(X_test) for model in models]

    r2_values = tuple(r2_score(y_test, predicted) for predicted in predictions)
    r_values = tuple(pearsonr(y_test, predicted) for predicted in predictions)
    return r2_values + r_values

In [10]:
# Run the held-out-city evaluation once per city. Each result is the tuple
# (xgb_r2, rf_r2, ols_r2, xgb_r, rf_r, ols_r) returned by get_score.
city_scores = [get_score(city) for city in cities20]

# Split the per-city tuples into one list per metric, in city order.
xgb_r2_list = [scores[0] for scores in city_scores]
rf_r2_list  = [scores[1] for scores in city_scores]
ols_r2_list = [scores[2] for scores in city_scores]
xgb_r_list  = [scores[3] for scores in city_scores]
rf_r_list   = [scores[4] for scores in city_scores]
ols_r_list  = [scores[5] for scores in city_scores]

In [13]:
# First look at the held-out-city results. The "R" columns here hold the raw
# pearsonr result for each city, i.e. a (correlation, p-value) pair rather
# than a single number.
pd.DataFrame({'City': cities20, 
              'XGBoost $R^2$': xgb_r2_list, 'XGBoost R': xgb_r_list,
              'Random Forest $R^2$': rf_r2_list, 'Random Forest R': rf_r_list,
              'OLS $R^2$': ols_r2_list, 'OLS R': ols_r_list})

Unnamed: 0,City,XGBoost $R^2$,XGBoost R,Random Forest $R^2$,Random Forest R,OLS $R^2$,OLS R
0,"New York--Newark, NY--NJ--CT",0.521706,"(0.7442542493977425, 0.0)",0.643699,"(0.8027348744365769, 0.0)",0.579396,"(0.7728273075990202, 0.0)"
1,"Los Angeles--Long Beach--Anaheim, CA",-0.012729,"(0.37443660411460533, 9.96938962473389e-85)",-0.132649,"(0.5006317672412399, 4.782586937656583e-160)",-0.412763,"(-0.204690418577913, 2.9807866584096005e-25)"
2,"Chicago, IL--IN",-0.465546,"(0.4417894678674982, 1.7118708525643554e-99)",-0.06686,"(0.529597346565025, 8.333979481018061e-150)",-0.116304,"(0.509746971912327, 4.1112422511625454e-137)"
3,"Dallas--Fort Worth--Arlington, TX",-1.235063,"(0.3558614800014646, 7.803615901387313e-35)",-0.985109,"(0.4006175567159353, 1.681548217618143e-44)",-0.051996,"(0.3864409865221252, 2.833359260694313e-41)"
4,"Miami, FL",-22.471168,"(0.20259211229882584, 7.106589443191013e-10)",-30.368005,"(0.22289200154456132, 1.0715937097825549e-11)",-5.511818,"(0.3897316773431084, 2.412721420846994e-34)"
5,"Houston, TX",-0.909985,"(0.3899600203954865, 2.812398318729496e-34)",-0.29321,"(0.4409170333432517, 2.1953383109047233e-44)",0.230194,"(0.5590061557633156, 1.3365718601188656e-75)"
6,"Philadelphia, PA--NJ--DE--MD",0.357548,"(0.6821349687217634, 1.8026656981132483e-191)",0.635326,"(0.8017922764458966, 1.5567278405e-313)",0.653934,"(0.8173109990311416, 0.0)"
7,"Atlanta, GA",0.622843,"(0.8003275377172065, 2.256657673719534e-198)",0.675584,"(0.8438051413179679, 6.598445925339356e-241)",0.452969,"(0.6970070261266099, 1.103781843703022e-129)"
8,"Washington, DC--VA--MD",0.619343,"(0.7940616117555818, 1.8182595995036482e-250)",0.725621,"(0.8542964716876428, 0.0)",0.502954,"(0.7222591367813886, 4.81512623290949e-186)"
9,"Boston, MA--NH--RI",0.589406,"(0.7901614819188139, 1.1605704807003702e-208)",0.658586,"(0.819014383661138, 1.703502012998299e-236)",0.488966,"(0.7945114277198152, 1.4360450333400072e-212)"


In [14]:
# Keep only the first element (the correlation) of each Random Forest pearsonr result.
[x[0] for x in rf_r_list]

[0.8027348744365769,
 0.5006317672412399,
 0.529597346565025,
 0.4006175567159353,
 0.22289200154456132,
 0.4409170333432517,
 0.8017922764458966,
 0.8438051413179679,
 0.8542964716876428,
 0.819014383661138,
 0.9020259436251542,
 0.917843742382367,
 0.37675818419578083,
 0.5911007543312277,
 -0.17559670978599473,
 0.7896104496779703,
 0.5575708979197277,
 0.8624248976856114,
 0.8551424641544209,
 0.8621199643747428]

In [15]:
# Held-out-city results with the "R" columns reduced to the correlation
# coefficient alone (the p-value from each pearsonr result is discarded).
pd.DataFrame({
    'City': cities20,
    'XGBoost $R^2$': xgb_r2_list,
    'XGBoost R': [r for r, _p in xgb_r_list],
    'Random Forest $R^2$': rf_r2_list,
    'Random Forest R': [r for r, _p in rf_r_list],
    'OLS $R^2$': ols_r2_list,
    'OLS R': [r for r, _p in ols_r_list],
})

Unnamed: 0,City,XGBoost $R^2$,XGBoost R,Random Forest $R^2$,Random Forest R,OLS $R^2$,OLS R
0,"New York--Newark, NY--NJ--CT",0.521706,0.744254,0.643699,0.802735,0.579396,0.772827
1,"Los Angeles--Long Beach--Anaheim, CA",-0.012729,0.374437,-0.132649,0.500632,-0.412763,-0.20469
2,"Chicago, IL--IN",-0.465546,0.441789,-0.06686,0.529597,-0.116304,0.509747
3,"Dallas--Fort Worth--Arlington, TX",-1.235063,0.355861,-0.985109,0.400618,-0.051996,0.386441
4,"Miami, FL",-22.471168,0.202592,-30.368005,0.222892,-5.511818,0.389732
5,"Houston, TX",-0.909985,0.38996,-0.29321,0.440917,0.230194,0.559006
6,"Philadelphia, PA--NJ--DE--MD",0.357548,0.682135,0.635326,0.801792,0.653934,0.817311
7,"Atlanta, GA",0.622843,0.800328,0.675584,0.843805,0.452969,0.697007
8,"Washington, DC--VA--MD",0.619343,0.794062,0.725621,0.854296,0.502954,0.722259
9,"Boston, MA--NH--RI",0.589406,0.790161,0.658586,0.819014,0.488966,0.794511


In [16]:
# Build the held-out-city results table (correlation coefficient only in the
# "R" columns), round everything to two decimals and write it to disk.
test_set_table = pd.DataFrame({
    'City': cities20,
    'XGBoost $R^2$': xgb_r2_list,
    'XGBoost R': [pair[0] for pair in xgb_r_list],
    'Random Forest $R^2$': rf_r2_list,
    'Random Forest R': [pair[0] for pair in rf_r_list],
    'OLS $R^2$': ols_r2_list,
    'OLS R': [pair[0] for pair in ols_r_list],
})
test_set_table.round(2).to_csv('tables/urban_comparison_test_set.csv', index=False)
