In [1]:
import folium
import geopandas as gpd
import pandas as pd
import numpy as np
import math

In [None]:
# Zone lookup table (CSV) that ships alongside the taxi-zone shapefile.
zones = pd.read_csv("../data/raw/taxi_zones/taxi+_zone_lookup.csv")
# Taxi-zone shapes; `sf` is joined on its LocationID column further down.
sf = gpd.read_file("../data/raw/taxi_zones/taxi_zones.shp")

# Preview the first rows of the shapefile.
sf.head()

In [None]:
# LR r-square table
# Load the three inputs in one pass:
#   df1 - linear-regression test predictions (its column '0' is used below)
#   df2 - supplies the actual 'income_rate' values
#   df3 - supplies the 'PULocationID' of each row
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/curated/LR_test_prediction.csv',
        '../data/curated/y_test',
        '../data/curated/X_test',
    )
)

In [None]:
# Combine the three loaded frames into one table holding, per row, the
# predicted income rate, the actual income rate and the PULocationID.
# Rows are matched on the index (inner join), and the prediction column '0'
# is given a descriptive name.
lr_test = (
    df1[['0']]
    .merge(df2[['income_rate']], left_index=True, right_index=True)
    .merge(df3[['PULocationID']], left_index=True, right_index=True)
    .rename(columns={'0': 'predicted_income_rate'})
)

In [None]:
# Calculate the R-squared of the linear-regression predictions separately for
# every pickup zone (PULocationID 1..263):
#     R^2 = 1 - SS_res / SS_tot
# SS_res is the sum of squared prediction errors and SS_tot the sum of squared
# deviations of the actual values from their zone mean.
# A zone gets 0 when it has no rows, or when SS_tot is not positive (all actual
# values identical, or not comparable because of NaN), since R^2 is undefined.
lr_rsq = []
for location_id in range(1, 264):
    zone_rows = lr_test.loc[lr_test['PULocationID'] == location_id]
    zone_r2 = 0
    if len(zone_rows) > 0:
        # Work on plain arrays instead of adding columns to the filtered slice,
        # which would trigger pandas' SettingWithCopyWarning.
        actual = zone_rows['income_rate'].to_numpy()
        predicted = zone_rows['predicted_income_rate'].to_numpy()
        ss_res = np.sum((actual - predicted) ** 2)
        ss_tot = np.sum((actual - np.mean(actual)) ** 2)
        if ss_tot > 0:
            zone_r2 = 1 - ss_res / ss_tot
    lr_rsq.append(zone_r2)

In [None]:
# Put the calculated per-zone R-squared values into a dataframe, one row per
# PULocationID from 1 to 263 (same order as the values were appended).
lr_map_data = pd.DataFrame(
    data={'PULocationID': range(1, 264), 'r2': lr_rsq},
)
lr_map_data

In [None]:
# Clamp negative R-squared values to 0.
# Only the 'r2' column is overwritten: assigning through a row-only boolean
# mask would also set PULocationID to 0 on those rows, so they would no longer
# match any LocationID when joined with the zone shapes.
lr_map_data.loc[lr_map_data['r2'] < 0, 'r2'] = 0

In [None]:
# RFR r-square table
# Load the random-forest predictions; their 'predicted_income_rate' column is
# used below.
df4 = pd.read_csv(filepath_or_buffer='../data/curated/RFR_test_prediction.csv')

In [None]:
# Combine the random-forest predictions with the actual income rate and the
# PULocationID into one table for the random forest regression.
# Rows are matched on the index (inner join).
rfr_test = (
    df4[['predicted_income_rate']]
    .merge(df2[['income_rate']], left_index=True, right_index=True)
    .merge(df3[['PULocationID']], left_index=True, right_index=True)
)

In [None]:
# Calculate the R-squared of the random-forest predictions separately for
# every pickup zone (PULocationID 1..263):
#     R^2 = 1 - SS_res / SS_tot
# SS_res is the sum of squared prediction errors and SS_tot the sum of squared
# deviations of the actual values from their zone mean.
# A zone gets 0 when it has no rows, or when SS_tot is not positive (all actual
# values identical, or not comparable because of NaN), since R^2 is undefined.
rfr_rsq = []
for location_id in range(1, 264):
    zone_rows = rfr_test.loc[rfr_test['PULocationID'] == location_id]
    zone_r2 = 0
    if len(zone_rows) > 0:
        # Work on plain arrays instead of adding columns to the filtered slice,
        # which would trigger pandas' SettingWithCopyWarning.
        actual = zone_rows['income_rate'].to_numpy()
        predicted = zone_rows['predicted_income_rate'].to_numpy()
        ss_res = np.sum((actual - predicted) ** 2)
        ss_tot = np.sum((actual - np.mean(actual)) ** 2)
        if ss_tot > 0:
            zone_r2 = 1 - ss_res / ss_tot
    rfr_rsq.append(zone_r2)

In [None]:
# Put the calculated per-zone R-squared values into a dataframe, one row per
# PULocationID from 1 to 263 (same order as the values were appended).
rfr_map_data = pd.DataFrame(
    data={'PULocationID': range(1, 264), 'r2': rfr_rsq},
)
rfr_map_data

In [None]:
# Clamp negative R-squared values to 0.
# Only the 'r2' column is overwritten: assigning through a row-only boolean
# mask would also set PULocationID to 0 on those rows, so they would no longer
# match any LocationID when joined with the zone shapes.
rfr_map_data.loc[rfr_map_data['r2'] < 0, 'r2'] = 0

In [None]:
# Re-project the zone geometries to longitude/latitude on the WGS84 datum,
# then preview the result.
sf['geometry'] = sf['geometry'].to_crs(
    crs="+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
)
sf.head()

In [None]:
# Map data for the linear regression: attach each zone's R-squared to its
# geometry. The right join keeps every row of `sf`, so zones without a
# matching PULocationID are still present (with missing r2).
lr_gdf = gpd.GeoDataFrame(
    lr_map_data.merge(
        sf,
        how='right',
        left_on='PULocationID',
        right_on='LocationID',
    )
)
lr_gdf

In [None]:
# Geometry layer for the choropleth: zone ID plus shape only.
geoJSON = lr_gdf[['LocationID', 'geometry']]

# NOTE(review): "Stamen Terrain" may not be available as a built-in tile set in
# newer folium releases - confirm against the installed version.
lr_map = folium.Map(
    location=[40.66, -73.94],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# Colour each zone by its linear-regression R-squared; the data rows are
# matched to the shapes through PULocationID == properties.LocationID, and
# zones without a value are drawn in black.
lr_choropleth = folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=lr_gdf,
    columns=['PULocationID', 'r2'],
    key_on='properties.LocationID',
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='R2',
)
lr_map.add_child(lr_choropleth)

In [None]:
# Map data for the random forest regression: attach each zone's R-squared to
# its geometry. The right join keeps every row of `sf`, so zones without a
# matching PULocationID are still present (with missing r2).
rfr_gdf = gpd.GeoDataFrame(
    rfr_map_data.merge(
        sf,
        how='right',
        left_on='PULocationID',
        right_on='LocationID',
    )
)
rfr_gdf

In [None]:
# Geometry layer for the choropleth: zone ID plus shape only.
geoJSON = rfr_gdf[['LocationID', 'geometry']]

# NOTE(review): "Stamen Terrain" may not be available as a built-in tile set in
# newer folium releases - confirm against the installed version.
rfr_map = folium.Map(
    location=[40.66, -73.94],
    tiles="Stamen Terrain",
    zoom_start=10,
)

# Colour each zone by its random-forest R-squared; the data rows are matched
# to the shapes through PULocationID == properties.LocationID, and zones
# without a value are drawn in black.
rfr_choropleth = folium.Choropleth(
    geo_data=geoJSON,
    name='choropleth',
    data=rfr_gdf,
    columns=['PULocationID', 'r2'],
    key_on='properties.LocationID',
    fill_color='YlOrRd',
    nan_fill_color='black',
    legend_name='R2',
)
rfr_map.add_child(rfr_choropleth)