# Preprocessing

### Prerequisites

**Imports**

In [None]:
import os
import sys
import warnings
import pandas as pd
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt

**Fix directories, define path variables**

In [None]:
# Make sure the notebook is run from src.
# os.path.basename respects the platform's path separator, whereas splitting on
# '/' never matches on Windows and would wrongly attempt the chdir below.
cwd = os.getcwd()
if os.path.basename(cwd) != 'src':
    try:
        # started from the project root: step into src so relative paths resolve
        os.chdir('src')
    except FileNotFoundError:
        print('Error: please run from src dir or project root')
        sys.exit(1)

# define paths (all relative to src)
data_path = '../data/'
csv_path = data_path + 'csv/'
shp_path = data_path + 'shapefiles/'
plot_path = '../plots/'
# create the plot directory if it is missing; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir
os.makedirs(plot_path, exist_ok=True)

---

### Read data

**Trees**

In [None]:
# load the street tree census table and show its (rows, columns)
trees_csv = csv_path + '2015_Street_Tree_Census_-_Tree_Data.csv'
df_trees = pd.read_csv(trees_csv)
df_trees.shape

In [None]:
# quick-look scatter of all tree locations using very small, semi-transparent markers
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter('longitude', 'latitude', data=df_trees, s=1e-5, alpha=0.5)
ax.set_axis_off()
# the points come from the 2015 Street Tree Census file, so label the plot with that year
ax.set_title('NYC Street Trees (2015)');

In [None]:
# turn the longitude/latitude columns into point geometries (EPSG 4326)
tree_points = gpd.points_from_xy(df_trees['longitude'], df_trees['latitude'])
gdf_trees = gpd.GeoDataFrame(df_trees, geometry=tree_points, crs=4326)

**Traffic**

In [None]:
# load the raw counts: RequestID becomes the index and the WKT column is
# renamed to 'geometry'
df_traffic = pd.read_csv(
    csv_path+'Automated_Traffic_Volume_Counts.csv',
    index_col = 'RequestID',
).rename(columns={'WktGeom': 'geometry'})

# only keep data of 2017
# (.copy() makes the filtered frame independent of the full table, so the
# column assignments below do not raise SettingWithCopyWarning)
df_traffic = df_traffic[df_traffic['Yr'] == 2017].copy()

# create a new column with the average traffic volume for each measurement point
# (a measurement point is one geometry/direction pair; transform keeps row alignment)
df_traffic['Avg_Vol'] = df_traffic.groupby(['geometry', 'Direction'])['Vol'].transform('mean')

# drop unnecessary columns and rows: the per-reading time/volume columns are
# no longer needed, and one row per measurement point is enough
df_traffic = df_traffic.drop(['Yr', 'M', 'D', 'HH', 'MM','Vol'], axis=1)
df_traffic = df_traffic.drop_duplicates(subset=['geometry', 'Direction'])

# convert geometry column from WKT strings to shapely geometries
df_traffic['geometry'] = df_traffic['geometry'].apply(wkt.loads)

In [None]:
# print the number of distinct values in every non-geometry column
attribute_cols = [name for name in df_traffic.columns if name != 'geometry']
for name in attribute_cols:
    # dropna=False counts missing values as a value, like len(Series.unique())
    n_distinct = df_traffic[name].nunique(dropna=False)
    print(f'{name:>10}: {n_distinct}')

In [None]:
# the traffic geometries are declared as epsg 2263
# (NAD83 / New York Long Island, ftUS) at construction time ...
gdf_traffic = gpd.GeoDataFrame(df_traffic, geometry='geometry', crs=2263)

# ... and then reprojected to epsg 4326
gdf_traffic = gdf_traffic.to_crs(epsg=4326)

**Streets**

In [None]:
# The street centerline export is named after a hash, e.g.
# 'geo_export_f4098162-199c-4100-9c2a-6bc6e35f8734.shp', so look it up
# instead of hard-coding it.
streets_dir = shp_path + 'NYC Street Centerline (CSCL)'

# Only consider geo_export_*.shp entries: os.listdir returns names in arbitrary
# order and may contain unrelated files (e.g. '.DS_Store'), so taking the first
# entry blindly can yield a wrong or empty hash. Sorting makes the pick
# deterministic if several exports are present.
shp_names = sorted(
    fname for fname in os.listdir(streets_dir)
    if fname.startswith('geo_export_') and fname.endswith('.shp')
)
if not shp_names:
    raise FileNotFoundError(f'No geo_export_*.shp file found in {streets_dir!r}')

# strip the fixed prefix and the extension to recover the hash
my_hash = shp_names[0][len('geo_export_'):-len('.shp')]

# read the streets and reproject them to epsg 4326
gdf_streets = gpd.read_file(
    shp_path + f'NYC Street Centerline (CSCL)/geo_export_{my_hash}.shp'
).to_crs(epsg=4326)

---

### Visualize data

First, we plot the measurement locations on the street network.

In [None]:
# colorbar settings for the traffic layer
volume_legend = {
    'label': 'Average Traffic Volume',
    'orientation': 'horizontal',
    'shrink': 0.8,
}

fig, ax = plt.subplots(figsize=(10, 10))

# grey street network as the backdrop (zorder 1)
gdf_streets.plot(ax=ax, color='grey', alpha=0.5, zorder=1)

# measurement points on top (zorder 2), colored by their average volume
gdf_traffic.plot(
    column='Avg_Vol',
    cmap='viridis_r',
    markersize=5,
    legend=True,
    legend_kwds=volume_legend,
    zorder=2,
    ax=ax,
)

ax.set_axis_off()
ax.set_title('Average Traffic Volume in NYC in 2017');

**Manhattan**

We restrict the scope of the analysis to Manhattan, because its traffic datapoints are relatively dense.

In [None]:
# Manhattan subsets: streets are selected by borocode '1', traffic and trees
# by their borough name columns
gdf_streets_f = gdf_streets.loc[gdf_streets['borocode'] == '1']
gdf_traffic_f = gdf_traffic.loc[gdf_traffic['Boro'] == 'Manhattan']
gdf_trees_f = gdf_trees.loc[gdf_trees['borough'] == 'Manhattan']

# report, per dataset, how many rows remain after filtering
full_and_filtered = [
    (gdf_streets, gdf_streets_f),
    (gdf_traffic, gdf_traffic_f),
    (gdf_trees, gdf_trees_f),
]
for full, subset in full_and_filtered:
    n_all = full.shape[0]
    n_kept = subset.shape[0]
    print(f'{n_all} -> {n_kept} ({n_kept/n_all:.2%})')

Visualize again.

In [None]:
# colorbar settings for the Manhattan traffic layer
volume_legend = {
    'label': 'Average Traffic Volume',
    'orientation': 'horizontal',
    'shrink': 0.5,
}

fig, ax = plt.subplots(figsize=(6, 10))

# Manhattan streets as the grey backdrop (zorder 1)
gdf_streets_f.plot(ax=ax, color='grey', alpha=0.5, zorder=1)

# Manhattan measurement points on top (zorder 2), colored by average volume
gdf_traffic_f.plot(
    column='Avg_Vol',
    cmap='viridis_r',
    markersize=5,
    legend=True,
    legend_kwds=volume_legend,
    zorder=2,
    ax=ax,
)

ax.set_axis_off()
ax.set_title('Manhattan Traffic Volume in 2017');

Top 10 busiest measurement points in Manhattan, ranked by average traffic volume.

In [None]:
# rank the Manhattan measurement points by average volume, highest first,
# and show the first ten rows
traffic_by_volume = gdf_traffic_f.sort_values(by='Avg_Vol', ascending=False)
traffic_by_volume.head(10)

Now we want to add the tree data to the plot.

In [None]:
# colorbar settings for the traffic layer (pad=0 keeps the bar close to the map)
volume_legend = {
    'label': 'Average Traffic Volume',
    'orientation': 'horizontal',
    'shrink': 0.5,
    'pad': 0,
}

fig, ax = plt.subplots(figsize=(6, 10))

# layer 1: grey street network
gdf_streets_f.plot(ax=ax, color='grey', alpha=0.5, zorder=1)

# layer 2: street trees as faint green dots
gdf_trees_f.plot(ax=ax, color='green', markersize=0.05, alpha=0.1, zorder=2)

# layer 3: traffic measurement points, colored by average volume
gdf_traffic_f.plot(
    column='Avg_Vol',
    cmap='Reds',
    markersize=5,
    legend=True,
    legend_kwds=volume_legend,
    zorder=3,
    ax=ax,
)

ax.set_axis_off()
ax.set_title('Manhattan Traffic Volume in 2017\nwith Street Trees')

# tighten the layout, then write the figure into the plot directory
fig.tight_layout()
out_file = plot_path + 'traffic_trees_manhattan.png'
fig.savefig(out_file, dpi=300)

In [None]:
# display the filtered (borocode '1') street segments for inspection
gdf_streets_f

In [None]:
# drop every street whose centroid does not lie east of longitude -74.035
# (Statue of Liberty and Ellis Island)
size_before = len(gdf_streets_f)
east_of_cutoff = gdf_streets_f['geometry'].apply(lambda geom: geom.centroid.x > -74.035)
gdf_streets_f = gdf_streets_f[east_of_cutoff]
n_removed = size_before - len(gdf_streets_f)
print(f'Removed {n_removed} streets')

---

Finally, we save the Manhattan gdfs to shapefiles for later use.

In [None]:
# write each Manhattan layer to <shp_path>/<name>/M_<name>.shp
for gdf, name in zip([gdf_streets_f, gdf_traffic_f, gdf_trees_f], ['streets', 'traffic', 'trees']):
    target_path = shp_path + name + '/'
    # create the layer directory if it is missing; exist_ok avoids the
    # check-then-create race of os.path.exists + os.mkdir
    os.makedirs(target_path, exist_ok=True)
    # some column names will be truncated, but it's no big deal
    # NOTE(review): this silences every warning raised during the write, not
    # only the truncation one; narrow the filter if other warnings matter
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        gdf.to_file(target_path + f'M_{name}.shp')