# Preprocessing

### Prerequisites

**Imports**

In [4]:
import os
import sys
import warnings
import requests
import pandas as pd
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt

**Fix directories, define path variables**

In [None]:
# Make sure the notebook is run from the src directory: all paths below are
# relative to it. If we are somewhere else, try to enter ./src (this works when
# the notebook was started from the project root).
cwd = os.getcwd()
if os.path.basename(cwd) != 'src':
    try:
        os.chdir('src')
    except FileNotFoundError:
        print('Error: please run from src dir or project root')
        sys.exit(1)


def path(x: str) -> str:
    """Turn a '/'-separated path into an OS-specific one that ends with a separator."""
    return os.path.join(*x.split('/')) + os.sep


# define paths
data_path = path('../data')
csv_path = path('../data/csv')
shp_path = path('../data/shapefiles')
plot_path = path('../plots')

# Create the directories that are missing. The loop variable is deliberately not
# called `path`, otherwise the helper above would be overwritten by a string.
for directory in [data_path, csv_path, shp_path, plot_path]:
    os.makedirs(directory, exist_ok=True)

---

### Read data

**Function that downloads a dataset (if it is not on disk yet) and loads it as a GeoDataFrame**

In [None]:
def _download_file(url: str, target: str) -> None:
    """Stream the response of ``url`` into the file ``target``.

    If the server does not answer with status 200, the status code is printed
    and the interpreter exits with status 1. The payload is written to
    ``target + '.part'`` first and only renamed to ``target`` once it has been
    received completely, so an interrupted download is never mistaken for a
    finished one by a later existence check.
    """
    partial = target + '.part'
    # (connect, read) timeouts: without them a stalled connection blocks forever
    with requests.get(url, stream=True, timeout=(30, 600)) as r:
        if r.status_code != 200:
            print(f'Error: {r.status_code}')
            sys.exit(1)
        # write raw bytes in chunks: keeps memory usage flat and avoids
        # re-encoding the text with the platform's default encoding
        with open(partial, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial, target)


def load_data(name: str) -> gpd.GeoDataFrame:
    """Load one of the datasets as a GeoDataFrame in EPSG:4326, downloading it if needed.

    Args:
        name: Which dataset to load, one of 'trees', 'traffic' or 'streets'.

    Returns:
        The dataset as a GeoDataFrame. For 'traffic', only rows of 2017 are
        kept and reduced to one row per (geometry, Direction) pair, carrying
        the mean volume in the new column 'Avg_Vol'.

    Raises:
        AssertionError: If ``name`` is not one of the known dataset names.
        FileNotFoundError: If the streets directory contains no .shp file.
        SystemExit: If a required download does not return HTTP status 200.
    """
    import zipfile  # only used to unpack the streets archive

    assert name in ['trees', 'traffic', 'streets'], 'Invalid dataset name'
    api = 'https://data.cityofnewyork.us/api/'
    lookup = {
        'trees': {
            'path': csv_path + '2015_Street_Tree_Census_-_Tree_Data.csv',
            'link': f'{api}views/uvpi-gqnh/rows.csv?accessType=DOWNLOAD',
        },
        'traffic': {
            'path': csv_path + 'Automated_Traffic_Volume_Counts.csv',
            'link': f'{api}views/7ym2-wayt/rows.csv?accessType=DOWNLOAD',
        },
        'streets': {
            'path': shp_path + 'NYC Street Centerline (CSCL)',  # this is a dir!
            'link': f'{api}geospatial/exjm-f27b?method=export&format=Shapefile',
        },
    }
    target = lookup[name]['path']
    link = lookup[name]['link']

    if not os.path.exists(target):
        print(f'Could not find {name} data, downloading...')
        if name == 'streets':
            # The archive is stored next to (not inside) the target directory,
            # so the directory only comes into existence when extraction starts.
            archive = target + '.zip'
            _download_file(link, archive)
            with zipfile.ZipFile(archive) as zf:
                zf.extractall(target)
            os.remove(archive)
        else:
            _download_file(link, target)
        print(f'Download complete. Data saved to {target}.')
    else:
        print(f'Found {name} data, loading from disk...')

    if name == 'trees':
        df = pd.read_csv(target)
        gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df['longitude'], df['latitude']), crs='epsg:4326')
    elif name == 'traffic':
        df = pd.read_csv(target, index_col='RequestID').rename(columns={'WktGeom': 'geometry'})
        # only keep data of 2017; copy so that the new column below is added to
        # an independent frame instead of a slice (avoids SettingWithCopyWarning)
        df = df[df['Yr'] == 2017].copy()
        # create a new column with the average traffic volume for each measurement point
        df['Avg_Vol'] = df.groupby(['geometry', 'Direction'])['Vol'].transform('mean')
        # drop unnecessary columns and rows
        df = df.drop(['Yr', 'M', 'D', 'HH', 'MM', 'Vol'], axis=1)
        df = df.drop_duplicates(subset=['geometry', 'Direction'])
        # convert geometry column to shapely geometry
        df['geometry'] = df['geometry'].apply(wkt.loads)
        # convert geometry from epsg 2263 (NAD83 / New York Long Island (ftUS)) to epsg 4326 (WGS 84)
        gdf = gpd.GeoDataFrame(df, geometry='geometry', crs='epsg:2263').to_crs(epsg=4326)
    elif name == 'streets':
        # every download gets a different hash in its file names, so we look
        # for the .shp file instead of hard-coding its name
        shapefiles = sorted(f for f in os.listdir(target) if f.endswith('.shp'))
        if not shapefiles:
            raise FileNotFoundError(
                f'No .shp file found in {target}; delete the directory and re-run to download it again.'
            )
        gdf = gpd.read_file(os.path.join(target, shapefiles[0])).to_crs(epsg=4326)

    return gdf

**Call function to get all gdfs**

In [None]:
# Load the three datasets; load_data downloads whatever is not on disk yet.
TREES = load_data('trees')
TRAFFIC = load_data('traffic')
STREETS = load_data('streets')

**Trees**

In [None]:
# show the trees GeoDataFrame as the cell output
TREES

**Traffic**

In [None]:
# show the traffic GeoDataFrame as the cell output
TRAFFIC

**Streets**

In [None]:
# show the streets GeoDataFrame as the cell output
STREETS

---

### Visualize data

First, we plot the traffic measurement locations on top of the street network.

In [None]:
# City-wide overview: the street network as a grey backdrop, with the traffic
# measurement points drawn on top and coloured by their average volume.
fig, ax = plt.subplots(figsize=(10, 10))
STREETS.plot(ax=ax, color='grey', alpha=0.5, zorder=1)
TRAFFIC.plot(
    ax=ax,
    column='Avg_Vol',
    cmap='viridis_r',
    markersize=5,
    zorder=2,
    legend=True,
    legend_kwds={'label': 'Average Traffic Volume', 'orientation': 'horizontal', 'shrink': 0.8},
)
ax.set_axis_off()
# the trailing semicolon keeps the notebook from echoing the return value
ax.set_title('Average Traffic Volume in NYC in 2017');

**Manhattan**

We restrict the scope of the analysis to Manhattan, because its traffic datapoints are relatively dense.

In [None]:
# Keep only the Manhattan rows of each dataset (each one labels the borough
# in a differently named column).
STREETS_ = STREETS[STREETS['borocode'] == '1']
TRAFFIC_ = TRAFFIC[TRAFFIC['Boro'] == 'Manhattan']
TREES_ = TREES[TREES['borough'] == 'Manhattan']

# Report how many rows of every dataset survive the restriction.
for name, df, df_ in (
    ('streets', STREETS, STREETS_),
    ('traffic', TRAFFIC, TRAFFIC_),
    ('trees', TREES, TREES_),
):
    before, after = df.shape[0], df_.shape[0]
    print(f'{name}: {before} -> {after} ({after/before:.2%})')

Visualize again.

In [None]:
# Same map as before, restricted to Manhattan (hence the taller figure).
fig, ax = plt.subplots(figsize=(6, 10))
STREETS_.plot(ax=ax, color='grey', alpha=0.5, zorder=1)
TRAFFIC_.plot(
    ax=ax,
    column='Avg_Vol',
    cmap='viridis_r',
    markersize=5,
    zorder=2,
    legend=True,
    legend_kwds={'label': 'Average Traffic Volume', 'orientation': 'horizontal', 'shrink': 0.5},
)
ax.set_axis_off()
# the trailing semicolon keeps the notebook from echoing the return value
ax.set_title('Manhattan Traffic Volume in 2017');

Now we want to add the tree data to the plot.

In [None]:
# Three layers, bottom to top: streets (grey), trees (faint green dots) and
# traffic measurement points (red scale by average volume).
fig, ax = plt.subplots(figsize=(6, 10))
STREETS_.plot(ax=ax, color='grey', alpha=0.5, zorder=1)
TREES_.plot(ax=ax, color='green', markersize=0.05, alpha=0.1, zorder=2)
TRAFFIC_.plot(
    ax=ax,
    column='Avg_Vol',
    cmap='Reds',
    markersize=5,
    zorder=3,
    legend=True,
    legend_kwds={'label': 'Average Traffic Volume', 'orientation': 'horizontal', 'shrink': 0.5, 'pad': 0},
)
ax.set_axis_off()
ax.set_title('Manhattan Traffic Volume in 2017\nwith Street Trees')
fig.tight_layout()
# store the figure in the plots directory
fig.savefig(f'{plot_path}traffic_trees_manhattan.pdf', dpi=300)

In [None]:
# show the Manhattan streets GeoDataFrame as the cell output
STREETS_

In [None]:
# Drop every street whose centroid lies at or west of longitude -74.035
# (Statue of Liberty and Ellis Island) and report how many were removed.
size_before = STREETS_.shape[0]
east_of_cutoff = STREETS_['geometry'].apply(lambda geom: geom.centroid.x > -74.035)
STREETS_ = STREETS_[east_of_cutoff]
print(f'Removed {size_before - STREETS_.shape[0]} streets')

---

Finally, we save the Manhattan GeoDataFrames as shapefiles for later use.

In [None]:
# Write each Manhattan subset to <shp_path>/<name>/M_<name>.shp
for name, gdf in (('streets', STREETS_), ('traffic', TRAFFIC_), ('trees', TREES_)):
    target_path = f'{shp_path}{name}/'
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    # Some column names get truncated when writing; that is acceptable here,
    # so all warnings are silenced for the duration of the write.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        gdf.to_file(f'{target_path}M_{name}.shp')