# Setup

## Prerequisites

#### Imports

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

#### Fix directories, define default variables

In [None]:
# Make sure the notebook is run from the src directory.
cwd = os.getcwd()
# basename() honours the platform's path separator, unlike splitting on '/'.
if os.path.basename(cwd) != 'src':
    try:
        # Relative chdir: only succeeds when started from the project root.
        os.chdir('src')
    except FileNotFoundError:
        print('Error: please run from src dir or project root')
        sys.exit(1)

# Define paths, relative to src. They are plain strings with a trailing
# slash because file names are appended to them with `+`.
data_path = '../data/'
csv_path = data_path + 'csv/'
shp_path = data_path + 'shapefiles/'
plot_path = '../plots/'
# Create the plot output directory if needed; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(plot_path, exist_ok=True)

## Data

#### Read in data

##### Trees

In [None]:
# Load the 2015 street tree census CSV (located under csv_path) into a DataFrame.
df_trees = pd.read_csv(csv_path + '2015_Street_Tree_Census_-_Tree_Data.csv')

In [None]:
# Print a concise summary of the tree DataFrame (columns, dtypes, non-null counts).
df_trees.info()

In [None]:
# Scatter every tree by its coordinates; the very small, half-transparent
# markers keep the plot legible despite the number of points.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(
    x='longitude',
    y='latitude',
    data=df_trees,
    s=1e-5,
    alpha=0.5,
)
# Hide the axis lines, ticks and labels.
ax.axis('off')

In [None]:
# Draw one histogram per column of the tree data that pandas can plot;
# the trailing semicolon suppresses the cell's textual output in the notebook.
df_trees.hist(figsize=(15, 15));

##### Traffic

For now, we skip the geometry column: reading in just 10 of those values already takes a couple of minutes, let alone all 27.2M.

In [None]:
# Columns to load from the traffic counts file; the geometry column is
# deliberately left out.
traffic_columns = [
    'RequestID',
    'Boro',
    'Yr',
    'M',
    'D',
    'HH',
    'MM',
    'Vol',
    'SegmentID',
    'street',
    'fromSt',
    'toSt',
    'Direction',
]
# Read only the selected columns, indexing the frame by RequestID.
df_traffic = pd.read_csv(
    csv_path + 'Automated_Traffic_Volume_Counts.csv',
    usecols=traffic_columns,
    index_col='RequestID',
)

We print how many unique values there are for each column.

In [None]:
# Report the number of distinct values per traffic column, with the column
# name right-aligned to 10 characters.
for col, series in df_traffic.items():
    # dropna=False counts a missing value as its own distinct value.
    n_unique = series.nunique(dropna=False)
    print(f'{col:>10}: {n_unique}')

In [None]:
# Histogram of the 'Boro' column of the traffic data;
# the trailing semicolon suppresses the cell's textual output in the notebook.
df_traffic['Boro'].hist(figsize=(4, 4));