# Setup

## Prerequisites

#### Imports

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely import wkt
import matplotlib.pyplot as plt


#### Fix directories, define default variables

In [None]:
# Make sure the notebook is run from the src directory: the relative paths
# defined below ('../data/', '../plots/') are resolved against the current
# working directory.
cwd = os.getcwd()
# basename/normpath instead of splitting on '/' so the check also works when
# the working directory uses Windows path separators
if os.path.basename(os.path.normpath(cwd)) != 'src':
    try:
        # presumably we are in the project root, which contains src/
        os.chdir('src')
    except FileNotFoundError:
        print('Error: please run from src dir or project root')
        sys.exit(1)

# define paths (plain strings with a trailing slash, because file names are
# appended to them by string concatenation)
data_path = '../data/'
csv_path = data_path + 'csv/'
shp_path = data_path + 'shapefiles/'
plot_path = '../plots/'
# exist_ok=True makes this a no-op when the directory already exists and
# avoids the check-then-create race of exists() + mkdir()
os.makedirs(plot_path, exist_ok=True)

## Data

#### Read in data

##### Trees

In [None]:
# Load the 2015 street tree census CSV into a DataFrame.
trees_csv = csv_path + '2015_Street_Tree_Census_-_Tree_Data.csv'
df_trees = pd.read_csv(trees_csv)

In [None]:
# Print a summary of the tree data: column names, non-null counts, dtypes
# and memory usage.
df_trees.info()

In [None]:
# Quick spatial overview of the tree data: one point per row, placed by its
# 'longitude' / 'latitude' columns.
fig, ax = plt.subplots(figsize=(5, 5))
# Very small, semi-transparent markers so that heavily overlapping points
# remain distinguishable.
ax.scatter('longitude', 'latitude', data=df_trees, s=1e-5, alpha=0.5)
# Hide the axes (ticks, labels, frame); only the point cloud is shown.
ax.set_axis_off()

In [None]:
# Histogram of each numeric column of the tree data; the trailing semicolon
# keeps the notebook from echoing the returned array of axes.
df_trees.hist(figsize=(15, 15));

##### Traffic

_Create the traffic dataframe._

In [None]:
# Load the automated traffic volume counts, indexed by RequestID; the WKT
# column is renamed to 'geometry' so it can be used as the geometry column.
df_traffic = pd.read_csv(
    csv_path + 'Automated_Traffic_Volume_Counts.csv',
    index_col='RequestID',
).rename(columns={'WktGeom': 'geometry'})
# only keep data of 2017; the explicit copy makes the filtered frame
# independent of the full one, so the column assignment below does not
# trigger pandas' SettingWithCopyWarning
df_traffic = df_traffic.loc[df_traffic['Yr'] == 2017].copy()
# create a new column with the average traffic volume for each measurement point
# (rows sharing the same geometry string are treated as the same point)
df_traffic['Avg_Vol'] = df_traffic.groupby('geometry')['Vol'].transform('mean')
# drop unnecessary columns and rows: the per-reading fields are not needed
# once the average is stored, and one row per measurement point is enough
df_traffic = df_traffic.drop(columns=['Yr', 'M', 'D', 'HH', 'MM', 'Vol'])
# copy again for the same reason as above: the geometry column is reassigned next
df_traffic = df_traffic.drop_duplicates(subset='geometry').copy()
# convert geometry column from WKT strings to shapely geometry objects
df_traffic['geometry'] = df_traffic['geometry'].apply(wkt.loads)

In [None]:
# Wrap the traffic dataframe in a GeoDataFrame, using the parsed 'geometry'
# column as the active geometry.
# NOTE(review): no crs is passed here, so the GeoDataFrame carries no
# coordinate reference system - confirm which CRS the WKT coordinates use
# before reprojecting or combining with other layers.
gdf_traffic = gpd.GeoDataFrame(
    df_traffic,
    geometry='geometry',
)

In [None]:
# show the 10 measurement points with the highest average traffic volume
# (as the last expression of the cell, the result is displayed by the notebook)
gdf_traffic.nlargest(10, 'Avg_Vol')

In [None]:
# plot the traffic data; marker colour encodes the average traffic volume
# (reversed viridis colormap, with a legend), while the marker size is a
# fixed 5 for every point
gdf_traffic.plot(
    figsize=(10, 10),
    column='Avg_Vol',
    legend=True,
    markersize=5,
    cmap='viridis_r',
);

We print how many unique values there are for each column.

In [None]:
# Report the number of distinct values per column, with the column name
# right-aligned in a 10-character field.
for name, series in df_traffic.items():
    print(f'{name:>10}: {len(series.unique())}')