# Setup

## Prerequisites

#### Imports

In [None]:
import os
import sys
import json
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt

#### Fix directories, define default variables

In [None]:
# make sure the notebook is run from src
# (basename instead of splitting on '/' so the check does not depend on
# the platform's path separator)
cwd = os.getcwd()
if os.path.basename(cwd) != 'src':
    try:
        os.chdir('src')
    except FileNotFoundError:
        print('Error: please run from src dir or project root')
        sys.exit(1)

# define paths (all relative to src)
data_path = '../data/'
csv_path = data_path + 'csv/'
shp_path = data_path + 'shp/'
plot_path = '../plots/'
# create the plot output directory when it is missing; exist_ok makes the
# separate existence check unnecessary
os.makedirs(plot_path, exist_ok=True)

# projection in which trees data is stored: "Amersfoort / RD New"
proj = 28992

## Data

#### Read in data

Tree data, converted to csv from the tif file obtained from [Nationaal Georegister](https://www.nationaalgeoregister.nl/geonetwork/srv/dut/catalog.search#/metadata/89611780-75d6-4163-935f-9bc0a738f7ca).

In [None]:
# load the trees table from the csv directory
df = pd.read_csv(csv_path+'bomenkaart.csv')
# show (rows, columns) as the cell output
df.shape

Shapefile, obtained from [EarthWorks](https://earthworks.stanford.edu/catalog/stanford-gp502yc4422).

In [None]:
# load the ADM0 shapefile and reproject it to `proj`, the projection of the trees data
nl_base = gpd.read_file(shp_path+'ADM0/NLD_adm0.shp').to_crs(epsg=proj)
# show (rows, columns) as the cell output
nl_base.shape

#### Visualize the data on a high level

Spatial plot

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

# base shape as background
nl_base.plot(ax=ax, color='white', edgecolor='black')

# plot a random subset only; the sample size is capped at the number of
# rows so that a smaller table does not make sample() raise
df_sample = df.sample(min(100_000, len(df)))
points = ax.scatter(
    'x',
    'y',
    c = 'z',
    data = df_sample,
    s = 0.1,
    cmap = 'viridis'
)
ax.set_axis_off()

cbar = fig.colorbar(points, ax=ax, location='bottom', shrink=0.5, pad=0.05)
# raw string: '\:' is mathtext spacing, not a valid Python escape sequence
cbar.set_label(r'number of trees per $100 \: m^2$')
fig.suptitle('Tree density in the Netherlands')
fig.tight_layout()
fig.savefig(plot_path+'trees.png', dpi=500)

Distribution plot

In [None]:
# histogram of the 'z' column over the full table
fig, ax = plt.subplots(figsize=(3, 3))
df['z'].hist(ax=ax, bins=100)
ax.set_title('Distribution of tree density')
# raw string: '\:' is mathtext spacing, not a valid Python escape sequence
ax.set_xlabel(r'number of trees per $100 \: m^2$')
ax.set_ylabel('number of observations')
fig.tight_layout()

## Foursquare

#### Set up Foursquare API

Define credentials, which will be used as global variables.

In [None]:
# read the credentials file that sits one level above src
with open('../credentials.json', 'r') as creds_file:
    credentials = json.load(creds_file)

# Foursquare section of the credentials
fsq_creds = credentials['foursquare']
client_id = fsq_creds['client_id']
client_secret = fsq_creds['client_secret']

# endpoint used by every search in this notebook
url = 'https://api.foursquare.com/v3/places/search'

# request headers shared by every search
headers = {
    'accept': 'application/json',
    'authorization': fsq_creds['authorization'],
}

Define functions to extract useful data from a Foursquare API response.

In [None]:
def try_keys(d: dict, keys: list[str], default: object = '-') -> object:
    """
    Follows a path of keys into a nested structure and returns the value found there.

    Parameters
    ----------
    d : dict
        Structure to look into.
    keys : list[str]
        Keys to apply one after another; an empty list returns `d` itself.
    default : object
        Value returned when the path cannot be followed (defaults to '-').

    Returns
    -------
    object
        The value at the end of the path, or `default` when a key is missing
        or an intermediate value cannot be indexed.
    """
    data = d
    try:
        for key in keys:
            data = data[key]
    except (KeyError, IndexError, TypeError):
        # KeyError: missing dict key; IndexError: list too short;
        # TypeError: intermediate value is None or a scalar
        return default
    return data

def venue_scraper(response: dict) -> pd.DataFrame:
    """
    Extracts relevant venue data from a Foursquare API response.

    Parameters
    ----------
    response : dict
        Decoded JSON body of a search call; must contain a 'results' list.

    Returns
    -------
    pd.DataFrame
        One row per result with the columns name, latitude, longitude,
        distance, address and genre (list of category names). Fields that
        are missing from a result hold the placeholder returned by try_keys.

    Raises
    ------
    KeyError
        If `response` has no 'results' key.
    """
    columns = ['name', 'latitude', 'longitude', 'distance', 'address', 'genre']

    # collect all rows first and build the frame once, instead of enlarging
    # the frame row by row
    rows = []
    for result in response['results']:
        # a missing or empty 'categories' entry yields an empty genre list
        categories = result.get('categories') or []
        rows.append([
            try_keys(result, ['name']),
            try_keys(result, ['geocodes', 'main', 'latitude']),
            try_keys(result, ['geocodes', 'main', 'longitude']),
            try_keys(result, ['distance']),
            try_keys(result, ['location', 'address']),
            [try_keys(category, ['name']) for category in categories],
        ])

    return pd.DataFrame(rows, columns=columns)

#### Use Foursquare API as POC

POC: get some venues near Snellius.

In [None]:
# query parameters for a single search centred on Snellius
poc_params = {
    'client_id': client_id,
    'client_secret': client_secret,
    'll': '52.1665,4.4870',         # Snellius
    'radius': 3000,                 # should be enough to get 50 results
    'limit': 50,                    # maximum limit allowed by fsq
}

# send the request and decode the JSON body
response = requests.get(url, headers=headers, params=poc_params).json()

# tabulate the venues and preview the first two rows
fsq_df = venue_scraper(response)
fsq_df.head(2)

POC: get "all" venues in the Netherlands (very spaced out so as not to strain the API).

In [None]:
def get_venues(granularity: int = 5) -> pd.DataFrame:
    """
    Searches for venues around an evenly spaced grid of points in the Netherlands.

    A granularity x granularity grid is laid over the bounding box below,
    points outside `nl_base` are dropped, and one search (radius 3000,
    limit 50) is sent per remaining point. Venues between the search circles
    are therefore not covered.

    Parameters
    ----------
    granularity : int
        Number of grid points per axis of the bounding box.

    Returns
    -------
    pd.DataFrame
        All venues found, in the column layout produced by venue_scraper.

    Raises
    ------
    requests.HTTPError
        If the API answers a request with an error status.
    """

    # define bounding box for the Netherlands: [min lon, min lat, max lon, max lat]
    bbox = [3.314971144228537, 50.80372101501058, 7.092053256873896, 53.51040334737801]

    # define grid of evenly spaced points within bbox
    lon_axis = np.linspace(bbox[0], bbox[2], granularity)
    lat_axis = np.linspace(bbox[1], bbox[3], granularity)
    lon_grid, lat_grid = np.meshgrid(lon_axis, lat_axis)
    lon_flat, lat_flat = lon_grid.flatten(), lat_grid.flatten()

    # narrow down list of points that are within Dutch borders
    # (x is the longitude and y the latitude, so the columns are named accordingly)
    points_gdf = gpd.GeoDataFrame(
        {'lon': lon_flat, 'lat': lat_flat},
        geometry = gpd.points_from_xy(lon_flat, lat_flat),
        crs = 'EPSG:4326'
    ).to_crs(epsg=proj)

    points_gdf = points_gdf[
        points_gdf.within(nl_base.unary_union)
    ].reset_index(drop=True)

    columns = ['name', 'latitude', 'longitude', 'distance', 'address', 'genre']
    n_points = points_gdf.shape[0]

    # loop over cells in grid and collect the venues of each cell;
    # the frames are concatenated once at the end instead of once per request
    frames = []
    for i, (lon, lat) in enumerate(zip(points_gdf['lon'], points_gdf['lat'])):
        print(f'{i+1}/{n_points}', end='\r')

        response = requests.get(
            url,
            headers = headers,
            params = dict(
                client_id = client_id,
                client_secret = client_secret,
                ll = f'{lat},{lon}',    # the ll parameter is "latitude,longitude"
                radius = 3000,
                limit = 50
            ),
            timeout = 30                # do not hang forever on a stalled connection
        )
        # fail with the HTTP error instead of a KeyError on the missing 'results'
        response.raise_for_status()
        frames.append(venue_scraper(response.json()))

    print()
    if not frames:
        # no grid point inside the borders: return an empty frame with the usual columns
        return pd.DataFrame(columns=columns)
    return pd.concat(frames, axis=0, ignore_index=True)

Conditionally load or create the Foursquare dataframe.

In [None]:
# location of the cached venue table
fsq_csv = csv_path+'fsq_venues.csv'

if not os.path.exists(fsq_csv):
    # no cache yet: query the API and store the result
    fsq_df = get_venues(granularity=5)
    fsq_df.to_csv(fsq_csv, index=False)
else:
    # reuse the cached table
    fsq_df = pd.read_csv(fsq_csv)

# to rebuild the cache with a finer grid, run instead:
# fsq_df = get_venues(granularity=10)
# fsq_df.to_csv(csv_path+'fsq_venues.csv', index=False)

fsq_df.shape

In [None]:
# convert to geodataframe
# (coordinates are read as EPSG:4326 longitude/latitude and then reprojected to `proj`)
fsq_gdf = gpd.GeoDataFrame(
    fsq_df,
    geometry = gpd.points_from_xy(fsq_df['longitude'], fsq_df['latitude']),
    crs = 'EPSG:4326'
).to_crs(epsg=proj)

# very basic plot: base shape as background, venues as red dots
fig, ax = plt.subplots(figsize=(5, 5))
nl_base.plot(ax=ax, color='white', edgecolor='black')
fsq_gdf.plot(ax=ax, markersize=0.5, color='red')
ax.set_axis_off()
ax.set_title('Foursquare venues in the Netherlands')
fig.tight_layout()

In [None]:
# read in the restwarmte shapefile and reproject it to `proj`
gdf_rw = gpd.read_file(shp_path+'restwarmte/restwarmte.shp').to_crs(epsg=proj)
# show (rows, columns) as the cell output
gdf_rw.shape

In [None]:
# select outliers in 2013: rows whose KGCO2_2013 value exceeds 5e8
# (copied so that later changes do not act on a slice of gdf_rw)
big_players = gdf_rw[gdf_rw['KGCO2_2013'] > 5e8].copy()

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

# base shape as background
nl_base.plot(ax=ax, color='white', edgecolor='black')

# plot a random subset of the trees data; the sample size is capped at the
# number of rows so that a smaller table does not make sample() raise
df_sample = df.sample(min(10000, len(df)))
points = ax.scatter(
    'x',
    'y',
    c = 'z',
    data = df_sample,
    s = 0.1,
    cmap = 'viridis'
)

# overlay the selected rows of the restwarmte data, coloured by their 2013 value
big_players.plot(
    ax = ax,
    column = 'KGCO2_2013',
    s = 100,
    cmap = 'Reds'
)

fig.tight_layout()
fig.savefig(plot_path+'rw_heatmap.png', dpi=300)

In [None]:
# histogram of the 2013 values of the selected rows
big_players.hist(column='KGCO2_2013', bins=20)

In [None]:
# load the gemeentes geojson from the co2 directory and reproject it to `proj`
gdf_co2 = gpd.read_file(data_path+'co2/gemeentes.geojson').to_crs(epsg=proj)

# show (rows, columns) as the cell output
gdf_co2.shape

In [None]:
# preview the first two rows
gdf_co2.head(2)

In [None]:
# convert co2 to log (natural logarithm), stored in a new column
# NOTE(review): zero or negative co2 values would give -inf / nan here — confirm the data has none
gdf_co2['co2_log'] = np.log(gdf_co2['co2'])

In [None]:
# map of the shapes in gdf_co2, coloured by the log-transformed co2 column
fig, ax = plt.subplots(figsize=(5, 5))

gdf_co2.plot(
    column = 'co2_log',
    cmap = 'Reds',
    linewidth = 1,
    edgecolor = 'black',
    legend = True,
    ax = ax
)

In [None]:
# load the population density table (semicolon-separated)
density = pd.read_csv(csv_path+'bevolkingsdichtheid.csv', sep=';')
# shorten the column names; 'rname' matches the key column used for the merge with gdf_co2
density = density.rename({'Inwoners per km² land': 'density', 'Gemeente': 'rname'}, axis=1)
print(density.shape)
density.head(2)

In [None]:
# merge density data with gdf
# (right join: every row of density is kept, rows of gdf_co2 without a
# matching rname are dropped)
gdf_m = gdf_co2.merge(density, on='rname', how='right')
# compare the shapes before and after the merge
print(gdf_co2.shape)
print(gdf_m.shape)
gdf_m.head(2)

In [None]:
# print difference gdf_co2 - gdf_m
# (rname values that occur in gdf_co2 but not in the merged frame)
gdf_co2[~gdf_co2['rname'].isin(gdf_m['rname'])]['rname'].unique()

In [None]:
# print difference density - gdf_co2
# ('Gemeente' was renamed to 'rname' when density was loaded, so the
# renamed column has to be used here)
density[~density['rname'].isin(gdf_co2['rname'])]['rname'].unique()

This is a mess, probably will need different datasets.

In [None]:
# cast density to int: strip the spaces inside the numbers first
# (astype(str) keeps the cell re-runnable once the column is already numeric)
gdf_m['density'] = gdf_m['density'].astype(str).str.replace(' ', '', regex=False).astype(int)

In [None]:
fig, ax = plt.subplots(figsize=(5, 5))

# first layer: every shape of gdf_co2 in blue
gdf_co2.plot(
    color = 'Blue',
    linewidth = 1,
    edgecolor = 'black',
    ax = ax
)

# second layer, drawn on top: merged shapes coloured by density,
# so blue stays visible only where the merged frame has no shape
gdf_m.plot(
    column = 'density',
    cmap = 'Reds',
    linewidth = 1,
    edgecolor = 'black',
    legend = True,
    ax = ax
)



Blue values are "missing" from our population density dataset.