# Imports

In [32]:
import pandas as pd
import json
import datetime

# Data
Import the following data:
- GVB: *Number of travelers per hour at given station coordinates*
- Event Data: *Dates and locations for events in Amsterdam* --> **OUTLIERS**

## GVB

In [18]:
#Load the GVB sample data from the CSV file into a dataframe
gvb = pd.read_csv(filepath_or_buffer="../../../Data_thesis/Sample_data/GVB/ritten.csv")

In [19]:
#Check contents: preview the first rows of the GVB dataframe
gvb.head()

Unnamed: 0.1,Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteCode,VertrekHalteNaam,VertrekXCoord,VertrekYCoord,AantalRitten
0,5,2018-12-24,01:00 - 01:59,5031,Dam,489348393.0,5237341000.0,58.0
1,8,2018-12-24,01:00 - 01:59,5068,Dam,489124121.0,5237436000.0,28.0
2,16,2018-12-24,02:00 - 02:59,5031,Dam,489348393.0,5237341000.0,103.0
3,18,2018-12-24,02:00 - 02:59,5068,Dam,489124121.0,5237436000.0,25.0
4,26,2018-12-24,03:00 - 03:59,5031,Dam,489348393.0,5237341000.0,39.0


### Describe the dataframe
- *Count*: Count number of non-NA/null observations
- *Mean*: Mean of the values.
- *STD*: Standard deviation of the observations.
- *Min*: Minimum of the values in the object.
- *25%, 50%, 75%*: Returns the given percentile
- *Max*: Maximum of the values in the object.

In [20]:
#Summary statistics (count, mean, std, min, percentiles, max) of the GVB dataframe
gvb.describe()

Unnamed: 0.1,Unnamed: 0,VertrekXCoord,VertrekYCoord,AantalRitten
count,972.0,972.0,972.0,972.0
mean,24975.330247,489275500.0,5237234000.0,203.260288
std,14484.601039,387136.2,190351.6,253.135267
min,5.0,488925900.0,5236910000.0,10.0
25%,12567.75,489027100.0,5237194000.0,40.0
50%,24419.5,489079400.0,5237318000.0,94.5
75%,37513.25,489348400.0,5237368000.0,271.25
max,49712.0,490123900.0,5237493000.0,1371.0


### Stations in GVB Dataset

In [26]:
#Collect the distinct departure-stop names and list them one per line
stations = gvb["VertrekHalteNaam"].unique()

for station_name in stations:
    print("Station: ", station_name)

Station:  Dam
Station:  Nieuwmarkt
Station:  Spui


## Events

In [21]:
#Load the event records from the JSON file.
#The encoding is passed explicitly: without it open() falls back to the
#platform's locale encoding, which is not UTF-8 everywhere (e.g. cp1252 on
#Windows), so non-ASCII characters in the file could be mis-decoded.
with open("../Data/Modified/Events.json", encoding="utf-8") as f:
    events = json.load(f)

In [22]:
#Show the first event record to inspect its structure
print(events[0])

{'Event': 'Springsnow Festival', 'Coordinates': {'Latitude': '52,3726380', 'Longtitude': '4,8941060'}, 'Data': {'startdate': '20-04-2018', 'enddate': '20-05-2018'}}


# Model
Below a few models will be instantiated

## GVB Data Only
Below all the models use only the GVB data

### Data Preparation
Make the model with the GVB data from *Dam* station

#### Dataframe formation

In [84]:
#Boolean mask for the rows that belong to station "Dam"
is_dam = gvb["VertrekHalteNaam"] == "Dam"

#Columns that are kept for the model
kept_columns = ["Datum", "UurgroepOmschrijving (van vertrek)", "AantalRitten"]

#Filter rows and columns in a single .loc call
dam_df = gvb.loc[is_dam, kept_columns]

#Placeholder columns, filled in per row further on
#weekday: day as a number --> 0: Monday, 1: Tuesday,..., 6: Sunday (99 = not set yet)
dam_df.insert(loc=2, column="weekday", value=99)

#is_weekend: 1 for day 5 and 6, otherwise 0 (starts at 0)
dam_df.insert(loc=3, column="is_weekend", value=0)

#### Transform DF

In [85]:
#Convert the dataframe to a dict of rows: {index label: {column: value}}
dam = dam_df.to_dict(orient="index")

In [86]:
#Fill in the weekday features for every row of the dict
#(only the row dicts are needed, so iterate over the values)
for v in dam.values():

    #Transform the ISO date string to a datetime.date object and take its
    #weekday number once --> 0: Monday, 1: Tuesday,..., 6: Sunday
    weekday = datetime.date.fromisoformat(v["Datum"]).weekday()
    v["weekday"] = weekday

    #Always write the flag (1 for day 5 and 6, 0 otherwise) so the result
    #does not depend on the value the row happened to hold before
    v["is_weekend"] = int(weekday >= 5)

In [87]:
#Rebuild the dataframe from the dict of rows (dict keys become the index)
#and discard the "Datum" column
dam_df = pd.DataFrame.from_dict(dam, orient="index").drop(columns=["Datum"])

#### Show contents

In [88]:
#Preview the first rows of the prepared Dam dataframe
dam_df.head()

Unnamed: 0,UurgroepOmschrijving (van vertrek),weekday,is_weekend,AantalRitten
0,01:00 - 01:59,0,0,58.0
1,01:00 - 01:59,0,0,28.0
2,02:00 - 02:59,0,0,103.0
3,02:00 - 02:59,0,0,25.0
4,03:00 - 03:59,0,0,39.0


### Training/Test
Split the dataset into training and test data

In [93]:
#Raw values of every row as a 2-D array
data = dam_df.values

#Column position of the label; every column before it is a feature
label_col = 3

#Separate the features (x) from the labels (y)
x, y = data[:, :label_col], data[:, label_col]

In [94]:
#Inspect the label values
y

array([58.0, 28.0, 103.0, 25.0, 39.0, 10.0, 14.0, 12.0, 10.0, 13.0, 21.0,
       24.0, 18.0, 21.0, 62.0, 62.0, 16.0, 16.0, 107.0, 135.0, 22.0, 28.0,
       217.0, 12.0, 185.0, 54.0, 49.0, 292.0, 19.0, 338.0, 92.0, 75.0,
       511.0, 22.0, 374.0, 134.0, 74.0, 668.0, 49.0, 521.0, 232.0, 99.0,
       796.0, 49.0, 506.0, 293.0, 143.0, 1033.0, 46.0, 640.0, 358.0,
       125.0, 1190.0, 65.0, 609.0, 415.0, 137.0, 1231.0, 64.0, 595.0,
       349.0, 140.0, 1108.0, 48.0, 242.0, 138.0, 102.0, 644.0, 35.0,
       268.0, 82.0, 52.0, 442.0, 37.0, 238.0, 77.0, 32.0, 403.0, 23.0,
       170.0, 53.0, 17.0, 374.0, 27.0, 189.0, 88.0, 43.0, 427.0, 12.0,
       104.0, 18.0, 201.0, 58.0, 27.0, 45.0, 23.0, 49.0, 11.0, 10.0, 10.0,
       16.0, 27.0, 22.0, 74.0, 65.0, 11.0, 18.0, 133.0, 109.0, 17.0, 32.0,
       225.0, 11.0, 150.0, 27.0, 47.0, 293.0, 20.0, 202.0, 39.0, 70.0,
       434.0, 24.0, 219.0, 85.0, 85.0, 529.0, 27.0, 408.0, 79.0, 75.0,
       607.0, 12.0, 362.0, 108.0, 74.0, 612.0, 16.0, 328.0, 103.0