# Imports

In [1]:
# For JSON imports
import json

#For DF, CSV, Excel
import pandas as pd

import re
import datetime

# Open Data
Contains the coordinates and metro lines that stop at those coordinates. Could be used in the explanation of nieuwmarkt.

## Data Exploration

In [2]:
#Import JSON file
with open("../Data/Original/TRAMMETRO_PUNTEN_2019.json") as gvb_data:
    stations = json.load(gvb_data)
    stations = stations["features"]
    
    #Example line in file:
    #Lines that stop at the given station
    print("Lines: ", stations[0]["properties"]["Lijn_select"])

    #Coordinates of the station
    print("Coordinates: ", stations[0]["geometry"]["coordinates"])
    
    #Construct actual file

    #Loop over all stations in file
    for station in stations:
        #Select only stations where the tram stops
        if station["properties"]["Modaliteit"] == "Tram":
            
            #Per station, select which lines stop there and the coordinates of the station            
            tram_dict = {"Lines": station["properties"]["Lijn_select"], "Coordinates": station["geometry"]["coordinates"]}
            
            #Append the given results to the file and save it            
            with open("../Data/Modified/TramStations.json", "a") as f:
                json.dump(tram_dict, f)
        
        #Select only stations where the metro stops
        elif station["properties"]["Modaliteit"] == "Metro":
            
            #Per station, select which lines stop there and the coordinates of the station            
            metro_dict = {"Lines": station["properties"]["Lijn_select"], "Coordinates": station["geometry"]["coordinates"]}
            
            #Append the given results to the file and save it
            with open("../Data/Modified/MetroStations.json", "a") as f:
                json.dump(metro_dict, f)

Lines:  02|11|12|13|17
Coordinates:  [4.893349, 52.376064]


# Official Dataset

## Import the data

In [4]:
#Import dataset
arr_df = pd.read_csv("../../../Data_thesis/GVB/Datalab_Reis_Bestemming_Uur_20190402.csv", sep=";")
dep_df = pd.read_csv("../../../Data_thesis/GVB/Datalab_Reis_Herkomst_Uur_20190403.csv", sep=";")

#Rename 'AantalReizen' column
arr_df = arr_df.rename(index=str, columns={"AantalReizen": "AantalAankomsten", "UurgroepOmschrijving (van aankomst)": "Uurgroep"})
dep_df = dep_df.rename(index=str, columns={"AantalReizen": "AantalVertrekken"})

In [5]:
#Concatenate the two DataFrames based on columns
full = pd.concat([arr_df, dep_df], axis=1, sort=True)

In [6]:
full.head()

Unnamed: 0,Datum,Uurgroep,AankomstHalteCode,AankomstHalteNaam,AankomstLat,AankomstLon,AantalAankomsten,Datum.1,UurgroepOmschrijving (van vertrek),VertrekHalteCode,VertrekHalteNaam,VertrekLat,VertrekLon,AantalVertrekken
0,1/1/2018 12:00:00 AM,00:00 - 00:59,,Overig,,,30,1/1/2018 12:00:00 AM,00:00 - 00:59,,Overig,,,34.0
1,1/1/2018 12:00:00 AM,01:00 - 01:59,,Overig,,,592,1/1/2018 12:00:00 AM,01:00 - 01:59,,Overig,,,360.0
10,1/1/2018 12:00:00 AM,01:00 - 01:59,HLD,Station Holendrecht,4.960219,52.297805,13,1/1/2018 12:00:00 AM,01:00 - 01:59,05100,Centraal Station,4.900683,52.37975,71.0
100,1/1/2018 12:00:00 AM,03:00 - 03:59,07035,Zeilstraat,4.856729,52.351241,21,1/1/2018 12:00:00 AM,03:00 - 03:59,WBS,Wibautstraat,4.91203,52.354289,38.0
1000,1/1/2018 12:00:00 AM,13:00 - 13:59,00238,Anton de Komplein,4.954134,52.316037,10,1/1/2018 12:00:00 AM,11:00 - 11:59,09121,Marie Heinekenplein,4.890772,52.357221,84.0


## Construct Dataframes
Now that the contents is clear, we construct Dataframes for each station that is of interest to us. 

### Columns

**Arrival and Departure Stations:**
- *Metro*
    - Nieuwmarkt
- *Tram*
    - Nieuwezijdse Kolk
    - Dam
    - Spui
- *Type*: String

**Weekday:** Select the day number of the week, based on the given date
- *Type*: Int

**Weekend:** 1 if day is a weekend day
- *Type*: Binary

In [7]:
#Selection of stations to use
stations = ["Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui"]

#Select rows that contain one of the listed stations in either 'Aankomst' or 'Vertrek'
df = full[(full["AankomstHalteNaam"].isin(stations)) | (full["VertrekHalteNaam"].isin(stations))]

#Drop unusable columns
df = df.drop(columns=["VertrekHalteCode", "AankomstHalteCode", "UurgroepOmschrijving (van vertrek)"])

#Drop duplicate Columns
df = df.loc[:,~df.columns.duplicated()]

#Replace the NaN value with 0
df = df.fillna(0.0)

#Add columns
#Gives day in numbers --> 0: Monday, 1: Tuesday,..., 6: Sunday
df.insert(10, "weekday", 99)

#Adds whether day is weekend --> Day 5 and 6
df.insert(11, "is_weekend", 0)

#Add total number of travelers
df.insert(12, "AantalReizigers", 0)

In [8]:
#Describe the dataset
df.describe()

Unnamed: 0,AankomstLat,AankomstLon,AantalAankomsten,VertrekLat,VertrekLon,AantalVertrekken,weekday,is_weekend,AantalReizigers
count,166557.0,166557.0,166557.0,166557.0,166557.0,166557.0,166557.0,166557.0,166557.0
mean,4.881627,52.271281,92.359901,4.872971,52.177218,95.101881,99.0,0.0,0.0
std,0.210724,2.227903,181.422038,0.295105,3.140009,166.530551,0.0,0.0,0.0
min,0.0,0.0,10.0,0.0,0.0,0.0,99.0,0.0,0.0
25%,4.88465,52.361576,20.0,4.883549,52.361459,20.0,99.0,0.0,0.0
50%,4.890794,52.372446,42.0,4.891241,52.372446,41.0,99.0,0.0,0.0
75%,4.897513,52.374929,105.0,4.897029,52.374929,100.0,99.0,0.0,0.0
max,5.044677,52.422069,5377.0,5.044677,52.422103,4634.0,99.0,0.0,0.0


## Data to Dict
Needed to ease certain operations

In [9]:
full_dict = df.to_dict("index")

In [28]:
#Datetime format
date_format_1 = '%d/%m/%Y %H:%M:%S'
date_format_2 = '%m/%d/%Y %H:%M:%S'

#Loop over dict
for k, v in full_dict.items():
    try:
        #Replace time string with time blok
        time_blok = v["Uurgroep"][:5]
        time_blok = re.sub('[:]', '', time_blok)
        v["Uurgroep"] = int(time_blok)
    except:
        v["Uurgroep"] = 99
    
    try:
        #Remove AM/PM from string
        v["Datum"] = v["Datum"][:-3]
        try:
            #Transform the date string to datatime.date object
            date = datetime.datetime.strptime(v["Datum"], date_format_1)
            #Transfrom date to weekday number
            v["weekday"] = date.weekday()
        except:
            #Transform the date string to datatime.date object
            date = datetime.datetime.strptime(v["Datum"], date_format_2)
            
            #Transfrom date to weekday number
            v["weekday"] = date.weekday()
        v["Datum"] = date.date()
    except:
          v["weekday"] = 99  
    
    #Check if weekday is in the weekend
    if date.weekday() == 5 or date.weekday() == 6:
        v["is_weekend"] = 1
        
    v["Datum"] = date.date()
        
    #Add the total number of travelers
    v["AantalReizigers"] = v["AantalAankomsten"] + v["AantalVertrekken"]

In [29]:
#Return to dict
df = pd.DataFrame.from_dict(full_dict, orient="index")

## Data Preperation
Make the model with the GVB data from *Dam* station

Variables final df:
- *UurgroepOmschrijving (van vertrek)*
    - The hour the counts were gathered --> 100 means 01:00
    - int
- *VertrekHalteNaam* / *AankomstHalteNaam*
    - Name of the leaving/arrival station
    - str
- *VertrekLat* / *AankomstLat*
    - Latitude leaving/arrival station
    - int
- *VertrekLon* / *AankomstLon*
    - Longitude leaving/arrival station
    - int
- *weekday*
    - Day of the week in numbers --> 0: Monday, 1: Tuesday,..., 6: Sunday
    - int
- *is_weekend*
    - Whether is it is weekend or not
    - binary
- *AantalRitten*
    - Number of the rides takes from the station
    - int

In [30]:
df.head()

Unnamed: 0,Datum,Uurgroep,AankomstHalteNaam,AankomstLat,AankomstLon,AantalAankomsten,VertrekHalteNaam,VertrekLat,VertrekLon,AantalVertrekken,weekday,is_weekend,AantalReizigers
1000011,2018-02-05,99,Osdorpplein,4.803229,52.359132,19,Dam,4.891309,52.373554,87.0,99,0,106.0
1000012,2018-02-05,99,Meer en Vaart,4.809944,52.356369,42,Dam,4.891245,52.372727,72.0,99,0,114.0
1000029,2018-02-05,99,Dam,4.895494,52.374929,14,E. Wolffstraat,4.867279,52.370288,11.0,99,0,25.0
1000031,2018-02-05,99,Dam,4.890646,52.372446,22,Bilderdijkstraat,4.869857,52.370829,49.0,99,0,71.0
1000032,2018-02-05,99,Nieuwezijds Kolk,4.893731,52.376288,23,De Clercqstraat,4.870403,52.370616,49.0,99,0,72.0


## DF to File
Save the dataframe in a file, so that it can be imported for other uses. Later this will be probably be removed as the dataframe can simple function as input to another funtion. 

In [31]:
df.to_csv("../../../Data_thesis/Full_Datasets/GVB.csv", index=False)