# Imports

In [1]:
# For JSON imports
import json

#For DF, CSV, Excel
import pandas as pd

import re
import datetime

# Open Data
Contains the coordinates of the tram and metro stations and the lines that stop at those coordinates. Could be used in the explanation of Nieuwmarkt.

## Data Exploration

In [43]:
#Import JSON file
with open("../Data/Original/TRAMMETRO_PUNTEN_2019.json") as gvb_data:
    stations = json.load(gvb_data)["features"]

#Example line in file:
#Lines that stop at the given station
print("Lines: ", stations[0]["properties"]["Lijn_select"])

#Coordinates of the station
print("Coordinates: ", stations[0]["geometry"]["coordinates"])

#Construct actual files

#Modalities of interest and the file the stations of each modality are saved in
output_paths = {
    "Tram": "../Data/Modified/TramStations.json",
    "Metro": "../Data/Modified/MetroStations.json",
}
stations_per_modality = {modality: [] for modality in output_paths}

#Loop over all stations in file
for station in stations:
    modality = station["properties"]["Modaliteit"]

    #Select only stations where the tram or the metro stops
    if modality in stations_per_modality:

        #Per station, select which lines stop there and the coordinates of the station
        stations_per_modality[modality].append({
            "Lines": station["properties"]["Lijn_select"],
            "Coordinates": station["geometry"]["coordinates"],
        })

#Save each file once, as a single JSON array.
#Why: dumping every station separately to a file opened in append mode glues the
#objects together without a separator (not parseable with json.load) and adds
#duplicates every time the cell is re-run. Writing in "w" mode keeps the cell idempotent.
for modality, path in output_paths.items():
    with open(path, "w") as f:
        json.dump(stations_per_modality[modality], f)

Lines:  02|11|12|13|17
Coordinates:  [4.893349, 52.376064]


In [14]:
#Save the rides dataframe as a CSV file
# NOTE(review): `ritten_df` is not defined in any cell visible in this notebook, so this
# cell presumably fails on a fresh kernel (Restart & Run All) - confirm where it comes from.
ritten_df.to_csv("../../../Data_thesis/Sample_data/GVB/ritten.csv")

# Official Dataset

## Import the data

In [61]:
#Import dataset (the CSV file uses ";" as column separator)
full = pd.read_csv("../../../Data_thesis/GVB/Datalab_Rit_Herkomst_Bestemming_Uur_20190403.csv", sep=";")

## Construct Dataframes
Now that the contents are clear, we construct a DataFrame that contains only the stations that are of interest to us.

### Columns

**Arrival and Departure Stations:**
- *Metro*
    - Nieuwmarkt
- *Tram*
    - Nieuwezijds Kolk
    - Dam
    - Spui
- *Type*: String

**Weekday:** Select the day number of the week, based on the given date
- *Type*: Int

**Weekend:** 1 if day is a weekend day
- *Type*: Binary

In [62]:
#Stations we are interested in
stations = ["Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui"]

#Row masks: the ride arrives at / departs from one of the listed stations
arrives_at_selected = full["AankomstHalteNaam"].isin(stations)
departs_from_selected = full["VertrekHalteNaam"].isin(stations)

#Keep rows matching either mask, drop the stop-code columns and replace NaN with 0
df = (
    full.loc[arrives_at_selected | departs_from_selected]
    .drop(columns=["VertrekHalteCode", "AankomstHalteCode"])
    .fillna(0.0)
)

#Add placeholder columns, filled in later
#weekday: day in numbers --> 0: Monday, 1: Tuesday,..., 6: Sunday (99 = not set yet)
df.insert(8, "weekday", 99)

#is_weekend: whether the day is a weekend day --> Day 5 and 6 (0 = no / not set yet)
df.insert(9, "is_weekend", 0)

In [63]:
#Describe the dataset (summary statistics of the numeric columns)
#Note: "weekday" and "is_weekend" still hold their placeholder values (99 and 0) here
df.describe()

Unnamed: 0,VertrekLat,VertrekLon,AankomstLat,AankomstLon,weekday,is_weekend,AantalRitten
count,611318.0,611318.0,611318.0,611318.0,611318.0,611318.0,611318.0
mean,4.892896,52.367332,4.34722,46.530006,99.0,0.0,30.753835
std,0.020426,0.013493,1.539859,16.480242,0.0,0.0,35.057023
min,4.77478,52.274346,0.0,0.0,99.0,0.0,10.0
25%,4.88933,52.366048,4.880855,52.358685,99.0,0.0,12.0
50%,4.891309,52.371942,4.890658,52.371942,99.0,0.0,18.0
75%,4.900781,52.373678,4.899218,52.373411,99.0,0.0,32.0
max,4.989523,52.401172,5.006931,52.401172,99.0,0.0,548.0


## Data to Dict
Needed to ease certain operations

In [64]:
#Convert the dataframe to a dict of rows: {index label: {column name: value}}
full_dict = df.to_dict("index")

In [65]:
#Datetime formats tried for the "Datum" string (day-first, then month-first)
date_format_1 = '%d/%m/%Y %H:%M:%S'
date_format_2 = '%m/%d/%Y %H:%M:%S'

#Loop over dict; every value is one row of the dataframe and is modified in place
for k, v in full_dict.items():

    #Replace time string with time blok: first five characters without ":", as int
    #(e.g. "01:00" becomes 100)
    time_blok = v["UurgroepOmschrijving (van vertrek)"][:5]
    time_blok = re.sub('[:]', '', time_blok)
    v["UurgroepOmschrijving (van vertrek)"] = int(time_blok)

    #Cut off the last three characters of the date string
    #(presumably an " AM"/" PM" suffix - TODO confirm against the raw file)
    v["Datum"] = v["Datum"][:-3]

    #Transform the date string to a datetime object
    # NOTE(review): the day-first format is tried first, so a date whose first two
    # fields are both <= 12 is always read as day/month. If the file is actually
    # month/day, those rows get the wrong weekday - confirm the source format.
    try:
        date = datetime.datetime.strptime(v["Datum"], date_format_1)
    except ValueError:
        #Only a format mismatch triggers the fallback; other errors are not swallowed
        date = datetime.datetime.strptime(v["Datum"], date_format_2)

    #Transform date to weekday number --> 0: Monday, ..., 6: Sunday
    weekday = date.weekday()
    v["weekday"] = weekday

    #Check if weekday is in the weekend (5: Saturday, 6: Sunday)
    if weekday >= 5:
        v["is_weekend"] = 1

In [66]:
#Convert the dict back to a DataFrame (dict keys become the index)
df = pd.DataFrame.from_dict(full_dict, orient="index")

## Data Preparation
Make the model with the GVB data from *Dam* station

Variables final df:
- *UurgroepOmschrijving (van vertrek)*
    - The hour the counts were gathered --> 100 means 01:00
    - int
- *VertrekHalteNaam* / *AankomstHalteNaam*
    - Name of the leaving/arrival station
    - str
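- *VertrekLat* / *AankomstLat*
    - Latitude leaving/arrival station
    - float
- *VertrekLon* / *AankomstLon*
    - Longitude leaving/arrival station
    - float
    - Note: in the data the *Lat* columns hold values around 4.9 and the *Lon* columns values around 52.4, so latitude and longitude appear to be swapped in the source file.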
- *weekday*
    - Day of the week in numbers --> 0: Monday, 1: Tuesday,..., 6: Sunday
    - int
- *is_weekend*
    - Whether it is a weekend day or not
    - binary
- *AantalRitten*
    - Number of rides taken from the station
    - int

In [67]:
#Show the first rows of the prepared dataframe
df.head()

Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteNaam,VertrekLat,VertrekLon,AankomstHalteNaam,AankomstLat,AankomstLon,weekday,is_weekend,AantalRitten
6,1/1/2018 12:00:00,100,Nieuwezijds Kolk,4.892841,52.375754,Overig,0.0,0.0,0,0,27
29,1/1/2018 12:00:00,100,Amstelstation,4.917514,52.346473,Nieuwmarkt,4.901239,52.371942,0,0,10
48,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Overig,0.0,0.0,0,0,44
49,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Amstelstation,4.917514,52.346473,0,0,30
50,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Bullewijk,4.952336,52.306422,0,0,29


## DF to File
Save the dataframe in a file, so that it can be imported for other uses. Later this will probably be removed, as the dataframe can simply function as input to another function.

In [69]:
#Write the dataframe to a CSV file, without the index column
df.to_csv("../../../Data_thesis/Full_Datasets/GVB.csv", index=False)