# Imports

In [45]:
# For JSON imports
import json

#For DF, CSV, Excel
import pandas as pd

import re
import datetime

# Open Data
Contains the coordinates and metro lines that stop at those coordinates. Could be used in the explanation of nieuwmarkt.

## Data Exploration

In [43]:
#Import JSON file
with open("../Data/Original/TRAMMETRO_PUNTEN_2019.json") as gvb_data:
    stations = json.load(gvb_data)
    stations = stations["features"]
    
    #Example line in file:
    #Lines that stop at the given station
    print("Lines: ", stations[0]["properties"]["Lijn_select"])

    #Coordinates of the station
    print("Coordinates: ", stations[0]["geometry"]["coordinates"])
    
    #Construct actual file

    #Loop over all stations in file
    for station in stations:
        #Select only stations where the tram stops
        if station["properties"]["Modaliteit"] == "Tram":
            
            #Per station, select which lines stop there and the coordinates of the station            
            tram_dict = {"Lines": station["properties"]["Lijn_select"], "Coordinates": station["geometry"]["coordinates"]}
            
            #Append the given results to the file and save it            
            with open("../Data/Modified/TramStations.json", "a") as f:
                json.dump(tram_dict, f)
        
        #Select only stations where the metro stops
        elif station["properties"]["Modaliteit"] == "Metro":
            
            #Per station, select which lines stop there and the coordinates of the station            
            metro_dict = {"Lines": station["properties"]["Lijn_select"], "Coordinates": station["geometry"]["coordinates"]}
            
            #Append the given results to the file and save it
            with open("../Data/Modified/MetroStations.json", "a") as f:
                json.dump(metro_dict, f)

Lines:  02|11|12|13|17
Coordinates:  [4.893349, 52.376064]


# Sample Data
Old version of the data, used to make a start in the data prep

## Data Exploration

### Ritten
Contains the "Ritten" data for each station. Can be used to count the number of people that originated from the station on an hourly basis

#### Import the Dataset
First we'll import the dataset and check what it contains

In [3]:
#Import the full dataset
ritten = pd.read_excel("../../../Data_thesis/Sample_data/GVB/full.xlsx")

In [4]:
#Check contents
ritten.head()

Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteCode,VertrekHalteNaam,VertrekXCoord,VertrekYCoord,AantalRitten
0,2018-12-24,00:00 - 00:59,0,Overig,0.0,0.0,77.0
1,2018-12-24,00:00 - 00:59,5046,Rembrandtplein,489627975.0,5236626000.0,13.0
2,2018-12-24,00:00 - 00:59,5096,Centraal Station,490049319.0,5237968000.0,10.0
3,2018-12-24,00:00 - 00:59,6063,Leidseplein,488064547.0,5236476000.0,10.0
4,2018-12-24,01:00 - 01:59,0,Overig,0.0,0.0,132.0


#### Construct Dataframes
Now that the contents is clear, we construct Dataframes for each station that is of interest to us. 

##### Metro
- *Nieuwmarkt*
- *Rokin*

##### Tram
- *Niewezijdse Kolk*
- *Dam*
- *Spui*
- *Rokin*

In [12]:
#List al stations
stations = ["Nieuwmarkt", "Niewezijdse Kolk", "Dam", "Spui"]

#Select all the rows that have the station name in departure station
ritten_df = ritten[ritten["VertrekHalteNaam"].isin(stations)]

In [13]:
#Example
ritten_df.head()

Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteCode,VertrekHalteNaam,VertrekXCoord,VertrekYCoord,AantalRitten
5,2018-12-24,01:00 - 01:59,5031,Dam,489348393.0,5237341000.0,58.0
8,2018-12-24,01:00 - 01:59,5068,Dam,489124121.0,5237436000.0,28.0
16,2018-12-24,02:00 - 02:59,5031,Dam,489348393.0,5237341000.0,103.0
18,2018-12-24,02:00 - 02:59,5068,Dam,489124121.0,5237436000.0,25.0
26,2018-12-24,03:00 - 03:59,5031,Dam,489348393.0,5237341000.0,39.0


#### DF to File
Save the dataframe in a file, so that it can be imported for other uses. Later this will be probably be removed as the dataframe can simple function as input to another funtion. 

In [14]:
ritten_df.to_csv("../../../Data_thesis/Sample_data/GVB/ritten.csv")

### Official Dataset

#### Import the data

In [37]:
#Import dataset
full = pd.read_csv("../../../Data_thesis/GVB/Datalab_Rit_Herkomst_Bestemming_Uur_20190403.csv", sep=";")

#### Select usable data

In [39]:
#Selection of stations to use
stations = ["Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui"]

#Select rows that contain one of the listed stations in either 'Aankomst' or 'Vertrek'
df = full[(full["AankomstHalteNaam"].isin(stations)) | (full["VertrekHalteNaam"].isin(stations))]

#Drop unusable columns
df = df.drop(columns=["VertrekHalteCode", "AankomstHalteCode"])

#Replace the NaN value with 0
df = df.fillna(0.0)

#Add columns
#Gives day in numbers --> 0: Monday, 1: Tuesday,..., 6: Sunday
df.insert(3, "weekday", 99)

#Adds whether day is weekend --> Day 5 and 6
df.insert(4, "is_weekend", 0)

In [40]:
#Describe the dataset
df.describe()

Unnamed: 0,weekday,is_weekend,VertrekLat,VertrekLon,AankomstLat,AankomstLon,AantalRitten
count,611318.0,611318.0,611318.0,611318.0,611318.0,611318.0,611318.0
mean,99.0,0.0,4.892896,52.367332,4.34722,46.530006,30.753835
std,0.0,0.0,0.020426,0.013493,1.539859,16.480242,35.057023
min,99.0,0.0,4.77478,52.274346,0.0,0.0,10.0
25%,99.0,0.0,4.88933,52.366048,4.880855,52.358685,12.0
50%,99.0,0.0,4.891309,52.371942,4.890658,52.371942,18.0
75%,99.0,0.0,4.900781,52.373678,4.899218,52.373411,32.0
max,99.0,0.0,4.989523,52.401172,5.006931,52.401172,548.0


#### Data to Dict

In [41]:
full_dict = df.to_dict("index")

In [51]:
#Loop over dict
for k, v in full_dict.items():
    
    #Replace time string with time blok
    time_blok = v["UurgroepOmschrijving (van vertrek)"]
    v["UurgroepOmschrijving (van vertrek)"] = int(time_blok)
    
    #Transform the date string to datatime.date object
    date = datetime.date.fromisoformat(v["Datum"])
    
    #Transfrom date to weekday number
    v["weekday"] = date.weekday()
    
    #Check if weekday is in the weekend
    if date.weekday() == 5 or date.weekday() == 6:
        v["is_weekend"] = 1

NameError: name 'date' is not defined

In [47]:
df.head()

Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteNaam,weekday,is_weekend,VertrekLat,VertrekLon,AankomstHalteNaam,AankomstLat,AankomstLon,AantalRitten
6,1/1/2018 12:00:00 AM,01:00 - 01:59,Nieuwezijds Kolk,99,0,4.892841,52.375754,Overig,0.0,0.0,27
29,1/1/2018 12:00:00 AM,01:00 - 01:59,Amstelstation,99,0,4.917514,52.346473,Nieuwmarkt,4.901239,52.371942,10
48,1/1/2018 12:00:00 AM,01:00 - 01:59,Nieuwmarkt,99,0,4.901239,52.371942,Overig,0.0,0.0,44
49,1/1/2018 12:00:00 AM,01:00 - 01:59,Nieuwmarkt,99,0,4.901239,52.371942,Amstelstation,4.917514,52.346473,30
50,1/1/2018 12:00:00 AM,01:00 - 01:59,Nieuwmarkt,99,0,4.901239,52.371942,Bullewijk,4.952336,52.306422,29


In [27]:
df.to_csv("../../../Data_thesis/Full_Datasets/GVB_Bestemming_20190402")