In [76]:
# import packages
import os
import math
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
import plotly.graph_objects as go
from datetime import datetime
import seaborn as sns

In [77]:
# Load the source tables from ./Data, one DataFrame per CSV file.
# The first seven files are parsed with pandas' default (comma) separator.
(
    facilities,
    incidents,
    satisfaction,
    stations,
    stops,
    subscriptions,
    tickets,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/facilities.csv',
        './Data/incidents.csv',
        './Data/satisfaction.csv',
        './Data/stations.csv',
        './Data/stops.csv',
        './Data/subscriptions.csv',
        './Data/tickets.csv',
    )
)
# travelers.csv is the only file read with ';' as the field separator.
travelers = pd.read_csv('./Data/travelers.csv', sep=';')

In [78]:
# Restrict both tables before merging:
#  - facilities: keep only rows that have a 'zip' value
#    (presumably a proxy for Belgian stations -- TODO confirm);
#  - stations: keep only rows whose 'country-code' is 'be'.
facilities = facilities.dropna(subset=['zip'])
stations = stations.loc[stations['country-code'] == 'be']

In [79]:
# Join the station table with the facilities table on their shared 'station_id' key.
# NOTE(review): how='outer' also keeps facilities rows that have no matching
# station_id in the filtered stations table -- confirm that is intended.
stations_facilities = stations.merge(facilities, on='station_id', how='outer')
stations_facilities

Unnamed: 0,station_id,name_x,alternative-fr,alternative-nl,alternative-de,alternative-en,country-code,longitude,latitude,avg_stop_times,...,sales_open_wednesday,sales_close_wednesday,sales_open_thursday,sales_close_thursday,sales_open_friday,sales_close_friday,sales_open_saturday,sales_close_saturday,sales_open_sunday,sales_close_sunday
0,8811007,Schaarbeek/Schaerbeek,Schaerbeek,Schaarbeek,,,be,4.378636,50.878513,737.231343,...,,,,,,,,,,
1,8811106,Evere,,,,,be,4.400965,50.867780,166.641791,...,,,,,,,,,,
2,8811130,Haren-Sud/Haren-Zuid,Haren-Sud,Haren-Zuid,,,be,4.415357,50.889696,324.111940,...,,,,,,,,,,
3,8811148,Buda,,,,,be,4.417074,50.907495,208.134328,...,,,,,,,,,,
4,8811155,Haren,,,,,be,4.419978,50.888878,166.641791,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,8896503,Ieper,Ypres,,,,be,2.876593,50.847402,42.067164,...,06:00,13:15,06:00,13:15,06:00,13:15,07:00,14:15,,
570,8896735,Poperinge,,,,,be,2.736343,50.854449,41.544776,...,05:45,11:10,05:45,11:10,05:45,11:10,07:00,12:10,07:00,12:10
571,8896800,Roeselare,Roulers,,,,be,3.130412,50.949025,68.574627,...,06:15,20:00,06:15,20:00,06:15,20:00,06:30,20:00,06:30,20:00
572,8896909,Izegem,,,,,be,3.212088,50.921149,68.574627,...,06:00,13:15,06:00,13:15,06:00,13:15,07:00,14:15,07:00,14:15


In [80]:
# 'days_open' = per station, how many of the seven 'sales_open_<day>' columns
# hold a non-missing value (0 when no opening time is recorded at all).
stations_facilities['days_open'] = stations_facilities[
    [
        'sales_open_monday',
        'sales_open_tuesday',
        'sales_open_wednesday',
        'sales_open_thursday',
        'sales_open_friday',
        'sales_open_saturday',
        'sales_open_sunday',
    ]
].count(axis=1)
stations_facilities

Unnamed: 0,station_id,name_x,alternative-fr,alternative-nl,alternative-de,alternative-en,country-code,longitude,latitude,avg_stop_times,...,sales_close_wednesday,sales_open_thursday,sales_close_thursday,sales_open_friday,sales_close_friday,sales_open_saturday,sales_close_saturday,sales_open_sunday,sales_close_sunday,days_open
0,8811007,Schaarbeek/Schaerbeek,Schaerbeek,Schaarbeek,,,be,4.378636,50.878513,737.231343,...,,,,,,,,,,0
1,8811106,Evere,,,,,be,4.400965,50.867780,166.641791,...,,,,,,,,,,0
2,8811130,Haren-Sud/Haren-Zuid,Haren-Sud,Haren-Zuid,,,be,4.415357,50.889696,324.111940,...,,,,,,,,,,0
3,8811148,Buda,,,,,be,4.417074,50.907495,208.134328,...,,,,,,,,,,0
4,8811155,Haren,,,,,be,4.419978,50.888878,166.641791,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,8896503,Ieper,Ypres,,,,be,2.876593,50.847402,42.067164,...,13:15,06:00,13:15,06:00,13:15,07:00,14:15,,,6
570,8896735,Poperinge,,,,,be,2.736343,50.854449,41.544776,...,11:10,05:45,11:10,05:45,11:10,07:00,12:10,07:00,12:10,7
571,8896800,Roeselare,Roulers,,,,be,3.130412,50.949025,68.574627,...,20:00,06:15,20:00,06:15,20:00,06:30,20:00,06:30,20:00,7
572,8896909,Izegem,,,,,be,3.212088,50.921149,68.574627,...,13:15,06:00,13:15,06:00,13:15,07:00,14:15,07:00,14:15,7


In [81]:
# Rename the merged frame's 'name_x' column to 'station' so it can later be
# joined on a column of that name. The result is rebound instead of using
# inplace=True, so frame objects already displayed by earlier cells are not
# silently modified; a re-run is a no-op because rename ignores missing labels
# by default.
stations_facilities = stations_facilities.rename(columns={'name_x': 'station'})

In [85]:
# Lower-case the station names in both tables so the join on 'station'
# does not depend on letter case. Both frames are updated in place.
satisfaction['station'] = satisfaction['station'].str.lower()
stations_facilities['station'] = stations_facilities['station'].str.lower()

# Left join: every stations_facilities row is kept; rows with no matching
# 'station' in satisfaction get missing values in the satisfaction columns.
merged = stations_facilities.merge(satisfaction, on='station', how='left')

# Discard the 'Unnamed: 0' column (raises KeyError if it is not present).
merged = merged.drop('Unnamed: 0', axis='columns')
merged

Unnamed: 0,station_id,station,alternative-fr,alternative-nl,alternative-de,alternative-en,country-code,longitude,latitude,avg_stop_times,...,sales_open_thursday,sales_close_thursday,sales_open_friday,sales_close_friday,sales_open_saturday,sales_close_saturday,sales_open_sunday,sales_close_sunday,days_open,Avg Satisfaction
0,8811007,schaarbeek/schaerbeek,Schaerbeek,Schaarbeek,,,be,4.378636,50.878513,737.231343,...,,,,,,,,,0,
1,8811106,evere,,,,,be,4.400965,50.867780,166.641791,...,,,,,,,,,0,41.76
2,8811130,haren-sud/haren-zuid,Haren-Sud,Haren-Zuid,,,be,4.415357,50.889696,324.111940,...,,,,,,,,,0,
3,8811148,buda,,,,,be,4.417074,50.907495,208.134328,...,,,,,,,,,0,18.35
4,8811155,haren,,,,,be,4.419978,50.888878,166.641791,...,,,,,,,,,0,40.14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
569,8896503,ieper,Ypres,,,,be,2.876593,50.847402,42.067164,...,06:00,13:15,06:00,13:15,07:00,14:15,,,6,49.95
570,8896735,poperinge,,,,,be,2.736343,50.854449,41.544776,...,05:45,11:10,05:45,11:10,07:00,12:10,07:00,12:10,7,61.95
571,8896800,roeselare,Roulers,,,,be,3.130412,50.949025,68.574627,...,06:15,20:00,06:15,20:00,06:30,20:00,06:30,20:00,7,58.54
572,8896909,izegem,,,,,be,3.212088,50.921149,68.574627,...,06:00,13:15,06:00,13:15,07:00,14:15,07:00,14:15,7,60.08
