# Imports

In [39]:
import pandas as pd
import json

#Train/Test section
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import RandomForestRegressor

# Import Data

## GVB Data

In [45]:
# Load the complete GVB dataset into a DataFrame
gvb_df = pd.read_csv(filepath_or_buffer="../../../Data_thesis/Full_Datasets/GVB.csv")

In [46]:
# Preview the first five rows of the GVB data
gvb_df.head(n=5)

Unnamed: 0,Datum,UurgroepOmschrijving (van vertrek),VertrekHalteNaam,VertrekLat,VertrekLon,AankomstHalteNaam,AankomstLat,AankomstLon,weekday,is_weekend,AantalRitten
0,1/1/2018 12:00:00,100,Nieuwezijds Kolk,4.892841,52.375754,Overig,0.0,0.0,0,0,27
1,1/1/2018 12:00:00,100,Amstelstation,4.917514,52.346473,Nieuwmarkt,4.901239,52.371942,0,0,10
2,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Overig,0.0,0.0,0,0,44
3,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Amstelstation,4.917514,52.346473,0,0,30
4,1/1/2018 12:00:00,100,Nieuwmarkt,4.901239,52.371942,Bullewijk,4.952336,52.306422,0,0,29


## Amsterdam Event Data

In [43]:
# Load the complete events dataset into a DataFrame
events_df = pd.read_csv(filepath_or_buffer="../../../Data_thesis/Full_Datasets/Events.csv")

In [44]:
# Preview the first five rows of the events data
events_df.head(n=5)

Unnamed: 0,Event,Coordinates,Data
0,Springsnow Festival,"{'Latitude': '52,3726380', 'Longtitude': '4,89...","{'startdate': '20-04-2018', 'enddate': '20-05-..."
1,Vurige Tongen,"{'Latitude': '52,4103320', 'Longtitude': '4,74...","{'singles': ['20-05-2018', '21-05-2018']}"
2,Sneakerness,"{'Latitude': '52,3828340', 'Longtitude': '4,92...","{'singles': ['03-06-2018', '04-06-2018']}"
3,Dutch Raw Food Festival,"{'Latitude': '52,4362550', 'Longtitude': '4,81...",{'singles': ['17-06-2018']}
4,Holland Festival,"{'Latitude': '52,3615820', 'Longtitude': '4,88...","{'singles': ['02-06-2018', '03-06-2018', '04-0..."


# Data Preparation

## Train/Test Split
Split the dataset into training and test data, using a [function from Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

#### Train_Test_split Parameters
- *input*:
    - *x*: explanatory variables used to predict the labelled data
        - Hour slot
        - Departure and arrival stop coordinates (latitude/longitude)
        - Weekday (int)
        - Weekend (binary)
    - *y*: labelled data
        - Number of rides (int)
- *test_size*: Proportion of total data used as test data, rest is training data
- *random_state*: Number used as seed by the *Random Number Generator* 

In [47]:
# Drop the columns the model cannot use (raw date string and stop names).
# errors="ignore" keeps this cell re-runnable: gvb_df is reassigned here, so a
# second run would otherwise raise KeyError on the already-removed columns.
gvb_df = gvb_df.drop(
    columns=["Datum", "VertrekHalteNaam", "AankomstHalteNaam"],
    errors="ignore",
)

# Raw value matrix of the remaining columns (kept under its original name in
# case other cells still refer to it).
data = gvb_df.values

# Select the explanatory variables and the label by column name instead of by
# position, so a change in column order cannot silently shift the label.
# NOTE(review): in the sample rows the *Lat columns hold values around 4.9 and
# the *Lon columns values around 52.4, which looks swapped for Amsterdam;
# confirm against the data source. Rows with 0.0/0.0 arrival coordinates
# appear to belong to the "Overig" stop -- verify how these should be treated.
feature_columns = [
    "UurgroepOmschrijving (van vertrek)",
    "VertrekLat",
    "VertrekLon",
    "AankomstLat",
    "AankomstLon",
    "weekday",
    "is_weekend",
]
label_column = "AantalRitten"

# Cast to float so the arrays have the same dtype as slicing the mixed
# int/float value matrix produced before.
x = gvb_df[feature_columns].values.astype(float)
y = gvb_df[label_column].values.astype(float)

# Hold out 10% of the rows as test data; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=42)

In [48]:
# Show the explanatory variables of the first three training rows
x_train[0:3]

array([[1.30000000e+03, 4.88932991e+00, 5.23693307e+01, 4.89921803e+00,
        5.23781079e+01, 5.00000000e+00, 1.00000000e+00],
       [1.60000000e+03, 4.89373093e+00, 5.23762877e+01, 0.00000000e+00,
        0.00000000e+00, 6.00000000e+00, 1.00000000e+00],
       [1.80000000e+03, 4.89079364e+00, 5.23736779e+01, 4.86292316e+00,
        5.23648757e+01, 0.00000000e+00, 0.00000000e+00]])

In [49]:
# Show the response variable of the first three training rows
y_train[0:3]

array([32., 17., 10.])

In [50]:
# Preview the first five rows of the frame after the unusable columns were dropped
gvb_df.head(n=5)

Unnamed: 0,UurgroepOmschrijving (van vertrek),VertrekLat,VertrekLon,AankomstLat,AankomstLon,weekday,is_weekend,AantalRitten
0,100,4.892841,52.375754,0.0,0.0,0,0,27
1,100,4.917514,52.346473,4.901239,52.371942,0,0,10
2,100,4.901239,52.371942,0.0,0.0,0,0,44
3,100,4.901239,52.371942,4.917514,52.346473,0,0,30
4,100,4.901239,52.371942,4.952336,52.306422,0,0,29


# Models

## Random Forest Regressor
Implement the [Sklearn Version](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

Parameters
- *n_estimators*: Number of trees in the forest
- *criterion*: Function used to measure the quality of a split (mean squared error here)
- *n_jobs*: The number of jobs to run in parallel for both fit and predict
- *random_state*: Seed used by the random number generator
- *bootstrap*: Whether bootstrap samples are used when building trees

In [51]:
# Configure the random forest: 100 trees, 5 parallel jobs, bootstrap sampling
# and a fixed seed for reproducibility.
# The split criterion is left at its default, which is mean squared error.
# Passing criterion="mse" explicitly is rejected by current scikit-learn
# releases (the option was renamed to "squared_error"), whereas omitting it
# selects the same criterion on both old and new versions.
rfg = RandomForestRegressor(n_estimators=100, n_jobs=5, random_state=42, bootstrap=True)

In [52]:
# Train the random forest on the training split
rfg.fit(X=x_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=5,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [53]:
# Evaluate the model on the held-out test split.
# For a regressor, .score() returns the coefficient of determination (R^2),
# not a classification accuracy, so the printed label says so.
rfg_score = rfg.score(x_test, y_test)
print("R^2 score: ", rfg_score)

Accuracy:  0.8604274733629216
