# Imports

In [1]:
import pandas as pd
import json

#Train/Test section
from sklearn.model_selection import train_test_split

#Models
from sklearn.ensemble import RandomForestRegressor

# Import Data

## GVB Data

In [10]:
# Read the full GVB dataset from its CSV file into a DataFrame
gvb_path = "../../../Data_thesis/Full_Datasets/GVB.csv"
gvb_df = pd.read_csv(gvb_path)

In [11]:
# Preview the first five rows of the GVB frame
gvb_df.head(n=5)

Unnamed: 0,Datum,Uurgroep,AankomstHalteNaam,AankomstLat,AankomstLon,AantalAankomsten,VertrekHalteNaam,VertrekLat,VertrekLon,AantalVertrekken,weekday,is_weekend,AantalReizigers
0,2018-02-05,0,Osdorpplein,4.803229,52.359132,19,Dam,4.891309,52.373554,87.0,0,0,106.0
1,2018-02-05,0,Meer en Vaart,4.809944,52.356369,42,Dam,4.891245,52.372727,72.0,0,0,114.0
2,2018-02-05,0,Dam,4.895494,52.374929,14,E. Wolffstraat,4.867279,52.370288,11.0,0,0,25.0
3,2018-02-05,0,Dam,4.890646,52.372446,22,Bilderdijkstraat,4.869857,52.370829,49.0,0,0,71.0
4,2018-02-05,0,Nieuwezijds Kolk,4.893731,52.376288,23,De Clercqstraat,4.870403,52.370616,49.0,0,0,72.0


## Amsterdam Event Data

In [4]:
# Read the full events dataset from its CSV file into a DataFrame
events_path = "../../../Data_thesis/Full_Datasets/Events.csv"
events_df = pd.read_csv(events_path)

In [5]:
# Preview the first five rows of the events frame
events_df.head(n=5)

Unnamed: 0,Event,Latitude,Longtitude,Data
0,Springsnow Festival,523726380,48941060,"{'startdate': '20-04-2018', 'enddate': '20-05-..."
1,Vurige Tongen,524103320,47490690,"{'singles': ['20-05-2018', '21-05-2018']}"
2,Sneakerness,523828340,49204560,"{'singles': ['03-06-2018', '04-06-2018']}"
3,Dutch Raw Food Festival,524362550,48167080,{'singles': ['17-06-2018']}
4,Holland Festival,523615820,48854790,"{'singles': ['02-06-2018', '03-06-2018', '04-0..."


## Crowdedness Data

In [6]:
# Read the full crowdedness dataset from its CSV file into a DataFrame
crowd_path = "../../../Data_thesis/Full_Datasets/Crowdedness.csv"
crowd_df = pd.read_csv(crowd_path)

In [7]:
# Preview the first five rows of the crowdedness frame
crowd_df.head(n=5)

Unnamed: 0,richting,datum,jaar,maand,dag,uur,SampleCount
0,2,2018-03-11,2018,3,11,0,0
1,2,2018-03-11,2018,3,11,1,0
2,2,2018-03-11,2018,3,11,2,0
3,2,2018-03-11,2018,3,11,3,0
4,2,2018-03-11,2018,3,11,4,0


# Data Preparation

## Train/Test Split
Split the dataset into training and test data, using a [function from Sklearn](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

#### Train_Test_split Parameters
- *input*: 
    - *x*: data used to predict the labelled data
        - Hour slot
        - Arrival and departure stop coordinates (latitude/longitude)
        - Weekday (int)
        - Weekend (Binary)
    - *y*: labelled data
        - Number of passengers (`AantalReizigers`)
- *test_size*: Proportion of total data used as test data, rest is training data
- *random_state*: Number used as seed by the *Random Number Generator* 

In [8]:
#Keep only the usable columns, selected by name. This removes the same columns
#the previous drop did ("Datum", "VertrekHalteNaam", "AankomstHalteNaam",
#"AantalAankomsten", "AantalVertrekken"), but the cell can now be re-run:
#dropping columns that are already gone raises a KeyError, selecting does not.
#NOTE(review): the *Lat columns hold values around 4.8 and the *Lon columns
#values around 52.3, which looks swapped for Amsterdam - confirm upstream.
feature_columns = ["Uurgroep", "AankomstLat", "AankomstLon", "VertrekLat", "VertrekLon", "weekday", "is_weekend"]
label_column = "AantalReizigers"
gvb_df = gvb_df[feature_columns + [label_column]]

#Select only the values from each row (features first, label last)
data = gvb_df.values

#Split the labels from the rest, by column name instead of by position so a
#change in column layout fails loudly rather than silently shifting the label
x = gvb_df[feature_columns].values
y = gvb_df[label_column].values

#Hold out 10% of the rows as test data; the fixed seed keeps the split reproducible
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 42)

In [9]:
# Show the explanatory variables of the first three training rows
x_train[0:3]

array([[1.50000000e+03, 4.91829259e+00, 5.23853212e+01, 4.88932991e+00,
        5.23693307e+01, 0.00000000e+00, 0.00000000e+00],
       [1.70000000e+03, 4.95544506e+00, 5.23350145e+01, 4.89373093e+00,
        5.23762877e+01, 2.00000000e+00, 0.00000000e+00],
       [2.10000000e+03, 4.78689401e+00, 5.23555627e+01, 4.89027056e+00,
        5.23731814e+01, 2.00000000e+00, 0.00000000e+00]])

In [10]:
# Show the response variable of the first three training rows
y_train[0:3]

array([34., 35., 62.])

In [11]:
# Preview the first five rows of the GVB frame used for the train/test split
gvb_df.head(n=5)

Unnamed: 0,Uurgroep,AankomstLat,AankomstLon,VertrekLat,VertrekLon,weekday,is_weekend,AantalReizigers
0,0,4.803229,52.359132,4.891309,52.373554,0,0,106.0
1,0,4.809944,52.356369,4.891245,52.372727,0,0,114.0
2,0,4.895494,52.374929,4.867279,52.370288,0,0,25.0
3,0,4.890646,52.372446,4.869857,52.370829,0,0,71.0
4,0,4.893731,52.376288,4.870403,52.370616,0,0,72.0


# Models

## Random Forest Regressor
Implement the [Sklearn Version](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

Parameters
- *n_estimators*: Number of trees in the model
- *criterion*: Function used to measure the quality of a split (mean squared error here)
- *n_jobs*: The number of jobs to run in parallel for both fit and predict
- *random_state*: Seed used by the random number generator
- *bootstrap*: Whether bootstrap samples are used when building trees

In [12]:
#Set parameters model
#The split criterion is left at its default, mean squared error. The literal
#"mse" was deprecated in scikit-learn 1.0 and removed in 1.2 (renamed to
#"squared_error"), so passing it breaks on current releases; omitting it
#builds the same model on both old and new versions.
rfg = RandomForestRegressor(n_estimators=100, n_jobs=5, random_state=42, bootstrap=True)

In [13]:
# Train the random forest on the training split
rfg.fit(X=x_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=5,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [14]:
#Score the model
#For a regressor, score() returns the coefficient of determination (R^2) of the
#predictions on the held-out test data. It is not a classification accuracy,
#so the printed label names the metric explicitly.
rfg_score = rfg.score(x_test, y_test)
print("R^2 score: ", rfg_score)

Accuracy:  0.6591519389178006
