# Full Dataset

## Imports

In [1]:
import pandas as pd
import numpy as np

## Import Files

### Crowdedness Dataset
Constructed in the [Crowdedness Notebook](Crowdedness.ipynb)

In [2]:
crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")

#### Convert Date from string to datetime object

In [3]:
crowd_df["Date"] = pd.to_datetime(crowd_df["Date"], format="%Y-%m-%d")

#### Set Start and End Date
Accroding to the ranges in the crowd_df, as these consist of the target values and represent the smallest range in dates

In [4]:
# Start data
start_date = crowd_df.min().Date

# End date
end_date = crowd_df.max().Date

#### Contents

In [5]:
crowd_df.head()

Unnamed: 0,Sensor,Date,Hour,SensorLongitude,SensorLatitude,CrowdednessCount
0,GAWW-01,2018-03-11,100,4.899847,52.374601,3133
1,GAWW-01,2018-03-11,200,4.899847,52.374601,2120
2,GAWW-01,2018-03-11,300,4.899847,52.374601,1419
3,GAWW-01,2018-03-11,400,4.899847,52.374601,1085
4,GAWW-01,2018-03-11,500,4.899847,52.374601,498


### GVB Dataset
Constructed in the [GVBData Notebook](GVBData.ipynb)

In [6]:
gvb_df = pd.read_csv("../../../Data_thesis/Full_Datasets/GVB.csv")

#### Convert Date from string to datetime object

In [7]:
gvb_df["Date"] = pd.to_datetime(gvb_df["Date"], format="%Y-%m-%d")

#### Only select date between start and end data

In [8]:
gvb_df = gvb_df[(gvb_df["Date"] > start_date) & (gvb_df["Date"] < end_date)].reset_index().drop(columns=["index"])

#### Contents

In [9]:
gvb_df.head()

Unnamed: 0,Date,Hour,NieuwmarktCode,NieuwmarktLat,NieuwmarktLon,NieuwmarktArrivals,NieuwmarktDepartures,NieuwezijdsCode,NieuwezijdsLat,NieuwezijdsLon,...,DamLon,DamArrivals,DamDepartures,SpuiCode,SpuiLat,SpuiLon,SpuiArrivals,SpuiDepartures,weekday,is_weekend
0,2018-04-01,2400,NMT,52.371942,4.901239,31.0,115.0,5069,52.376288,4.893731,...,4.89008,43.0,254.0,5062,52.369097,4.889259,0.0,46.0,6,1
1,2018-04-01,600,NMT,52.371942,4.901239,67.0,50.0,5069,52.376288,4.893731,...,4.89008,129.0,0.0,5062,52.369097,4.889259,11.0,0.0,6,1
2,2018-04-01,700,NMT,52.371942,4.901239,99.0,120.0,5069,52.376288,4.893731,...,4.89008,174.0,57.0,5062,52.369097,4.889259,54.0,17.0,6,1
3,2018-04-01,800,NMT,52.371942,4.901239,222.0,236.0,5069,52.376288,4.893731,...,4.89008,491.0,111.0,5062,52.369097,4.889259,139.0,64.0,6,1
4,2018-04-01,900,NMT,52.371942,4.901239,269.0,169.0,5069,52.376288,4.893731,...,4.89008,764.0,151.0,5062,52.369097,4.889259,215.0,61.0,6,1


### Amsterdam Events
Constructed in the [AmsterdamEvent Notebook](AmsterdamEvent.ipynb)

In [10]:
events_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Events.csv")

#### Convert Date from string to datetime object

In [11]:
events_df["Date"] = pd.to_datetime(events_df["Date"], format="%Y-%m-%d")

#### Only select date between start and end data

In [12]:
events_df = events_df[(events_df["Date"] > start_date) & (events_df["Date"] < end_date)].reset_index().drop(columns=["index"])

#### Contenst

In [13]:
events_df.head()

Unnamed: 0,Date,is_event,event_lat,event_lon
0,2018-04-20,1.0,52.372638,4.894106
1,2018-05-20,1.0,52.372638,4.894106
2,2018-06-02,1.0,52.361582,4.885479
3,2018-06-03,1.0,52.361582,4.885479
4,2018-06-04,1.0,52.361582,4.885479


## Combine Datasets

In [14]:
gvb_crowd_df = pd.merge(gvb_df, crowd_df, on=["Date", "Hour"], how="outer")
full_df = pd.merge(gvb_crowd_df, events_df, on=["Date"], how="outer")

full_df = full_df.sort_values(by=["Date"]).reset_index().drop(columns=["index"])

## Transform Df

### Fill NaN with 0.0

In [15]:
full_df = full_df.fillna(0.0)

### Make Time Circular

In [16]:
#Add columns for the cos and sin of month, day and year
full_df = full_df.assign(Year=0, month_sin=0, month_cos=0, day_sin=0, day_cos=0, hour_sin=0, hour_cos=0)

#Transform DF to Dict
time_dict = full_df.to_dict("index")

#Transform Date to seperate year, month, day and hour. And transform month, day, hour to cos/sin to make it circular
for k, v in time_dict.items():    
    v["Year"] = v["Date"].year
    
    v["month_sin"] = np.sin(2 * np.pi * v["Date"].month / 12)
    v["month_cos"] = np.cos(2 * np.pi * v["Date"].month / 12)
    
    v["day_sin"] = np.sin(2 * np.pi * v["Date"].day / 365)
    v["day_cos"] = np.cos(2 * np.pi * v["Date"].day / 365)

    v["hour_sin"] = np.sin(2 * np.pi * v["Hour"] / 2400)
    v["hour_cos"] = np.cos(2 * np.pi * v["Hour"] / 2400)
    
#Transform dict back to DF    
full_df = pd.DataFrame.from_dict(time_dict, orient="index").reset_index()

### Change order columns

In [17]:
cols = ["Date", "Hour",'is_event', 'event_lat', 'event_lon', 'NieuwmarktLat', 'NieuwmarktLon', 
        'NieuwmarktArrivals', 'NieuwmarktDepartures', 'NieuwezijdsLat', 'NieuwezijdsLon', 
        'NieuwezijdsArrivals', 'NieuwezijdsDepartures', 'DamLat', 'DamLon', 'DamArrivals', 
        'DamDepartures', 'SpuiLat', 'SpuiLon', 'SpuiArrivals', 'SpuiDepartures', 'weekday', 'is_weekend',
        'SensorLongitude', 'SensorLatitude', 'Year', 'month_sin', 'month_cos', 
        'day_sin', 'day_cos', 'hour_sin', 'hour_cos', 'CrowdednessCount']

full_df = full_df[cols]

### Contents

In [18]:
full_df.head()

Unnamed: 0,Date,Hour,is_event,event_lat,event_lon,NieuwmarktLat,NieuwmarktLon,NieuwmarktArrivals,NieuwmarktDepartures,NieuwezijdsLat,...,SensorLongitude,SensorLatitude,Year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos,CrowdednessCount
0,2018-03-11,1200,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.898903,52.373786,2018,1.0,6.123234000000001e-17,0.188227,0.982126,1.224647e-16,-1.0,0.0
1,2018-03-11,1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.897193,52.37165,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,-0.707107,45.0
2,2018-03-11,1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.898479,52.37504,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,-0.707107,68.0
3,2018-03-11,1500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.898808,52.372369,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,-0.707107,25.0
4,2018-03-11,1600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.899847,52.374601,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.8660254,-0.5,2404.0


## From DF to CSV File

In [19]:
full_df.to_csv("../../../Data_thesis/Full_Datasets/Full.csv", index=False)