# Full Dataset

## Imports

In [1]:
import pandas as pd
import numpy as np

## Import Files

### Crowdedness Dataset
Constructed in the [Crowdedness Notebook](Crowdedness.ipynb)

In [2]:
crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")

## Functions

In [25]:
def DateToDatetime(df, format):
    """
    Convert column to datetime object
    """

    return pd.to_datetime(df, format=format)

In [26]:
def StartEndDate(df):
    """
    Return the start and end date of a given df
    """

    #Return the earliest and latest date in Date column of given dataframe
    return df.min().Date, df.max().Date 

In [35]:
def importData():
    """
    Import Data from given file location and save as DF. 
    Furhtermore, change date from string to datetime object
    """

    #Format Datetime
    date_format = "%Y-%m-%d"

    #Crowdedness DF
    crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")
    crowd_df["Date"] = DateToDatetime(crowd_df["Date"], date_format)

    #GVB DF
    gvb_df = pd.read_csv("../../../Data_thesis/Full_Datasets/GVBData.csv")
    gvb_df["Date"] = DateToDatetime(gvb_df["Date"], date_format)

    #Event Df
    event_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Events.csv")
    event_df["Date"] = DateToDatetime(event_df["Date"], date_format)

    return crowd_df, gvb_df, event_df

In [28]:
def changeStartEndDate(crowd_df, gvb_df, event_df):
    """
    Change the start and end date of the GVB and Event Df's
    """
    
    #Select start and end date
    start_date, end_date = StartEndDate(crowd_df)

    gvb_df = gvb_df[(gvb_df["Date"] >= start_date) & (
        gvb_df["Date"] <= end_date)].reset_index().drop(columns=["index"])

    event_df = event_df[(event_df["Date"] >= start_date) & (
        event_df["Date"] <= end_date)].reset_index().drop(columns=["index"])

    return gvb_df, event_df

In [29]:
def formFullDF(crowd_df, gvb_df, event_df):
    """
    Construct Full DF 
    """

    #Combine DF's
    gvb_crowd_df = pd.merge(gvb_df, crowd_df, on=["Date", "Hour"], how="outer")
    full_df = pd.merge(gvb_crowd_df, event_df, on=["Date"], how="outer")

    #Sort keys on date
    full_df = full_df.sort_values(
        by=["Date"]).reset_index().drop(columns=["index"])

    #Fill NaN values with 0.0
    full_df = full_df.fillna(0.0)

    #Add columns for the cos and sin of month, day and year
    full_df = full_df.assign(Year=0, month_sin=0, month_cos=0,
                            day_sin=0, day_cos=0, hour_sin=0, hour_cos=0)

    #Transform DF to Dict
    time_dict = full_df.to_dict("index")

    #Transform Date to seperate year, month, day and hour. And transform month, day, hour to cos/sin to make it circular
    for k, v in time_dict.items():
        v["Year"] = v["Date"].year

        v["month_sin"] = np.sin(2 * np.pi * v["Date"].month / 12)
        v["month_cos"] = np.cos(2 * np.pi * v["Date"].month / 12)

        v["day_sin"] = np.sin(2 * np.pi * v["Date"].day / 365)
        v["day_cos"] = np.cos(2 * np.pi * v["Date"].day / 365)

        v["hour_sin"] = np.sin(2 * np.pi * v["Hour"] / 2400)
        v["hour_cos"] = np.cos(2 * np.pi * v["Hour"] / 2400)

    #Transform dict back to DF
    return pd.DataFrame.from_dict(time_dict, orient="index").reset_index()

## Construct needed DF

In [36]:
crowd_df, gvb_df, event_df = importData()

## Change Start en 

#### Convert Date from string to datetime object

In [3]:
crowd_df["Date"] = pd.to_datetime(crowd_df["Date"], format="%Y-%m-%d")

#### Set Start and End Date
Accroding to the ranges in the crowd_df, as these consist of the target values and represent the smallest range in dates

In [4]:
# Start data
start_date = crowd_df.min().Date

# End date
end_date = crowd_df.max().Date

#### Contents

In [5]:
crowd_df.head()

Unnamed: 0,Sensor,Date,Hour,SensorLongitude,SensorLatitude,CrowdednessCount
0,GAWW-01,2018-03-11,100,8,8,3133
1,GAWW-01,2018-03-11,200,8,8,2120
2,GAWW-01,2018-03-11,300,8,8,1419
3,GAWW-01,2018-03-11,400,8,8,1085
4,GAWW-01,2018-03-11,500,8,8,498


### GVB Dataset
Constructed in the [GVBData Notebook](GVBData.ipynb)

In [7]:
gvb_df = pd.read_csv("../../../Data_thesis/Full_Datasets/GVBData.csv")

#### Convert Date from string to datetime object

In [8]:
gvb_df["Date"] = pd.to_datetime(gvb_df["Date"], format="%Y-%m-%d")

#### Only select date between start and end data

In [9]:
gvb_df = gvb_df[(gvb_df["Date"] > start_date) & (gvb_df["Date"] < end_date)].reset_index().drop(columns=["index"])

#### Contents

In [10]:
gvb_df.head()

Unnamed: 0,Date,Hour,Nieuwmarkt Arrivals,Nieuwezijds Kolk Arrivals,Dam Arrivals,Spui Arrivals,Nieuwmarkt Departures,Nieuwezijds Kolk Departures,Dam Departures,Spui Departures,weekday,is_weekend
0,2019-01-01,2400,10.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,1,0
1,2019-01-01,100,25.0,0.0,0.0,0.0,531.0,27.0,17.0,0.0,1,0
2,2019-01-01,200,68.0,0.0,16.0,0.0,329.0,29.0,75.0,0.0,1,0
3,2019-01-01,300,19.0,0.0,0.0,0.0,151.0,43.0,85.0,0.0,1,0
4,2019-01-01,400,21.0,0.0,0.0,0.0,89.0,17.0,75.0,0.0,1,0


### Amsterdam Events
Constructed in the [AmsterdamEvent Notebook](AmsterdamEvent.ipynb)

In [15]:
events_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Events.csv")

#### Convert Date from string to datetime object

In [16]:
events_df["Date"] = pd.to_datetime(events_df["Date"], format="%Y-%m-%d")

#### Only select date between start and end data

In [17]:
events_df = events_df[(events_df["Date"] > start_date) & (events_df["Date"] < end_date)].reset_index().drop(columns=["index"])

#### Contenst

In [18]:
events_df.head()

Unnamed: 0,Date,is_event
0,2018-04-20,1.0
1,2018-05-20,1.0
2,2018-06-02,1.0
3,2018-06-03,1.0
4,2018-06-04,1.0


## Combine Datasets

In [19]:
gvb_crowd_df = pd.merge(gvb_df, crowd_df, on=["Date", "Hour"], how="outer")
full_df = pd.merge(gvb_crowd_df, events_df, on=["Date"], how="outer")

full_df = full_df.sort_values(by=["Date"]).reset_index().drop(columns=["index"])

## Transform Df

### Fill NaN with 0.0

In [20]:
full_df = full_df.fillna(0.0)

### Make Time Circular

In [21]:
#Add columns for the cos and sin of month, day and year
full_df = full_df.assign(Year=0, month_sin=0, month_cos=0, day_sin=0, day_cos=0, hour_sin=0, hour_cos=0)

#Transform DF to Dict
time_dict = full_df.to_dict("index")

#Transform Date to seperate year, month, day and hour. And transform month, day, hour to cos/sin to make it circular
for k, v in time_dict.items():    
    v["Year"] = v["Date"].year
    
    v["month_sin"] = np.sin(2 * np.pi * v["Date"].month / 12)
    v["month_cos"] = np.cos(2 * np.pi * v["Date"].month / 12)
    
    v["day_sin"] = np.sin(2 * np.pi * v["Date"].day / 365)
    v["day_cos"] = np.cos(2 * np.pi * v["Date"].day / 365)

    v["hour_sin"] = np.sin(2 * np.pi * v["Hour"] / 2400)
    v["hour_cos"] = np.cos(2 * np.pi * v["Hour"] / 2400)
    
#Transform dict back to DF    
full_df = pd.DataFrame.from_dict(time_dict, orient="index").reset_index()

### Contents

In [23]:
full_df.head()

Unnamed: 0,index,Date,Hour,Nieuwmarkt Arrivals,Nieuwezijds Kolk Arrivals,Dam Arrivals,Spui Arrivals,Nieuwmarkt Departures,Nieuwezijds Kolk Departures,Dam Departures,...,SensorLatitude,CrowdednessCount,is_event,Year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos
0,0,2018-03-11,600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,9.0,22.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,1.0,6.123234000000001e-17
1,1,2018-03-11,900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,40.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.707107,-0.7071068
2,2,2018-03-11,900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,14.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.707107,-0.7071068
3,3,2018-03-11,900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.0,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.707107,-0.7071068
4,4,2018-03-11,900,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,8.0,125.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.707107,-0.7071068


## From DF to CSV File

In [24]:
full_df.to_csv("../../../Data_thesis/Full_Datasets/Full.csv", index=False)