# Full Dataset

## Imports

In [1]:
import pandas as pd
import numpy as np

## Import Files

### Crowdedness Dataset
Constructed in the [Crowdedness Notebook](Crowdedness.ipynb)

In [2]:
crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")

## Functions

In [25]:
def DateToDatetime(df, format):
    """
    Convert column to datetime object
    """

    return pd.to_datetime(df, format=format)

In [26]:
def StartEndDate(df):
    """
    Return the start and end date of a given df
    """

    #Return the earliest and latest date in Date column of given dataframe
    return df.min().Date, df.max().Date 

In [35]:
def importData():
    """
    Import Data from given file location and save as DF. 
    Furhtermore, change date from string to datetime object
    """

    #Format Datetime
    date_format = "%Y-%m-%d"

    #Crowdedness DF
    crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")
    crowd_df["Date"] = DateToDatetime(crowd_df["Date"], date_format)

    #GVB DF
    gvb_df = pd.read_csv("../../../Data_thesis/Full_Datasets/GVBData.csv")
    gvb_df["Date"] = DateToDatetime(gvb_df["Date"], date_format)

    #Event Df
    event_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Events.csv")
    event_df["Date"] = DateToDatetime(event_df["Date"], date_format)

    return crowd_df, gvb_df, event_df

In [28]:
def changeStartEndDate(crowd_df, gvb_df, event_df):
    """
    Change the start and end date of the GVB and Event Df's
    """
    
    #Select start and end date
    start_date, end_date = StartEndDate(crowd_df)

    gvb_df = gvb_df[(gvb_df["Date"] >= start_date) & (
        gvb_df["Date"] <= end_date)].reset_index().drop(columns=["index"])

    event_df = event_df[(event_df["Date"] >= start_date) & (
        event_df["Date"] <= end_date)].reset_index().drop(columns=["index"])

    return gvb_df, event_df

In [29]:
def formFullDF(crowd_df, gvb_df, event_df):
    """
    Construct Full DF 
    """

    #Combine DF's
    gvb_crowd_df = pd.merge(gvb_df, crowd_df, on=["Date", "Hour"], how="outer")
    full_df = pd.merge(gvb_crowd_df, event_df, on=["Date"], how="outer")

    #Sort keys on date
    full_df = full_df.sort_values(
        by=["Date"]).reset_index().drop(columns=["index"])

    #Fill NaN values with 0.0
    full_df = full_df.fillna(0.0)

    #Add columns for the cos and sin of month, day and year
    full_df = full_df.assign(Year=0, month_sin=0, month_cos=0,
                            day_sin=0, day_cos=0, hour_sin=0, hour_cos=0)

    #Transform DF to Dict
    time_dict = full_df.to_dict("index")

    #Transform Date to seperate year, month, day and hour. And transform month, day, hour to cos/sin to make it circular
    for k, v in time_dict.items():
        v["Year"] = v["Date"].year

        v["month_sin"] = np.sin(2 * np.pi * v["Date"].month / 12)
        v["month_cos"] = np.cos(2 * np.pi * v["Date"].month / 12)

        v["day_sin"] = np.sin(2 * np.pi * v["Date"].day / 365)
        v["day_cos"] = np.cos(2 * np.pi * v["Date"].day / 365)

        v["hour_sin"] = np.sin(2 * np.pi * v["Hour"] / 2400)
        v["hour_cos"] = np.cos(2 * np.pi * v["Hour"] / 2400)

    #Transform dict back to DF
    return pd.DataFrame.from_dict(time_dict, orient="index").reset_index()

## Construct needed DF

In [36]:
crowd_df, gvb_df, event_df = importData()

## Change Start and End Dates

In [37]:
gvb_df, event_df = changeStartEndDate(crowd_df, gvb_df, event_df)

## Construct Full DF

In [38]:
full_df = formFullDF(crowd_df, gvb_df, event_df)

### Contents

In [39]:
full_df.head()

Unnamed: 0,index,Date,Hour,Nieuwmarkt Arrivals,Nieuwezijds Kolk Arrivals,Dam Arrivals,Spui Arrivals,Nieuwmarkt Departures,Nieuwezijds Kolk Departures,Dam Departures,...,SensorLatitude,CrowdednessCount,is_event,Year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos
0,0,2018-03-11,1100,681.0,198.0,1075.0,165.0,149.0,116.0,496.0,...,52.373786,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.258819,-0.965926
1,1,2018-03-11,700,59.0,19.0,59.0,15.0,28.0,12.0,10.0,...,52.373786,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.965926,-0.258819
2,2,2018-03-11,800,123.0,42.0,218.0,43.0,57.0,24.0,47.0,...,52.373786,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.866025,-0.5
3,3,2018-03-11,900,248.0,108.0,494.0,94.0,91.0,67.0,115.0,...,52.373786,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.707107,-0.707107
4,4,2018-03-11,1000,449.0,130.0,817.0,99.0,107.0,105.0,290.0,...,52.373786,0.0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.5,-0.866025


## From DF to CSV File

In [40]:
full_df.to_csv("../../../Data_thesis/Full_Datasets/Full.csv", index=False)