# Combine Data

## Imports

In [2]:
import pandas as pd
import numpy as np
import pickle

from sklearn.metrics.pairwise import rbf_kernel
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

## Import Files

In [3]:
# Load the crowdedness, GVB and event datasets from CSV into DataFrames.
# The "Date" columns are still plain strings here; importData converts them.
crowd_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Crowdedness.csv")
gvb_df = pd.read_csv("../../../Data_thesis/Full_Datasets/GVBData.csv")
event_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Events.csv")

## Functions

In [4]:
def DateToDatetime(df, format):
    """
    Parse the given values (typically a DataFrame column holding date
    strings) into datetime objects.

    df: the values to convert; passed straight to pd.to_datetime
    format: strftime-style pattern describing how the dates are written
    """

    converted = pd.to_datetime(df, format=format)
    return converted

In [5]:
def StartEndDate(df1, df2):
    """
    Return the earliest date of df1 and the latest date of df2.

    Both dataframes need a "Date" column. Only that column is reduced, so
    the other columns (e.g. text columns containing mixed values) can
    neither slow the lookup down nor make it fail.

    Returns a tuple (start_date, end_date).
    """

    #Earliest date comes from the first dataframe, latest date from the second
    return df1["Date"].min(), df2["Date"].max()

In [6]:
def importData(crowd_df, gvb_df, event_df):
    """
    Convert the "Date" column of the Crowdedness, GVB and Event DF's from
    strings (written as YYYY-MM-DD) to datetime objects.

    The given DF's are modified in place and returned in the same order.
    """

    #All three datasets write their dates the same way
    date_format = "%Y-%m-%d"

    #Crowdedness DF, GVB DF and Event DF get the identical conversion
    for frame in (crowd_df, gvb_df, event_df):
        frame["Date"] = DateToDatetime(frame["Date"], date_format)

    return crowd_df, gvb_df, event_df

In [7]:
def changeStartEndDate(crowd_df, gvb_df, event_df):
    """
    Trim the Crowdedness, GVB and Event DF's to one shared date range.

    The range is the one returned by StartEndDate(crowd_df, gvb_df);
    both boundaries are inclusive. Each returned DF gets a fresh index.
    """

    #Boundaries of the period to keep
    start_date, end_date = StartEndDate(crowd_df, gvb_df)

    def keepPeriod(df):
        #Keep the rows dated within the period and renumber them from 0
        in_period = (df["Date"] >= start_date) & (df["Date"] <= end_date)
        return df[in_period].reset_index().drop(columns=["index"])

    return keepPeriod(crowd_df), keepPeriod(gvb_df), keepPeriod(event_df)

In [8]:
def calculateWeights(stations, df, scaler_path="../../../Data_thesis/Models/station_scaler.sav"):
    """
    Compute a standardized similarity weight for every (sensor, station) pair.

    For each unique value in df["Sensor"], the RBF kernel between the sensor
    coordinates (first row found for that sensor) and the coordinates of each
    station (read from row 0 of the "<station> Lat" / "<station> Lon" columns)
    is calculated. All kernel values together are then standardized with a
    StandardScaler, and the fitted scaler is pickled to scaler_path.

    stations: list of station names; df needs "<station> Lat" and
        "<station> Lon" columns for each of them
    df: dataframe with "Sensor", "SensorLatitude" and "SensorLongitude" columns
    scaler_path: file the fitted StandardScaler is written to

    Returns a dict {sensor: {station: weight}} in which every weight is a
    1-element NumPy array (one row of the scaled column vector).
    """

    sensors = df["Sensor"].unique()

    #Station coordinates do not depend on the sensor, so build them only once
    station_coords = {
        station: np.array([df[station + " Lat"][0], df[station + " Lon"][0]]).reshape(1, -1)
        for station in stations
    }

    raw_weights = []
    weights_dict = {}

    for sensor in sensors:
        #Coordinates of the sensor, taken from the first row recorded for it
        sensor_rows = df.loc[df["Sensor"] == sensor, ["SensorLatitude", "SensorLongitude"]]
        y = sensor_rows.iloc[[0]].to_numpy(dtype=float)

        stations_dict = {}

        for station in stations:
            raw_weights.append(rbf_kernel(station_coords[station], y)[0, 0])

            #Remember the position of this weight; replaced by the scaled value below
            stations_dict[station] = len(raw_weights) - 1

        weights_dict[sensor] = stations_dict

    #The scaler expects a single feature column
    weights = np.asarray(raw_weights).reshape(-1, 1)

    scaler = StandardScaler()
    weights = scaler.fit_transform(weights)

    #Context manager makes sure the file is flushed and closed
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

    #Swap the stored positions for the scaled weights
    for stations_dict in weights_dict.values():
        for station in stations:
            stations_dict[station] = weights[stations_dict[station]]

    return weights_dict

In [9]:
def formFullDF(crowd_df, gvb_df, event_df, stations):
    """
    Construct Full DF

    Outer-merges the GVB and Crowdedness DF's on Date, Hour and weekday,
    outer-merges the result with the Event DF on Date, sorts on Date and
    fills all missing values with 0.0. After that it adds:

    - Year, plus sin/cos encodings of the month, the day of the month and
      the hour, so that these features are circular
    - per station a "<station> weight" column (weight between the sensor of
      the row and the station, see calculateWeights) and a "<station> score"
      column (weight * (arrivals + departures))

    The "<station> Arrivals" / "<station> Departures" columns are dropped
    from the result.

    stations: list of station names that have Lat, Lon, Arrivals and
        Departures columns in the merged data
    """

    #Combine DF's
    gvb_crowd_df = pd.merge(gvb_df, crowd_df, on=["Date", "Hour", "weekday"], how="outer")
    full_df = pd.merge(gvb_crowd_df, event_df, on=["Date"], how="outer")

    #Sort keys on date
    full_df = full_df.sort_values(by=["Date"]).reset_index(drop=True)

    #Fill NaN values with 0.0
    full_df = full_df.fillna(0.0)

    #Weight of every station for every sensor
    weights = calculateWeights(stations, full_df)

    #Transform Date to separate year, month, day and hour. And transform month, day, hour to cos/sin to make it circular
    date_parts = full_df["Date"].dt
    month_angle = 2 * np.pi * date_parts.month / 12

    #Date.day is the day of the month, so one full cycle is the length of that
    #month (dividing by 365 would cover less than a tenth of the circle)
    day_angle = 2 * np.pi * date_parts.day / date_parts.days_in_month

    #Hour appears to be stored as HH00 (values such as 100 and 2100 occur), hence the period of 2400
    hour_angle = 2 * np.pi * full_df["Hour"] / 2400

    full_df = full_df.assign(
        Year=date_parts.year,
        month_sin=np.sin(month_angle),
        month_cos=np.cos(month_angle),
        day_sin=np.sin(day_angle),
        day_cos=np.cos(day_angle),
        hour_sin=np.sin(hour_angle),
        hour_cos=np.cos(hour_angle),
    )

    for station in stations:
        #calculateWeights stores every weight as a 1-element array; unwrap it once per sensor
        weight_per_sensor = {
            sensor: np.asarray(station_weights[station]).item()
            for sensor, station_weights in weights.items()
        }

        row_weight = full_df["Sensor"].map(lambda sensor: weight_per_sensor[sensor])
        passengers = full_df[station + " Arrivals"] + full_df[station + " Departures"]

        full_df[station + " score"] = row_weight * passengers
        full_df[station + " weight"] = row_weight

    #The raw passenger counts are replaced by the scores
    passenger_columns = [
        station + suffix
        for station in stations
        for suffix in (" Arrivals", " Departures")
    ]

    return full_df.drop(columns=passenger_columns)

## Convert Date Columns to Datetime

In [10]:
# Convert the "Date" column of all three DataFrames from string to datetime
crowd_df, gvb_df, event_df = importData(crowd_df, gvb_df, event_df)

## Change Start and End Dates

In [11]:
# Keep only the rows between the earliest crowdedness date and the latest GVB date
crowd_df, gvb_df, event_df = changeStartEndDate(crowd_df, gvb_df, event_df)

## Construct Full DF

In [12]:
# Station names; formFullDF and calculateWeights look up the "<station> Lat",
# "<station> Lon", "<station> Arrivals" and "<station> Departures" columns for each
stations = ["Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui", "Centraal Station"]

# Merge the three DataFrames and add the time encodings and station scores/weights
full_df = formFullDF(crowd_df, gvb_df, event_df, stations)

In [13]:
# List the columns of the combined DataFrame
full_df.columns

Index(['Date', 'Hour', 'Nieuwmarkt Lat', 'Nieuwmarkt Lon',
       'Nieuwezijds Kolk Lat', 'Nieuwezijds Kolk Lon', 'Dam Lat', 'Dam Lon',
       'Spui Lat', 'Spui Lon', 'Centraal Station Lat', 'Centraal Station Lon',
       'weekday', 'is_weekend', 'Sensor', 'SensorLongitude', 'SensorLatitude',
       'CrowdednessCount', 'LonScaled', 'LatScaled', 'is_event', 'Year',
       'month_sin', 'month_cos', 'day_sin', 'day_cos', 'hour_sin', 'hour_cos',
       'Nieuwmarkt score', 'Nieuwmarkt weight', 'Nieuwezijds Kolk score',
       'Nieuwezijds Kolk weight', 'Dam score', 'Dam weight', 'Spui score',
       'Spui weight', 'Centraal Station score', 'Centraal Station weight'],
      dtype='object')

### Contents

In [14]:
# Preview the first rows of the combined DataFrame
full_df.head()

Unnamed: 0,Date,Hour,Nieuwmarkt Lat,Nieuwmarkt Lon,Nieuwezijds Kolk Lat,Nieuwezijds Kolk Lon,Dam Lat,Dam Lon,Spui Lat,Spui Lon,...,Nieuwmarkt score,Nieuwmarkt weight,Nieuwezijds Kolk score,Nieuwezijds Kolk weight,Dam score,Dam weight,Spui score,Spui weight,Centraal Station score,Centraal Station weight
0,2018-03-11,100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,0.0,0.882141,0.0,0.587521,-7.303229,-0.070905,-0.0,-0.681885,163.979433,0.34668
1,2018-03-11,2100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,352.242352,1.015108,83.818349,0.421198,-448.251607,-0.35379,-130.349894,-0.97276,1668.972876,0.432376
2,2018-03-11,2100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,384.307874,1.107515,8.440208,0.042413,-1899.118956,-1.49891,-352.192566,-2.628303,3994.026557,1.034722
3,2018-03-11,2200,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,342.287656,1.049962,42.31473,0.253382,-1231.740804,-1.207589,-279.71009,-2.350505,3446.831547,1.043862
4,2018-03-11,2200,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,343.52431,1.053756,73.166332,0.438122,-730.059883,-0.715745,-196.552679,-1.651703,2770.30116,0.838977


## From DF to CSV File

In [28]:
# Write the combined DataFrame to CSV; the row index is left out
full_df.to_csv("../../../Data_thesis/Full_Datasets/Full.csv", index=False)