# Regression Model Train/Test/Eval Split

## Imports

In [4]:
import pandas as pd
import numpy as np

## Import File

In [2]:
# Read the full dataset and parse the "Date" column (stored as YYYY-MM-DD text) into datetimes
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d")
)

# Preview the first rows
full_df.head()

Unnamed: 0,Date,Hour,Nieuwmarkt Arrivals,Nieuwezijds Kolk Arrivals,Dam Arrivals,Spui Arrivals,Nieuwmarkt Departures,Nieuwezijds Kolk Departures,Dam Departures,Spui Departures,...,SensorLatitude,CrowdednessCount,is_event,Year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos
0,2018-03-11,2400,77.0,38.0,187.0,40.0,198.0,63.0,411.0,87.0,...,6,0,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-2.449294e-16,1.0
1,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,0,21,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,0.707107
2,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,9,88,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,0.707107
3,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,2,49,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.7071068,0.707107
4,2018-03-11,2200,116.0,64.0,288.0,21.0,210.0,103.0,732.0,98.0,...,8,2900,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.5,0.866025


## Variables

In [37]:
# Last date that still belongs to the train/test set; rows with a later date form the evaluation set
split_date = pd.Timestamp("2018-12-31")

# Sensor used for test/train visualization
sensor = "GAWW-02"

# First and last date (both inclusive) of the evaluation visualization
eval_start_date = pd.Timestamp("2019-01-01")
eval_end_date = pd.Timestamp("2019-01-31")

# Size of the training set as a fraction between 0 and 1
size = 0.9

## Functions

### Test Sensor
Select the sensor that is used for the test and evaluation sets.

In [5]:
# Possible sensors
# A dedicated loop variable is used on purpose: looping with the name `sensor`
# would overwrite the `sensor` chosen in the Variables section with the last
# sensor of the list.
for sensor_name in np.sort(full_df.Sensor.unique()):
    print("Sensor: ", sensor_name)

Sensor:  GAWW-01
Sensor:  GAWW-02
Sensor:  GAWW-03
Sensor:  GAWW-04
Sensor:  GAWW-05
Sensor:  GAWW-06
Sensor:  GAWW-07
Sensor:  GAWW-08
Sensor:  GAWW-09
Sensor:  GAWW-10


In [24]:
def testSensorLat(df, sensor):
    """
    Select the unique label of the latitude for the given sensor and return this
    """
    return df[df["Sensor"] == sensor].reset_index()["SensorLatitude"][0]

In [35]:
# Show the value currently bound to `sensor` (the name used for the latitude lookup in the next cell)
print(sensor)

Gaww-02


In [40]:
# Latitude label of the selected sensor, looked up through the helper defined above
testSensorLat(full_df, sensor)

6

### Train/Test split Dates
Split the set on whole dates, so that all rows of a given day end up on the same side of the train/test split.

In [42]:
def dateSplit(df, size):
    """
    This function splits the given df based on given dates. 
    
    Input:
        - df: DataFrame that needs to be split into train/test
        - size: Size of the training test between 0 and 1
    """
    
    dates = df["Date"].values
    np.random.shuffle(dates)
    split = int(dates.size * size)
    
    train_dates = dates[:split]
    test_dates = dates[split:]
    
    return train_dates, test_dates

### Split Set in Train, Test, and Eval

In [28]:
def trainTestSplit(df, size, split_date, sensor):
    """
    Build the regression train and test sets from all rows dated on or before split_date.

    Input:
        - df: Full DataFrame (needs the columns "Date", "Hour", "Sensor", "Year",
              "SensorLatitude" and "CrowdednessCount")
        - size: Size of the training set between 0 and 1, passed on to dateSplit
        - split_date: Last date that is part of the train/test set
        - sensor: Sensor whose rows are used for the single-day test series

    Output:
        - x_train_reg, y_train_reg: Features and target of the training set
        - x_test_reg, y_test_reg: Features and target of the test set
        - x_test_reg_series, y_test_reg_series: Features and target of the test rows
          of the given sensor on the first date in the test dates
        - feature_labels: Array with the column names of the training features
    """
    target = "CrowdednessCount"

    # Keep only the train/test period and remove the columns that are not used as features
    before_split = df["Date"] <= split_date
    pool = df[before_split].reset_index().drop(columns=["index", "Hour", "Sensor", "Year"])
    sensor_lat = testSensorLat(df, sensor)

    # Split Train/Test based on dates
    train_dates, test_dates = dateSplit(pool, size)

    in_train = pool["Date"].isin(train_dates)
    train_part = pool[in_train].reset_index().drop(columns=["Date", "index"])
    in_test = pool["Date"].isin(test_dates)
    test_part = pool[in_test].reset_index().drop(columns=["index"])

    # Train
    x_train_reg = train_part.drop([target], axis=1)
    y_train_reg = train_part[target]

    # Test ("Date" is still present in test_part because the series filter below needs it)
    x_test_reg = test_part.drop([target, "Date"], axis=1)
    y_test_reg = test_part[target]

    # Series: rows of the chosen sensor on the first test date
    same_sensor = test_part["SensorLatitude"] == sensor_lat
    first_test_day = test_part["Date"] == test_dates[0]
    day_series = test_part[same_sensor & first_test_day].reset_index()
    x_test_reg_series = day_series.drop([target, "Date", "index"], axis=1)
    y_test_reg_series = day_series[target]

    feature_labels = x_train_reg.columns.values

    return (x_train_reg, y_train_reg, x_test_reg, y_test_reg,
            x_test_reg_series, y_test_reg_series, feature_labels)

In [44]:
def evalSplit(df, split_date, sensor, eval_start_date, eval_end_date):
    """
    Build the evaluation set from all rows dated after split_date.

    Input:
        - df: Full DataFrame (needs the columns "Date", "Hour", "Sensor", "Year",
              "SensorLatitude" and "CrowdednessCount")
        - split_date: Last date of the train/test set; only later rows are evaluated
        - sensor: Sensor whose rows are used for the evaluation time series
        - eval_start_date: First date (inclusive) of the evaluation time series
        - eval_end_date: Last date (inclusive) of the evaluation time series

    Output:
        - x_eval_reg, y_eval_reg: Features and target of the whole evaluation set
        - x_series_reg, y_series_reg: Features and target of the rows of the given
          sensor between eval_start_date and eval_end_date
    """

    Eval_df = df[df["Date"] > split_date].reset_index().drop(columns=["index", "Hour", "Sensor", "Year"])
    test_lat = testSensorLat(df, sensor)

    #Evaluation set
    # Built from Eval_df, not df: df would also contain the rows on or before
    # split_date and the "Hour", "Sensor" and "Year" columns that were dropped above.
    x_eval_reg = Eval_df.drop(["CrowdednessCount", "Date"], axis=1)
    y_eval_reg = Eval_df["CrowdednessCount"]

    #Subset timeseries
    sub_series = Eval_df[(Eval_df["SensorLatitude"] == test_lat) &
                         (Eval_df["Date"] >= eval_start_date) &
                         (Eval_df["Date"] <= eval_end_date)].reset_index()

    #Time series
    x_series_reg = sub_series.drop(["Date", "CrowdednessCount", "index"], axis=1)
    y_series_reg = sub_series["CrowdednessCount"]

    return x_eval_reg, y_eval_reg, x_series_reg, y_series_reg

## Test