# Model Train/Test Split

## Imports

In [41]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

## Import File

In [2]:
# Load the full dataset from CSV (the path is relative to the working directory)
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv")
# Parse the "Date" column, written as YYYY-MM-DD, into pandas datetimes
full_df["Date"] = pd.to_datetime(full_df["Date"], format="%Y-%m-%d")

# Show the first rows as a quick sanity check of the loaded data
full_df.head()

Unnamed: 0,Date,Hour,Nieuwmarkt Arrivals,Nieuwezijds Kolk Arrivals,Dam Arrivals,Spui Arrivals,Nieuwmarkt Departures,Nieuwezijds Kolk Departures,Dam Departures,Spui Departures,...,SensorLatitude,CrowdednessCount,is_event,Year,month_sin,month_cos,day_sin,day_cos,hour_sin,hour_cos
0,2018-03-11,100,0.0,0.0,13.0,0.0,0.0,0.0,90.0,0.0,...,5,886,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,0.258819,0.965926
1,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,7,1603,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107
2,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,0,21,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107
3,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,9,88,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107
4,2018-03-11,2100,136.0,80.0,409.0,40.0,211.0,119.0,858.0,94.0,...,2,49,0.0,2018,1.0,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107


## Variables

In [42]:
#Date to split the train/test and evaluation set on
split_date = pd.Timestamp(2018, 12, 31)

#Sensor used for test/train visualization
sensor = "GAWW-02"

#Start/End date evaluation visualization
eval_start_date = pd.Timestamp(2019, 1, 1)
eval_end_date = pd.Timestamp(2019, 1, 31)

#Size of the training set as a fraction between 0 and 1 (presumably passed as `size` to the split functions; confirm at the call site)
size=0.9

## Functions

### Classification Crowdedness Count
Divide the numerical values into four classes based on the quartiles (25th, 50th and 75th percentiles)

In [4]:
def clasCrowdednessCounts(df):
    """
    Replace the numerical crowdedness counts with one of four quartile classes.

    The class boundaries are the 25%, 50% and 75% quantiles taken over all the values of the
    "CrowdednessCount" column:
        - 1: count < 25% quantile
        - 2: 25% quantile <= count < 50% quantile
        - 3: 50% quantile <= count < 75% quantile
        - 4: count >= 75% quantile

    Input:
        - df: DataFrame with a numerical "CrowdednessCount" column

    Output:
        - Copy of df in which "CrowdednessCount" holds the class label; the given df is not changed.
          Rows with a missing count keep a missing value and are reported on standard output.
    """

    counts = df["CrowdednessCount"]

    #Quantile splits (in ascending order, so they can be used as sorted bin edges)
    splits = counts.quantile([.25, .5, .75]).to_numpy()

    #The number of splits that are <= the count, plus one, is the class of that count
    classes = np.searchsorted(splits, counts.to_numpy(), side="right") + 1

    #Missing counts fit in none of the classes: report them and keep them missing
    missing = counts.isna().to_numpy()
    if missing.any():
        for k in df.index[missing]:
            print (k, " has class error as it fits in none")

        #Integer arrays cannot hold NaN
        classes = classes.astype(float)
        classes[missing] = np.nan

    #Work on a copy so the caller's DataFrame stays untouched
    clas_df = df.copy()
    clas_df["CrowdednessCount"] = classes

    return clas_df

### Test Sensor
Select sensor used in the test and evaluation set

In [11]:
def listSensors(df, path):
    """
    Write one row per unique sensor, with its latitude and longitude, to a CSV file.

    Input:
        - df: DataFrame with the columns "Sensor", "SensorLatitude" and "SensorLongitude"
        - path: Location the CSV file is written to
    """

    sensor_columns = ["Sensor", "SensorLatitude", "SensorLongitude"]

    #Keep the first row of every sensor
    unique_sensors = df.drop_duplicates(subset="Sensor").reset_index()

    #Only the sensor columns are saved, without the index
    unique_sensors[sensor_columns].to_csv(path, index=False)

In [12]:
# Possible sensors
# The loop variable is deliberately not named `sensor`: that name holds the sensor
# selected in the Variables section and would otherwise be overwritten by the last
# sensor printed here.
for sensor_name in np.sort(full_df.Sensor.unique()):
    print("Sensor: ", sensor_name)

Sensor:  GAWW-01
Sensor:  GAWW-02
Sensor:  GAWW-03
Sensor:  GAWW-04
Sensor:  GAWW-05
Sensor:  GAWW-06
Sensor:  GAWW-07
Sensor:  GAWW-08
Sensor:  GAWW-09
Sensor:  GAWW-10


In [13]:
def testSensorLat(df, sensor):
    """
    Return the latitude label of the given sensor.

    Input:
        - df: DataFrame with the columns "Sensor" and "SensorLatitude"
        - sensor: Name of the sensor to look up

    Output:
        - "SensorLatitude" value of the first row that belongs to the sensor
    """

    #Rows of the requested sensor, renumbered from 0 so the first one has label 0
    sensor_rows = df[df["Sensor"] == sensor].reset_index()
    latitudes = sensor_rows["SensorLatitude"]

    return latitudes[0]

### Train/Test split Dates
Split the set based on given dates (so days won't get split up)

In [1]:
def dateSplit(df, size):
    """
    Randomly divide the unique dates of the given df into a train and a test group.

    Splitting on whole dates keeps all the rows of a single day in the same group.

    Input:
        - df: DataFrame with a "Date" column
        - size: Fraction of the unique dates assigned to the train group, between 0 and 1

    Output:
        - train_dates: Unique dates selected for training
        - test_dates: Unique dates selected for testing
    """

    #Every date only once, so a day is always assigned as a whole
    unique_dates = df["Date"].unique()

    #The fixed random state keeps the split the same between runs
    date_groups = train_test_split(unique_dates, train_size=size, test_size=1-size, random_state=42)

    return tuple(date_groups)

### Split Set in Train and Test

In [1]:
def trainTestSplit(df, size):
    """
    Split the given df into train and test features and targets, based on whole dates.

    Input:
        - df: DataFrame with at least the columns "Date", "Hour", "Sensor", "Year" and "CrowdednessCount"
        - size: Fraction of the unique dates used for training, between 0 and 1

    Output:
        - x_train: Train features (DataFrame, still contains "Date")
        - y_train: Train target (DataFrame with the columns "Date" and "CrowdednessCount")
        - x_test: Test features (DataFrame, without "Date")
        - y_test: Test target (Series with "CrowdednessCount")
        - train_dates: Dates that were assigned to the train set
    """

    target = "CrowdednessCount"

    #These columns are not used as features
    data = df.drop(columns=["Hour", "Sensor", "Year"])

    #Split Train/Test based on dates
    train_dates, test_dates = dateSplit(data, size)

    in_train = data["Date"].isin(train_dates)
    in_test = data["Date"].isin(test_dates)

    #Renumber the rows of both sets from 0
    train_df = data[in_train].reset_index().drop(columns=["index"])
    test_df = data[in_test].reset_index().drop(columns=["index"])

    #Train: "Date" is kept in both the features and the target
    x_train = train_df.drop([target], axis=1)
    y_train = train_df[["Date", target]]

    #Test: "Date" is removed from the features, the target is a single column
    x_test = test_df.drop([target, "Date"], axis=1)
    y_test = test_df[target]

    return x_train, y_train, x_test, y_test, train_dates

## Test