# Model Train/Test Split

## Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

## Import File

In [2]:
# Load the full dataset and parse the "Date" strings (YYYY-MM-DD) into datetimes
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d")
)

# Preview the first rows
full_df.head()

Unnamed: 0,Date,Hour,Nieuwmarkt Lat,Nieuwmarkt Lon,Nieuwezijds Kolk Lat,Nieuwezijds Kolk Lon,Dam Lat,Dam Lon,Spui Lat,Spui Lon,...,Nieuwmarkt score,Nieuwmarkt weight,Nieuwezijds Kolk score,Nieuwezijds Kolk weight,Dam score,Dam weight,Spui score,Spui weight,Centraal Station score,Centraal Station weight
0,2018-03-11,100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,0.0,0.999992,0.0,0.999986,102.997272,0.999974,0.0,0.999962,472.991346,0.999982
1,2018-03-11,2100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,346.9982,0.999995,198.99665,0.999983,1266.959412,0.999968,133.994081,0.999956,3859.935862,0.999983
2,2018-03-11,2100,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,346.998829,0.999997,198.995171,0.999976,1266.930956,0.999946,133.98973,0.999923,3859.981463,0.999995
3,2018-03-11,2200,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,325.998532,0.999995,166.996639,0.99998,1019.950244,0.999951,118.991528,0.999929,3301.984735,0.999995
4,2018-03-11,2200,52.371942,4.901239,52.376288,4.893731,52.373127,4.89008,52.369097,4.889259,...,325.998556,0.999996,166.997244,0.999983,1019.960083,0.999961,118.993159,0.999943,3301.971466,0.999991


## Variables

In [5]:
# Training share, between 0 and 1 (presumably passed as the `size` argument of the split functions below)
size = 0.9

## Functions

### Classification Crowdedness Count
Divide the numerical values into four classes based on the quartiles (25th, 50th and 75th percentiles)

In [6]:
def clasCrowdednessCounts(df):
    """
    Divide the numerical counts of crowdedness into 4 classes. These classes are based on the
    quantiles taken over all the values of the "CrowdednessCount" column.

    Class boundaries (q25, q50 and q75 are the 25%, 50% and 75% quantiles):
        - 1: value < q25
        - 2: q25 <= value < q50
        - 3: q50 <= value < q75
        - 4: value >= q75

    Input:
        - df: DataFrame with a numerical "CrowdednessCount" column

    Output:
        - Copy of df in which "CrowdednessCount" holds the class label (1-4). Values that fit
          in none of the classes (e.g. NaN) are reported with a printed message and are left
          unchanged. The given df itself is not modified.
    """

    #Nothing to classify; return early so an empty frame keeps its columns
    if len(df) == 0:
        return df.copy()

    values = df["CrowdednessCount"].to_numpy()

    #Quantile splits
    low_split, mid_split, high_split = df["CrowdednessCount"].quantile([.25, .5, .75])

    #np.select takes the first condition that holds, so every class only needs its upper bound.
    #Works on the whole column at once and does not depend on the index being unique.
    classes = np.select(
        [values < low_split, values < mid_split, values < high_split, values >= high_split],
        [1, 2, 3, 4],
        default=0,
    ).astype("int64")

    #Label 0 marks values for which no comparison holds (e.g. NaN)
    no_class = classes == 0
    if no_class.any():
        for k in df.index[no_class]:
            print (k, " has class error as it fits in none")

        #Keep the original value where no class applies
        classes = np.where(no_class, values, classes)

    result = df.copy()
    result["CrowdednessCount"] = classes

    return result

### Train/Test split Dates
Split the set by date, so that all rows of one day end up on the same side of the split (days won't get split up)

In [1]:
def dateSplit(df, size):
    """
    This function splits the unique dates of the given df into train dates and evaluation dates.

    Input:
        - df: DataFrame that needs to be split into train/test, with a "Date" column
        - size: Size of the training set between 0 and 1 (fraction of the unique dates)

    Output:
        - train_dates: Unique dates selected for training
        - eval_dates: Remaining unique dates, used for evaluation
    """

    #Remove duplicates, so every date is assigned to exactly one of the two sets
    dates = df["Date"].unique()

    #Only train_size is given; sklearn then uses its complement as the test size. Passing
    #test_size=1-size as well is prone to float rounding (e.g. 1-0.7 = 0.30000000000000004),
    #which can make train + test exceed the number of dates and raise a ValueError.
    #The fixed random_state keeps the split reproducible.
    train_dates, eval_dates = train_test_split(dates, train_size=size, random_state=42)

    return train_dates, eval_dates

### Split Set in Train and Test

In [3]:
def trainTestSplit(df, size, stations):
    """
    Split the given df into a training set and an evaluation set, based on dates.

    Input:
        - df: DataFrame with the columns "Date", "CrowdednessCount", "Hour", "Sensor", "Year",
          "SensorLongitude", "SensorLatitude" and, for every station, "<station> Lon" and
          "<station> Lat"
        - size: Size of the training set between 0 and 1 (fraction of the unique dates)
        - stations: Names of the stations whose "<station> Lon" and "<station> Lat" columns
          are dropped

    Output:
        - x_train: Training rows without "CrowdednessCount" ("Date" is kept)
        - y_train: DataFrame with the "Date" and "CrowdednessCount" of the training rows
        - x_eval: Evaluation rows without "Date" and "CrowdednessCount"
        - y_eval: Series with the "CrowdednessCount" of the evaluation rows
        - train_dates: Dates that were selected for training
    """

    #Collect all columns to remove and drop them in one call (the given df is not modified)
    drop_cols = ["Hour", "Sensor", "Year", "SensorLongitude", "SensorLatitude"]
    for station in stations:
        drop_cols += [station + " Lon", station + " Lat"]

    df = df.drop(columns=drop_cols)

    #Split Train/Test based on dates
    train_dates, eval_dates = dateSplit(df, size)

    #drop=True discards the old index directly, whatever its name is
    train_df = df[df["Date"].isin(train_dates)].reset_index(drop=True)
    eval_df = df[df["Date"].isin(eval_dates)].reset_index(drop=True)

    #Train
    #NOTE(review): "Date" stays in the train sets, presumably so they can be split further by date - confirm
    x_train = train_df.drop(["CrowdednessCount"], axis=1)
    y_train = train_df[["Date", "CrowdednessCount"]]

    #Evaluation
    x_eval = eval_df.drop(["Date", "CrowdednessCount"], axis=1)
    y_eval = eval_df["CrowdednessCount"]

    return x_train, y_train, x_eval, y_eval, train_dates

## Test