# Model Train/Test Split

## Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

## Import File

In [3]:
# Load the full data set from CSV (the path is relative to the working directory).
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv")
# Parse the "Date" column from YYYY-MM-DD strings into datetime values.
full_df["Date"] = pd.to_datetime(full_df["Date"], format="%Y-%m-%d")

# Preview the first rows of the loaded frame.
full_df.head()

Unnamed: 0,Date,Hour,weekday,is_weekend,Sensor,SensorLongitude,SensorLatitude,CrowdednessCount,Lon_4.8971927,Lon_4.8973336,...,month_cos,day_sin,day_cos,hour_sin,hour_cos,Nieuwmarkt score,Nieuwezijds Kolk score,Dam score,Spui score,Centraal Station score
0,2018-03-11,100,6.0,1.0,GAWW-04,4.897908,52.373283,886,0,0,...,6.123234000000001e-17,0.188227,0.982126,0.258819,0.965926,0.0,0.0,102.996844,0.0,472.993853
1,2018-03-11,2100,6.0,1.0,GAWW-07,4.900441,52.374414,1603,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998829,198.995171,1266.930956,133.98973,3859.981463
2,2018-03-11,2100,6.0,1.0,GAWW-08,4.897193,52.37165,21,1,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997145,198.996668,1266.966573,133.995346,3859.909232
3,2018-03-11,2100,6.0,1.0,GAWW-09,4.898479,52.37504,88,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.997014,198.997601,1266.952991,133.991938,3859.978146
4,2018-03-11,2100,6.0,1.0,GAWW-10,4.898808,52.372369,49,0,0,...,6.123234000000001e-17,0.188227,0.982126,-0.707107,0.707107,346.998943,198.995907,1266.951383,133.993174,3859.941786


## Variables

In [5]:
# Presumably the fraction of dates assigned to the training set, passed as the
# `size` argument of the split functions below -- TODO confirm against the call site.
size=0.9

## Functions

### Classification Crowdedness Count
Divide the numerical crowdedness counts into four classes, using the quartiles (25th, 50th and 75th percentiles) as class boundaries.

In [6]:
def clasCrowdednessCounts(df):
    """
    Divide the numerical counts of crowdedness into 4 classes. These classes are based on the
    quartiles taken over all the values of the "CrowdednessCount" column.

    Classes:
        - 1: count < 25% quantile
        - 2: 25% quantile <= count < 50% quantile
        - 3: 50% quantile <= count < 75% quantile
        - 4: count >= 75% quantile

    Input:
        - df: DataFrame with a numerical "CrowdednessCount" column

    Returns:
        A copy of df in which "CrowdednessCount" holds the class label instead of the count.
        Values that fit in no class (e.g. NaN) are reported with a printed message and left
        unchanged. The given df itself is not modified.
    """

    counts = df["CrowdednessCount"]

    #Quantile splits
    low_split = counts.quantile(.25)
    mid_split = counts.quantile(.5)
    high_split = counts.quantile(.75)

    #np.select takes the first matching condition, so each upper bound implies the
    #lower bound of the class (the previous condition did not match)
    conditions = [
        (counts < low_split).to_numpy(),
        (counts < mid_split).to_numpy(),
        (counts < high_split).to_numpy(),
        (counts >= high_split).to_numpy(),
    ]
    labels = np.select(conditions, [1, 2, 3, 4], default=0)

    #Label 0 marks values for which every comparison was False (e.g. NaN)
    unmatched = labels == 0

    result = df.copy()

    if unmatched.any():
        for k in df.index[unmatched]:
            print (k, " has class error as it fits in none")

        #Keep the original value where no class fits
        result["CrowdednessCount"] = counts.where(unmatched, labels)
    else:
        result["CrowdednessCount"] = labels

    return result

### Train/Test split Dates
Split the data set by date, so that all rows of a single day end up in the same subset (days are never split up).

In [7]:
def dateSplit(df, size):
    """
    Split the unique dates of the given df into train, test and evaluation dates.

    The dates are first divided into a training part and a held-out part; the
    held-out part is then divided in half into evaluation and test dates. Both
    splits use a fixed random_state of 42.

    Input:
        - df: DataFrame with a "Date" column that needs to be split into train/test
        - size: Size of the training set, between 0 and 1 (fraction of the unique dates)

    Returns:
        - train_dates, test_dates, eval_dates (in that order)
    """

    unique_dates = df["Date"].unique()
    holdout_size = 1-size

    #Training dates versus held-out dates
    train_dates, holdout_dates = train_test_split(
        unique_dates, train_size=size, test_size=holdout_size, random_state=42
    )

    #Held-out dates are shared equally between evaluation and test
    eval_dates, test_dates = train_test_split(
        holdout_dates, train_size=0.5, test_size=0.5, random_state=42
    )

    return train_dates, test_dates, eval_dates

### Split Set in Train and Test

In [9]:
def trainTestSplit(df, size):
    """
    Split the given df into train, test and evaluation sets based on dates, and separate
    the features from the "CrowdednessCount" target.

    Input:
        - df: DataFrame with (at least) the columns "Date", "CrowdednessCount", "Hour",
          "Sensor", "Year", "SensorLongitude" and "SensorLatitude"
        - size: Size of the training set between 0 and 1, passed on to dateSplit

    Returns:
        - x_train: training features (still contains the "Date" column)
        - y_train: DataFrame with the "Date" and "CrowdednessCount" columns of the training rows
        - x_test: test features (without "Date")
        - y_test: "CrowdednessCount" of the test rows
        - x_eval: evaluation features (without "Date")
        - y_eval: "CrowdednessCount" of the evaluation rows
        - train_dates: the dates that make up the training set
    """

    df = df.drop(columns=["Hour", "Sensor", "Year", "SensorLongitude", "SensorLatitude"])

    #Split Train/Test based on dates
    train_dates, test_dates, eval_dates = dateSplit(df, size)

    def _rows_for(dates):
        #Rows whose date is in the given dates, renumbered from 0. drop=True discards
        #the old index directly instead of adding it as a column and dropping it again
        return df[df["Date"].isin(dates)].reset_index(drop=True)

    train_df = _rows_for(train_dates)
    test_df = _rows_for(test_dates)
    eval_df = _rows_for(eval_dates)

    #Train ("Date" is kept in both x and y here, unlike the test and evaluation sets)
    x_train = train_df.drop(["CrowdednessCount"], axis=1)
    y_train = train_df[["Date", "CrowdednessCount"]]

    #Test
    x_test = test_df.drop(["CrowdednessCount", "Date"], axis=1)
    y_test = test_df["CrowdednessCount"]

    #Evaluation
    x_eval = eval_df.drop(["CrowdednessCount", "Date"], axis=1)
    y_eval = eval_df["CrowdednessCount"]

    return x_train, y_train, x_test, y_test, x_eval, y_eval, train_dates

## Test