# Model Prediction

## Imports

In [123]:
import pandas as pd
import pickle
import numpy as np
import plotly
from bubbly import bubbleplot

## Import Files

In [63]:
# Load the combined dataset (per-sensor crowdedness counts plus the station
# score/weight columns). The path is relative to the notebook's working directory.
full_df = pd.read_csv("../../../Data_thesis/Full_Datasets/Full.csv")

In [64]:
# Preview the first rows to check that the file was read with the expected columns.
full_df.head()

Unnamed: 0,Date,Hour,weekday,is_weekend,Sensor,SensorLongitude,SensorLatitude,CrowdednessCount,LonScaled,LatScaled,...,Nieuwmarkt score,Nieuwmarkt weight,Nieuwezijds Kolk score,Nieuwezijds Kolk weight,Dam score,Dam weight,Spui score,Spui weight,Centraal Station score,Centraal Station weight
0,2018-03-11,100,6.0,1.0,GAWW-04,4.897908,52.373283,886,-0.558826,0.035969,...,0.0,0.978944,0.0,0.616678,-32.179221,-0.31242,-0.0,-1.140386,297.734037,0.629459
1,2018-03-11,2100,6.0,1.0,GAWW-07,4.900441,52.374414,1603,1.966352,1.066966,...,396.612265,1.142975,5.596253,0.028122,-2008.443214,-1.585196,-370.823855,-2.767342,4117.774978,1.066781
2,2018-03-11,2100,6.0,1.0,GAWW-08,4.897193,52.37165,21,-1.271546,-1.4525,...,306.731105,0.883951,85.461513,0.429455,-107.617256,-0.084939,-71.079754,-0.530446,262.90113,0.068109
3,2018-03-11,2100,6.0,1.0,GAWW-09,4.898479,52.37504,88,0.011331,1.637575,...,299.730494,0.863777,135.285034,0.679824,-832.481652,-0.657049,-252.977149,-1.887889,3940.747665,1.020919
4,2018-03-11,2100,6.0,1.0,GAWW-10,4.898808,52.372369,49,0.338729,-0.797289,...,402.696352,1.160508,44.878993,0.225523,-918.302668,-0.724785,-187.014683,-1.395632,2000.274786,0.518206


## Predefine Data

In [109]:
# Distinct sensor ids, in order of first appearance in the dataset.
sensors = full_df["Sensor"].unique()
# Stations whose "<name> score" / "<name> weight" columns are used as features.
stations = ["Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui", "Centraal Station"]

# sensor id -> scaled and raw coordinates of that sensor
sensor_dict = {}
# sensor id -> station name -> {"Weight": ..., "Score": ...}
station_dict = {}

for sensor in sensors:
    # Filter the dataset once per sensor instead of once per looked-up column.
    sensor_rows = full_df[full_df["Sensor"] == sensor]

    # Coordinates are read from the sensor's first row.
    sensor_dict[sensor] = {"LonScaled": sensor_rows["LonScaled"].iloc[0],
                           "LatScaled": sensor_rows["LatScaled"].iloc[0],
                           "Longitude": sensor_rows["SensorLongitude"].iloc[0],
                           "Latitude": sensor_rows["SensorLatitude"].iloc[0]}

    # Weight comes from the sensor's first row; Score is the mean over all of
    # the sensor's rows.
    station_dict[sensor] = {
        station: {"Weight": sensor_rows[station + " weight"].iloc[0],
                  "Score": sensor_rows[station + " score"].mean()}
        for station in stations
    }

In [110]:
# Spot-check the per-station weight/score entries built for one sensor.
station_dict["GAWW-04"]

{'Nieuwmarkt': {'Weight': 0.9789436823650742, 'Score': 375.8377017807784},
 'Nieuwezijds Kolk': {'Weight': 0.616678355133574, 'Score': 130.4541260678912},
 'Dam': {'Weight': -0.3124196256695315, 'Score': -343.8617820925388},
 'Spui': {'Weight': -1.1403862353816183, 'Score': -209.28667678187907},
 'Centraal Station': {'Weight': 0.6294588524658996,
  'Score': 3238.175229561471}}

## Import Models

### Regression

In [56]:
# Path to the pickled `rfg` regression model
rfg_filename = "../../../Data_thesis/Models/rfg_model.sav"

# Path to the pickled `xgbr` regression model
xgbr_filename = "../../../Data_thesis/Models/xgbr_model.sav"

In [57]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# `with` closes each file handle as soon as its model has been deserialised.
with open(rfg_filename, 'rb') as model_file:
    rfg = pickle.load(model_file)

with open(xgbr_filename, 'rb') as model_file:
    xgbr = pickle.load(model_file)

### Classification

In [58]:
# Path to the pickled `rfc` classification model
rfc_filename = "../../../Data_thesis/Models/rfc_model.sav"

# Path to the pickled `xgbc` classification model
xgbc_filename = "../../../Data_thesis/Models/xgbc_model.sav"

In [59]:
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load model files that come from a trusted source.
# `with` closes each file handle as soon as its model has been deserialised.
with open(rfc_filename, 'rb') as model_file:
    rfc = pickle.load(model_file)

with open(xgbc_filename, 'rb') as model_file:
    xgbc = pickle.load(model_file)

## Functions

In [100]:
def TransformDate(date):
    """Derive the calendar features of a single date.

    Parameters
    ----------
    date : object exposing ``weekday()``, ``month`` and ``day``
        For example a ``pd.Timestamp``.

    Returns
    -------
    tuple
        ``(weekday, is_weekend, time)`` where ``weekday`` is the value of
        ``date.weekday()``, ``is_weekend`` is 1 when that value is 5 or 6 and
        0 otherwise, and ``time`` is the dictionary returned by
        ``TransformTime(date)``.
    """
    day_of_week = date.weekday()

    # weekday() values 5 and 6 are treated as the weekend.
    weekend_flag = 1 if day_of_week in (5, 6) else 0

    return day_of_week, weekend_flag, TransformTime(date)

In [165]:
def TransformTime(date):
    """Build the sine/cosine time features for one date.

    Parameters
    ----------
    date : object exposing ``month`` and ``day``
        For example a ``pd.Timestamp``.

    Returns
    -------
    dict
        ``"Month Sin"`` / ``"Month Cos"`` and ``"Day Sin"`` / ``"Day Cos"`` are
        scalars. ``"Hour Sin"``, ``"Hour Cos"`` and ``"Hour"`` are parallel
        lists with one entry per hour value 100, 200, ..., 2400.
    """
    full_turn = 2 * np.pi

    month_angle = full_turn * date.month / 12
    # NOTE(review): `date.day` is the day of the month, yet the period used is
    # 365 - confirm this matches the encoding the models were trained on.
    day_angle = full_turn * date.day / 365

    # Hours are encoded as 100..2400 (HHMM-style integers), 24 values in total.
    hours = list(range(100, 2401, 100))
    hour_angles = [full_turn * hour / 2400 for hour in hours]

    features = {
        "Month Sin": np.sin(month_angle),
        "Month Cos": np.cos(month_angle),
        "Day Sin": np.sin(day_angle),
        "Day Cos": np.cos(day_angle),
        "Hour Sin": [np.sin(angle) for angle in hour_angles],
        "Hour Cos": [np.cos(angle) for angle in hour_angles],
        "Hour": hours,
    }
    return features

In [114]:
def SelectSensor(sensor, sensor_dict, station_dict):
    """Look up the stored coordinates and station data of one sensor.

    Parameters
    ----------
    sensor : hashable
        Key identifying the sensor in both dictionaries.
    sensor_dict : dict
        Maps a sensor to a dict with the keys ``"LatScaled"``, ``"LonScaled"``,
        ``"Latitude"`` and ``"Longitude"``.
    station_dict : dict
        Maps a sensor to its station data.

    Returns
    -------
    tuple
        ``(lat_scaled, lon_scaled, station_data, lat, lon)`` - note that the
        scaled pair comes first and latitude precedes longitude in both pairs.
    """
    coordinates = sensor_dict[sensor]

    scaled_latitude = coordinates["LatScaled"]
    scaled_longitude = coordinates["LonScaled"]
    raw_latitude = coordinates["Latitude"]
    raw_longitude = coordinates["Longitude"]

    return scaled_latitude, scaled_longitude, station_dict[sensor], raw_latitude, raw_longitude

In [116]:
def combineData(dates, sensors, sensor_dict, station_dict):
    """Build one model-input row per (sensor, date, hour) combination.

    The previous implementation special-cased ``len(sensors) == 1`` and
    ``len(dates) == 1`` but then passed the whole collection to
    ``SelectSensor`` / ``TransformDate``, so every single-sensor or
    single-date call raised. All cases now go through the same loop.

    Parameters
    ----------
    dates : date-like or sized iterable of date-likes
        Dates to generate rows for. A single date object is also accepted.
    sensors : str or iterable of str
        Sensor ids (keys of ``sensor_dict`` / ``station_dict``). A single id
        string is also accepted.
    sensor_dict : dict
        Sensor id -> coordinate dict, as consumed by ``SelectSensor``.
    station_dict : dict
        Sensor id -> station name -> ``{"Weight": ..., "Score": ...}``.

    Returns
    -------
    pandas.DataFrame
        Rows ordered by sensor, then date, then hour, indexed 0..n-1. Columns
        are the feature columns followed by ``Sensor``, ``Date``,
        ``SensorLongitude`` and ``SensorLatitude``.
    """
    import datetime

    # Accept a lone sensor id / lone date as well as collections of them.
    if isinstance(sensors, str):
        sensors = [sensors]
    if isinstance(dates, datetime.date):
        dates = [dates]

    # Stations contributing a "<name> score" / "<name> weight" feature pair;
    # the order fixes the column order of the resulting frame.
    station_names = ("Nieuwmarkt", "Nieuwezijds Kolk", "Dam", "Spui", "Centraal Station")

    # Date features do not depend on the sensor, so compute them once per date.
    date_features = [(date, TransformDate(date)) for date in dates]

    rows = {}
    row_id = 0

    for sensor in sensors:
        sensor_lat, sensor_lon, station_data, lat, lon = SelectSensor(sensor, sensor_dict, station_dict)

        for date, (weekday, is_weekend, time) in date_features:
            for i in range(len(time["Hour Sin"])):
                row = {"weekday": weekday, "is_weekend": is_weekend, "LonScaled": sensor_lon,
                       "LatScaled": sensor_lat, "is_event": 0.0, "month_sin": time["Month Sin"],
                       "month_cos": time["Month Cos"], "day_sin": time["Day Sin"],
                       "day_cos": time["Day Cos"], "hour_sin": time["Hour Sin"][i],
                       "hour_cos": time["Hour Cos"][i], "hour": time["Hour"][i]}

                for station in station_names:
                    row[station + " score"] = station_data[station]["Score"]
                    row[station + " weight"] = station_data[station]["Weight"]

                # Identification columns; not part of the feature set.
                row["Sensor"] = sensor
                row["Date"] = date
                row["SensorLongitude"] = lon
                row["SensorLatitude"] = lat

                rows[row_id] = row
                row_id += 1

    return pd.DataFrame.from_dict(rows, orient="index")

In [150]:
def generateDates(start_date, end_date):
    """List the consecutive days from ``start_date`` up to, but not including, ``end_date``.

    Parameters
    ----------
    start_date, end_date : pd.Timestamp
        Their difference must expose a ``days`` attribute.

    Returns
    -------
    list
        ``start_date`` plus 0, 1, ..., ``(end_date - start_date).days - 1``
        days. Empty when the difference is zero or negative.
    """
    span_days = (end_date - start_date).days

    return [start_date + pd.Timedelta(offset, unit="D") for offset in range(span_days)]

In [189]:
def plotTimeSeries(df, date):
    """Write a bubble plot of the crowdedness counts of one date to an HTML file.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SensorLongitude``, ``SensorLatitude``,
        ``Sensor``, ``Hour`` and ``CrowdednessCount``.
    date : pd.Timestamp or datetime-like
        Date the data belongs to; used in the plot title and the file name.

    Returns
    -------
    None
        The figure is saved as ``<YYYY-MM-DD>_plot.html`` in the ``Plots``
        directory and is not opened automatically.
    """
    # Column roles handed to bubbleplot.
    # NOTE(review): presumably `time_column` drives one frame per hour -
    # confirm against the bubbly documentation.
    x_column = 'SensorLongitude'
    y_column = 'SensorLatitude'
    bubble_column = 'Sensor'
    time_column = 'Hour'
    size_column = 'CrowdednessCount'
    str_date = date.strftime("%Y-%m-%d")

    # Pad the axis ranges slightly beyond the outermost sensor coordinates.
    x_range = [min(df[x_column])-0.001, max(df[x_column])+0.001]
    y_range = [min(df[y_column])-0.001, max(df[y_column])+0.001]

    figure = bubbleplot(dataset=df, x_column=x_column, y_column=y_column,
        bubble_column=bubble_column, size_column=size_column, time_column=time_column, color_column=bubble_column,
        x_title="Sensor Longitude", y_title="Sensor Latitude", title='Crowdedness Counts Amsterdam - ' + str_date,
        x_logscale=False, scale_bubble=3, height=650, x_range=x_range, y_range=y_range)

    plotly.offline.plot(figure, filename="../../../Data_thesis/Full_Datasets/Plots/{0}_plot.html".format(str_date),
                       auto_open=False)

In [183]:
def prediction(start_date, end_date, sensors, model, sensor_dict, station_dict):
    """Predict crowdedness for every sensor, date and hour, and plot each date.

    Parameters
    ----------
    start_date, end_date : pd.Timestamp
        Date range passed to ``generateDates``; ``end_date`` itself is not
        included.
    sensors : iterable of str
        Sensor ids to predict for.
    model : object with a ``predict`` method
        Receives the feature frame and returns one prediction per row.
        NOTE(review): the feature columns are passed in the order produced
        by ``combineData`` - confirm this matches the order used in training.
    sensor_dict, station_dict : dict
        Per-sensor lookup tables forwarded to ``combineData``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Date``, ``Hour``, ``Sensor``, ``SensorLongitude``,
        ``SensorLatitude`` and ``CrowdednessCount``. ``Hour`` keeps the
        original 100..2400 values; only the plotted copies are remapped.
    """
    id_columns = ["Date", "hour", "Sensor", "SensorLongitude", "SensorLatitude"]

    dates = generateDates(start_date, end_date)
    df = combineData(dates, sensors, sensor_dict, station_dict)

    # Everything except "Date" is dropped from the model input; "hour" is
    # represented by its sine/cosine features instead.
    input_df = df.drop(columns=["hour", "Sensor", "Date", "SensorLongitude", "SensorLatitude"])

    predict_df = df[id_columns].rename(columns={"hour": "Hour"})
    predict_df["CrowdednessCount"] = model.predict(input_df)

    for date in dates:
        series_df = predict_df.loc[predict_df["Date"] == date].drop(columns=["Date"])

        # Map hour 2400 to 0 in the Hour column only. A frame-wide replace
        # would also zero any predicted count that happens to equal 2400.
        series_df["Hour"] = series_df["Hour"].replace(2400, 0)
        series_df = series_df.sort_values(by=["Hour", "Sensor"])

        plotTimeSeries(series_df, date)

    return predict_df

## Test Functions

In [190]:
# Predict for every sensor present in the dataset, using the `rfg` model.
sensors = full_df["Sensor"].unique()
model = rfg
# generateDates iterates over range(delta.days), so end_date itself is excluded:
# this run covers 2019-01-01 through 2019-01-05.
start_date = pd.Timestamp(2019, 1, 1)
end_date = pd.Timestamp(2019, 1, 6)

# Side effect: prediction() also writes one HTML plot per date via plotTimeSeries.
df = prediction(start_date, end_date, sensors, model, sensor_dict, station_dict)

In [191]:
# Preview the first predicted rows.
df.head()

Unnamed: 0,Date,Hour,Sensor,SensorLongitude,SensorLatitude,CrowdednessCount
0,2019-01-01,100,GAWW-04,4.897908,52.373283,1691.9025
1,2019-01-01,200,GAWW-04,4.897908,52.373283,1810.0225
2,2019-01-01,300,GAWW-04,4.897908,52.373283,1034.1675
3,2019-01-01,400,GAWW-04,4.897908,52.373283,724.7325
4,2019-01-01,500,GAWW-04,4.897908,52.373283,432.2025


In [192]:
# Persist the predictions; index=False keeps the row index out of the file.
df.to_csv("../../../Data_thesis/Full_Datasets/Predictions.csv", index=False)