# Preliminary operations

In [31]:
import pandas as pd
import numpy as np
import copy
import pickle
import os
import itertools
import glob
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

from math import sqrt
from scipy import stats
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.ndimage import gaussian_filter1d

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector
from tensorflow.keras.layers import Dense, Flatten, Reshape, Dropout
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Fixed seed passed to Keras so repeated runs of the notebook are comparable.
seed = 123
tf.keras.utils.set_random_seed(seed)
# Registry of processed-dataset CSV paths; the preprocessing cell appends to it
# and the preparation cell uses it to select which files to load.
df_path_lstm = []

In [33]:
# Execute utils.ipynb so the names it defines are available in this notebook.
# NOTE(review): presumably the source of save_predict_plots used in the training cell — confirm.
%run utils.ipynb

In [34]:
# Execute preprocessing.ipynb so the names it defines are available in this notebook.
# NOTE(review): presumably the source of preprocessing() used in the grid-search cell — confirm.
%run preprocessing.ipynb

In [35]:
# Execute preparation.ipynb so the names it defines are available in this notebook.
# NOTE(review): presumably the source of preparation() used in the preparation cell — confirm.
%run preparation.ipynb

In [37]:
# Execute training_lstm.ipynb so the names it defines are available in this notebook.
# NOTE(review): presumably the source of training_lstm() used in the training cell — confirm.
%run training_lstm.ipynb

In [38]:
# Execute predict.ipynb so the names it defines are available in this notebook.
# NOTE(review): presumably the source of predict() used in the training cell — confirm.
%run predict.ipynb

In [39]:
# Load the raw indicator table and keep only the feature columns
# (the timestamp column is discarded).
data_path = 'datasets/raw/sf_normal_final_indicators_93600.csv'
df = pd.read_csv(data_path).drop(columns = 'timestamp')
# Number of feature columns left after removing the timestamp.
num_features = df.shape[1]

In [40]:
# Display the feature column names that remain after dropping 'timestamp'.
df.columns

Index(['rides_requested', 'rides_canceled', 'rides_not_served',
       'rides_accepted', 'rides_rejections', 'idle_drivers',
       'responding_drivers', 'pickup_drivers', 'on_road_drivers',
       'moving_drivers', 'pending_customers', 'avg_rejections_before_accepted',
       'avg_surge_multiplier', 'avg_expected_price_per_km',
       'avg_real_price_per_km', 'avg_diff_price_per_km',
       'avg_current_error_ride_distance', 'avg_speed_max_speed',
       'avg_speed_kmh', 'avg_diff_duration_min', 'avg_expected_ride_time_min',
       'avg_expected_meeting_time_min', 'avg_expected_total_time_min',
       'avg_ride_time_min', 'avg_meeting_time_min', 'avg_total_time_min',
       'avg_ride_length_km', 'avg_meeting_length_km', 'avg_total_length_km',
       'avg_distance_per_timestamp_km', 'avg_remaining_distance_covered'],
      dtype='object')

In [44]:
# True if at least one cell of the raw table is missing.
df.isnull().values.any()

False

# Choosing preprocessing strategy - LSTM

In [None]:
# Candidate values for every preprocessing option explored by the grid search.
ratio_values = [True, False]
smoothing_values = [True, False]
collinearity_values = [True, False]
log_values = [True, False]
aggregation_values = [100, 200, 300, 500]
statistics_values = [True, False]
differencing_values = [1]
seasonality_values = [True, False]
normalize_values = [True]

# Single source of truth for the parameter order: it drives the order of the
# values inside each combination, the printed summary, the keyword arguments
# passed to preprocessing() and the fields of the generated dataset name.
param_grid = {
    'ratio': ratio_values,
    'smoothing': smoothing_values,
    'collinearity': collinearity_values,
    'log': log_values,
    'aggregation': aggregation_values,
    'statistics': statistics_values,
    'differencing': differencing_values,
    'seasonality': seasonality_values,
    'normalize': normalize_values,
}

# Generate all combinations of parameter values. The grid is materialised so
# the total is counted from the combinations themselves and cannot drift out
# of sync with the parameter lists (the hand-written product previously left
# out smoothing_values and reported half of the real total).
experiments = list(itertools.product(*param_grid.values()))
tot_experiments = len(experiments)

warnings.filterwarnings("ignore", category = RuntimeWarning)
for count, combination in enumerate(experiments, start = 1):
    params = dict(zip(param_grid, combination))
    print(f"Experiment {count}/{tot_experiments}")
    print(', '.join(f"{key}: {value}" for key, value in params.items()))

    # Dataset name: 'df_' followed by the parameter values in grid order.
    df_path = 'df_' + '_'.join(str(value) for value in combination)
    csv_path = 'datasets/proc/' + df_path + '.csv'

    # Compare against the same string that is stored in the registry, so that
    # re-running this cell does not register or preprocess a combination twice.
    if csv_path in df_path_lstm:
        continue
    df_path_lstm.append(csv_path)

    # Work on a copy so every experiment starts from the untouched raw table.
    df_ = copy.deepcopy(df)
    # NOTE(review): save = True is presumably what writes csv_path to disk for
    # the preparation cell to pick up — confirm in preprocessing.ipynb.
    df_proc = preprocessing(df_,
                            **params,
                            load_scaler = False,
                            save = True)

## Preparation

In [None]:
# Load every processed dataset registered in df_path_lstm and turn it into the
# structure returned by preparation(), keyed by the dataset name.
pattern = os.path.join('datasets/proc', 'df*.csv')
csv_files = glob.glob(pattern)
data_prep_dict = {}

# glob builds its results with the platform separator while the registry
# entries are written with '/', so compare normalised paths; otherwise nothing
# matches on Windows.
registered_paths = {os.path.normpath(path) for path in df_path_lstm}

count = 1
# glob returns matches in arbitrary order; sort them so the experiments are
# always prepared (and later trained) in the same order.
for file_path in sorted(csv_files):
    if os.path.normpath(file_path) not in registered_paths:
        continue
    print(str(count) + ': ' + file_path)
    # Dataset name = file name without directory and extension.
    filename = os.path.splitext(os.path.basename(file_path))[0]
    data_prep = pd.read_csv(file_path)
    data_prep_dict[filename] = preparation(data_prep,
                                           window_size = 20,
                                           overlap = 0,
                                           val_hours = 1.5)
    count += 1

## Training

In [None]:
# For every prepared dataset: train a model with training_lstm(), run predict()
# on the second element of the prepared data, compute the error metrics and
# append the MAE to a CSV log on disk.
mae_list = []
name_list = []

csv_file_path = 'processing_metrics_lstm.csv'

for name, data_prep in data_prep_dict.items():
    # Experiment label without the 'df_' prefix; used for printing and logging.
    short_name = name.replace('df_', '')

    # Training
    history, model = training_lstm(
        data_prep[0],
        data_prep[1],
        n_layers_encoder = 2,
        n_layers_decoder = 2,
        hidden_units = [128, 64, 32, 32, 64, 128],
        activation_hidden = 'tanh',
        activation_dense = 'sigmoid',
        dropout_rate = 0,
        learning_rate = 0.0001,
        n_epochs = 500,
        metric = 'mse',
        batch_size = 32,
        window_size = 20,
        plot = True,
        save = False,
    )

    # Predicting
    original, reconstructed = predict(model, data_prep[1], name, recurrent = True)

    # Save plots
    print(short_name + ':')
    # Never ask for more features than the raw table has column names for.
    num_features = min(data_prep[1].shape[2], len(df.columns))
    mae, rmse, mape = save_predict_plots(
        original,
        reconstructed,
        df.columns,
        num_features,
        name,
        model_type = 'lstm',
        plot = True,
        compute_df = False,
    )
    mae_list.append(mae)
    name_list.append(short_name)

    # Write metrics to csv: append one row, emitting the header only when the
    # file does not exist yet (append mode creates the file in that case).
    metrics_df = pd.DataFrame({'Name': [short_name], 'MAE': [mae]})
    write_header = not os.path.exists(csv_file_path)
    metrics_df.to_csv(csv_file_path, mode = 'a', index = False, header = write_header)

## Metrics

In [None]:
# Print the experiments with the lowest MAE recorded in the metrics log.
metrics = pd.read_csv('processing_metrics_lstm.csv')
top_res = min(10, len(metrics))
print('Top ' + str(top_res) + ' MAE:')
mae_arr = metrics['MAE'].nsmallest(top_res)
# Take the label from the 'Name' column of the same CSV row. The log is
# appended to across runs, so its row labels do not line up with the in-memory
# name_list of the current session (wrong names or IndexError after a re-run
# or kernel restart).
for row_label, mae_value in mae_arr.items():
    print(str(metrics.loc[row_label, 'Name']) + ': ' + str(mae_value))

ratio_values = [True, False]
smoothing_values = [True, False]
collinearity_values = [True, False]
log_values = [True, False]
aggregation_values = [100, 200, 300, 500]  
statistics_values = [True, False]
differencing_values = [1]
seasonality_values = [True, False]
normalize_values = [True]

Best combination(s): False_False_False_False_300_False_1_False_True