In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
import pickle
import scipy.stats as st

In [None]:
# Read the raw table from disk: a tab-separated text file without a header row.
# The first field of every line is used as the row index.
filepath_content = ".../Output_geocoded_lines_Bcn_filtered_19062020_NoHeader.txt"
df_Lis = pd.read_csv(
    filepath_content,
    sep="\t",
    header=None,
    index_col=0,
)

In [None]:
# Restrict the analysis to a fixed window of rows (positions 100000-399999)
# and drop the first 10 columns and the final column of the loaded table.
full_df = df_Lis.iloc[100000:400000, 10:-1]
len_full_df = full_df.shape[1]

# Keep only the most recent 60 columns (the original notes describe this as
# the last year of data -- the sampling interval is not visible here, TODO confirm).
# The start position is clamped at 0: with fewer than 60 columns available,
# `len_full_df - 60` would be negative and iloc would count it from the end,
# silently returning only a few trailing columns instead of everything.
dataset_df = full_df.iloc[:, max(len_full_df - 60, 0):]

In [None]:
# Window bookkeeping: Anomaly_Period columns are set aside, and the remaining
# len_TS - Anomaly_Period columns give the model's input width (len_input).
Anomaly_Period = 10
len_TS = dataset_df.shape[1]
len_input = len_TS - Anomaly_Period

# Features: every column from position Anomaly_Period - 1 up to, but not
# including, the last one. Target: the last column, kept as a one-column
# DataFrame so it stays two-dimensional.
x = dataset_df.iloc[:, Anomaly_Period - 1:-1]
y = dataset_df.iloc[:, len_TS - 1:]

In [None]:
# Hold out the trailing 30% of rows for testing. shuffle=False preserves the
# existing row order, so the training set is the leading 70% of rows.
# NOTE(review): the original comment motivates this with time-series data, but
# x and y are column slices of dataset_df and the split is made over its rows --
# confirm that holding out rows is the intended validation scheme.
split_parts = train_test_split(x, y, test_size=0.3, shuffle=False)
X_train, X_test, y_train, y_test = split_parts

print(f"Train shapes: {X_train.shape}, {y_train.shape}")
print(f"Test shapes: {X_test.shape}, {y_test.shape}")

In [None]:
# Train one default-configured XGBoost regressor per target column by wrapping
# it in MultiOutputRegressor. fit() hands back the fitted wrapper, which is
# kept under the name model_XGB for the cells that follow.
base_estimator = xgb.XGBRegressor()
regressor = MultiOutputRegressor(base_estimator)
model_XGB = regressor.fit(X_train, y_train)

In [None]:
# Persist the fitted model to disk with pickle.
# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() handed directly to pickle.dump is never
# closed explicitly and can leave a truncated file behind.
# NOTE: only unpickle this file from a trusted location -- pickle.load can
# execute arbitrary code.
XGB_EA = 'XGB_EA.sav'
with open(XGB_EA, 'wb') as model_file:
    pickle.dump(model_XGB, model_file)

In [None]:
# Evaluate the fitted model on the rows it was trained on.
# score() is the estimator's built-in regression score (reported below as R2).
r2_train = model_XGB.score(X_train, y_train)
y_train_predicted = model_XGB.predict(X_train)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated and then removed in recent scikit-learn releases,
# where it raises a TypeError. For the single-column target used here both
# forms give the same number (with several target columns `squared=False`
# averaged per-column RMSEs, which is not the same as sqrt of the averaged MSE).
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_predicted))
mae_train = mean_absolute_error(y_train, y_train_predicted)

print(f"Train R2: {r2_train}")
print(f"Train RMSE: {rmse_train}")
print(f"Train MAE: {mae_train}")

In [None]:
# Predict the held-out rows and report error metrics for them.
y_test_predicted = model_XGB.predict(X_test)

# RMSE is computed as sqrt(MSE) instead of passing `squared=False`: that
# keyword was deprecated and then removed in recent scikit-learn releases,
# where it raises a TypeError. For the single-column target used here both
# forms give the same number.
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_predicted))
mae_test = mean_absolute_error(y_test, y_test_predicted)

print(f"Test RMSE: {rmse_test}")
print(f"Test MAE: {mae_test}")

In [None]:
# Mean absolute percentage error on the held-out rows; the metric returns a
# fraction, so it is multiplied by 100 for display.
mape_test = mean_absolute_percentage_error(y_test, y_test_predicted)
print(f"Test MAPE: {mape_test * 100}%")

# Second MAPE variant that avoids dividing by targets near zero: every target
# whose magnitude is at most 0.99 is replaced by 1 before the metric is computed.
# NOTE(review): the substitution changes the reference value itself (and drops
# the sign of small negative targets), so the error term is affected as well as
# the denominator -- confirm this is the intended adjustment.
near_zero = np.abs(y_test) <= 0.99
y_test_1 = np.where(near_zero, 1, y_test)
mape_test_1 = mean_absolute_percentage_error(y_test_1, y_test_predicted)
print(f"Adjusted Test MAPE: {mape_test_1 * 100}%")

In [None]:
# Build prediction inputs from the first len_input columns of dataset_df and
# run the model on the first row only (kept as a one-row DataFrame).
# NOTE(review): the model was fitted on the columns starting at position
# Anomaly_Period - 1, whereas this window starts at position 0. The width is
# the same but the columns (and their labels) differ -- confirm the shifted
# window is intended and that the estimator accepts the different labels.
Input_for_prediction = dataset_df.iloc[:, 0:len_input]
first_row = Input_for_prediction.head(1)
y_test_predicted_0 = model_XGB.predict(first_row)
print("Predicted values for first input: ", y_test_predicted_0)

In [None]:
# 99% normal-approximation confidence interval for the mean of one input row
# (the row at position 25). loc is the row mean and scale is its standard
# error, so the interval describes the mean of the input values themselves,
# not the uncertainty of the model's prediction.
x_input = Input_for_prediction.iloc[25:26, :]

# The confidence level is passed positionally: SciPy renamed this keyword from
# `alpha` to `confidence` and later removed `alpha`, so `alpha=0.99` raises a
# TypeError on current releases. The positional form works on old and new ones.
CI = st.norm.interval(0.99, loc=x_input.mean(axis=1), scale=x_input.sem(axis=1))

print(f"Input data: {x_input}")
print(f"Confidence Interval (99%): {CI}")