In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

# Read the merged dataset from disk and, in the same expression, replace the
# 'ref_date' column with its datetime-parsed equivalent.
merged_df = pd.read_csv('./Best_Merged_Data_Final.csv').assign(
    ref_date=lambda frame: pd.to_datetime(frame['ref_date'])
)

# Select the evaluation window: 2023-01-01 up to (but excluding) 2024-07-01,
# i.e. 2023 Q1 through 2024 Q2.
on_or_after_start = merged_df['ref_date'] >= '2023-01-01'
before_end = merged_df['ref_date'] < '2024-07-01'
filtered_df = merged_df[on_or_after_start & before_end]

# Discard rows lacking either the actual or the predicted vacancy count, then
# make sure both columns hold numeric values for the metric calculations.
# pd.to_numeric is called with its default error handling, so a value that
# cannot be parsed raises instead of being silently dropped.
vacancy_columns = ['total_vacancies', 'predicted_total_vacancies']
filtered_df = filtered_df.dropna(subset=vacancy_columns)
for column_name in vacancy_columns:
    filtered_df[column_name] = pd.to_numeric(filtered_df[column_name])

# Alternatively, replace zeros with a small positive value (uncomment below if preferred).
# NOTE: this rewrites the actual values themselves, so it affects every
# calculation that reads 'total_vacancies' afterwards.
# filtered_df['total_vacancies'] = filtered_df['total_vacancies'].replace(0, 1)


# Actual and predicted series used by every metric below
actual = filtered_df['total_vacancies']
predicted = filtered_df['predicted_total_vacancies']

# Calculate RMSE (square root of the mean squared error)
rmse = np.sqrt(mean_squared_error(actual, predicted))

# Calculate MAPE.
# A percentage error is undefined when the actual value is 0: dividing by it
# yields inf (or NaN for 0/0), and a single inf makes the whole mean infinite.
# Zero actuals are therefore masked to NaN before the division; Series.mean()
# skips NaN by default, so those rows are excluded from the average.
# If every actual value is zero, the MAPE itself is NaN.
nonzero_actual = actual.where(actual != 0)
filtered_df['absolute_percentage_error'] = np.abs(
    (actual - predicted) / nonzero_actual
)
mape = filtered_df['absolute_percentage_error'].mean() * 100  # Convert to percentage

# Calculate Accuracy as the complement of MAPE.
# NOTE: this goes negative whenever MAPE exceeds 100%.
accuracy = 100 - mape

# Output the metrics
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")


RMSE: 168391856.36
MAPE: 1949616.48%
Accuracy: -1949516.48%
