# In [4]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import warnings

# Silence every warning, of every category, for the rest of the session.
# NOTE(review): this also hides anything the model-fitting code below may warn
# about (e.g. optimizer convergence problems) — confirm that is intended.
warnings.filterwarnings("ignore")

# Load dataset
# 'ref_date' is converted to datetime so it can be compared against date
# strings and used as a time-series index further down.
data = pd.read_csv("./MachineLearning.csv")
data['ref_date'] = pd.to_datetime(data['ref_date'])

# Drop the 'predicted_vacancies' column (raises KeyError if it is absent, as
# before). Non-inplace form so `data` is always an explicitly rebound frame.
data = data.drop(columns='predicted_vacancies')

# Keep data from 2019 onwards. The explicit .copy() makes `data` an independent
# frame, so adding columns to it later is not a write onto a filtered slice
# (the pattern pandas reports with SettingWithCopyWarning).
data = data[data['ref_date'] >= '2019-01-01'].copy()

# Sentiment mapping: one sentiment score per calendar year.
sentiment_mapping = {
    2019: 1.826354,
    2020: 1.826354,
    2021: 0.930538,
    2022: 0.000000,
    2023: 0.000000,
    2024: 0.568935,
}

# Propagate sentiment scores forward for future years.
# The carried-forward value is read from the most recent known year instead of
# repeating its literal, so updating that year's score cannot leave the future
# years out of sync.
latest_known_score = sentiment_mapping[max(sentiment_mapping)]
future_years = range(2025, 2027)
sentiment_mapping.update(dict.fromkeys(future_years, latest_known_score))

# Attach a sentiment score to every row, looked up by the calendar year of
# its 'ref_date' (years absent from the mapping come out as NaN).
ref_years = data['ref_date'].dt.year
data['sentiment_score'] = ref_years.map(sentiment_mapping)

# Training window: every row dated on or before 2022-12-31 (i.e. through Q4 2022)
training_cutoff = '2022-12-31'
train_data = data.loc[data['ref_date'] <= training_cutoff]

# Sum total_vacancies for each (geo, noc_code, noc_desc, ref_date, job_char)
# combination, then turn the group keys back into ordinary columns.
group_keys = ['geo', 'noc_code', 'noc_desc', 'ref_date', 'job_char']
vacancy_totals = train_data.groupby(group_keys)[['total_vacancies']].sum()
aggregated_data = vacancy_totals.reset_index()

# Remove unwanted regions: keep every row whose geo is not 'Northwest Territories'
wanted_region = aggregated_data['geo'].ne('Northwest Territories')
aggregated_data = aggregated_data.loc[wanted_region]

# Collects one forecast DataFrame per successfully fitted series
predictions = []

# Define SARIMAX model parameters
order = (1, 0, 1)              # non-seasonal (p, d, q)
seasonal_order = (1, 1, 0, 4)  # seasonal (P, D, Q, s); s=4 matches the quarterly index

# Forecast window: Q1 2023 to Q2 2026 (14 quarters). It is identical for every
# series, so it is built once here instead of on every loop iteration.
forecast_steps = 14
forecast_dates = pd.date_range(start='2023-01-01', periods=forecast_steps, freq='QS')
forecast_start, forecast_end = forecast_dates[0], forecast_dates[-1]

# NOTE(review): the model below is fitted without any exogenous regressor, so
# the sentiment score is only attached to the output rows and does not
# influence the forecast values — confirm that is the intent.

# Iterate through unique combinations of geo, noc_code, noc_desc, and job_char
for (geo, noc_code, noc_desc, job_char), group in aggregated_data.groupby(['geo', 'noc_code', 'noc_desc', 'job_char']):
    # Quarterly (quarter-start) series of vacancies; quarters missing inside
    # the observed range are treated as 0 vacancies.
    series = group.set_index('ref_date')['total_vacancies'].asfreq('QS').fillna(0)

    # Best-effort per series: a failure is reported and the loop moves on.
    try:
        # get_forecast() always starts right after the series' own last
        # quarter, which is not necessarily Q4 2022. Forecast far enough to
        # reach the end of the window and label rows with the forecast's own
        # dates, rather than assuming the first step is Q1 2023.
        last_observed = series.index[-1]
        steps_needed = len(pd.date_range(start=last_observed, end=forecast_end, freq='QS')) - 1
        if steps_needed < 1:
            raise ValueError("series already extends to the end of the forecast window")

        # Train SARIMAX model
        model = SARIMAX(
            series,  # Endogenous variable
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False,
        )
        results = model.fit(disp=0)

        forecast_df = results.get_forecast(steps=steps_needed).summary_frame()
        # Keep only the requested window; earlier steps exist only when the
        # series stopped before Q4 2022.
        forecast_df = forecast_df.loc[forecast_df.index >= forecast_start].copy()

        # Prepare forecast DataFrame
        forecast_df['geo'] = geo
        forecast_df['noc_code'] = noc_code
        forecast_df['noc_desc'] = noc_desc
        forecast_df['job_char'] = job_char
        forecast_df['ref_date'] = forecast_df.index
        # Map sentiment scores for forecasted years
        forecast_df['sentiment_score'] = [sentiment_mapping[date.year] for date in forecast_df.index]

        # Append the relevant columns to predictions
        predictions.append(forecast_df[['ref_date', 'geo', 'noc_code', 'noc_desc', 'job_char', 'sentiment_score', 'mean']])

    except Exception as e:
        print(f"Error processing geo: {geo}, noc_code: {noc_code}, job_char: {job_char} - {e}")

# Combine predictions for all groups into a single DataFrame.
# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list, so the no-forecast case is reported explicitly (same exception type).
if not predictions:
    raise ValueError("No forecasts were produced; nothing to save to predictions_with_sentiment.csv")
predictions_df = pd.concat(predictions, ignore_index=True)

# Rename the 'mean' column to 'predicted_total_vacancies'
predictions_df = predictions_df.rename(columns={'mean': 'predicted_total_vacancies'})

# Save predictions to a CSV file (without the row index)
predictions_df.to_csv('predictions_with_sentiment.csv', index=False)

# Print a confirmation message
print("Predictions saved to predictions_with_sentiment.csv")


# Output: Predictions saved to predictions_with_sentiment.csv
