# Baseline

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sml
import statsmodels.formula.api as smf
import statsmodels.stats.api as sms
import datetime
import shap
from pmdarima import auto_arima

  from .autonotebook import tqdm as notebook_tqdm


# Naive Prediction: Last Month Inflation Value

In [2]:
# Use last inflation as new inflation 
# let lag-1 be the prediction


# get full data set 

# inflation (2011 - Mar 2025)
inflation_data = pd.read_csv('./yoy_inflation_till_March_2025.csv')
inflation_data.head()

Unnamed: 0,Month,Year-on-Year Change (%)
0,Jan '11,12.079665
1,Feb '11,11.099837
2,Mar '11,12.779128
3,Apr '11,11.291282
4,May '11,12.352128


In [3]:
# Convert string to datetime and then format
def format_date(date_str):
    date_obj = datetime.datetime.strptime(date_str, '%Y-%m')
    return date_obj.strftime('%b \'%y')

In [4]:
# one lag of the inflation data using df.shift
inflation_data = inflation_data.rename(columns={'Year-on-Year Change (%)': 'Inflation rate'})
inflation_data['Lag_1'] = inflation_data['Inflation rate'].shift(1)
# Ensure Month is in datetime format
inflation_data['Month'] = pd.to_datetime(inflation_data['Month'], format='%b \'%y')
inflation_data.head()

Unnamed: 0,Month,Inflation rate,Lag_1
0,2011-01-01,12.079665,
1,2011-02-01,11.099837,12.079665
2,2011-03-01,12.779128,11.099837
3,2011-04-01,11.291282,12.779128
4,2011-05-01,12.352128,11.291282


In [5]:
inflation_data.dropna(axis=0, how='any', inplace=True, subset=['Lag_1'], ignore_index=True)
inflation_data.head()

Unnamed: 0,Month,Inflation rate,Lag_1
0,2011-02-01,11.099837,12.079665
1,2011-03-01,12.779128,11.099837
2,2011-04-01,11.291282,12.779128
3,2011-05-01,12.352128,11.291282
4,2011-06-01,10.230183,12.352128


In [6]:
# calculate the RMSE, MAE, RMSPE, MAPE and R-squared for each model
def calculate_metrics(y_true, y_pred):

    # Convert inputs to numpy arrays
    y_true = np.array(y_true).flatten()
    y_pred = np.array(y_pred).flatten()

    r_squared = 1 - (np.sum((y_true - y_pred) ** 2) / np.sum((y_true - np.mean(y_true)) ** 2))
    mse = np.mean((y_true - y_pred) ** 2)
    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    rmspe = np.sqrt(np.mean(((y_true - y_pred) / y_true) ** 2)) * 100
    mae = np.mean(np.abs(y_true - y_pred))
    mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
    smape = 100 * np.mean(np.abs(y_pred - y_true) / ((np.abs(y_true) + np.abs(y_pred)) / 2) )
    
    return r_squared, mse, rmse, rmspe, mae, mape, smape



In [7]:
# Calculate metrics for the naive baseline
metrics_1 = calculate_metrics(inflation_data[['Inflation rate']], inflation_data[['Lag_1']])

# Create a DataFrame to store the metrics
metrics_df = pd.DataFrame({
    'Model': ['Naive (Full Dataset)'],
    'R-squared': [metrics_1[0]],
    'MSE': [metrics_1[1]],
    'RMSE': [metrics_1[2]],
    'RMSPE': [metrics_1[3]],
    'MAE': [metrics_1[4]],
    'MAPE': [metrics_1[-2]],
    'SMAPE': [metrics_1[-1]]
})

# Print the metrics DataFrame
metrics_df = metrics_df.round(2)
display(metrics_df)



Unnamed: 0,Model,R-squared,MSE,RMSE,RMSPE,MAE,MAPE,SMAPE
0,Naive (Full Dataset),0.99,0.69,0.83,5.47,0.51,3.44,3.42


In [20]:
# split the data into train and test sets

# split_date = '2024-01-01'  # Define the split date
split_date = '2018-07-01' 

train_data = inflation_data[inflation_data['Month'] < split_date]
test_data = inflation_data[inflation_data['Month'] >= split_date]
# display the first few rows of the train and test DataFrames
# display(train_data)
# display(test_data)

# print the shape of the train and test data
# print(f"Train data shape: {train_data.shape}")
# print(f"Test data shape: {test_data.shape}")

In [21]:

X_train_1 = train_data[['Lag_1']]

# print the shape of the X-train data
# print(f"X_train_1 shape: {X_train_1.shape}")

# Get y-train for 1 - 12 lags
y_train_1 = train_data['Inflation rate'].iloc[X_train_1.index]

# print the shape of the y-train data
# print(f"y_train_1 shape: {y_train_1.shape}")


# Get X-test  
X_test_1 = test_data[['Lag_1']]

# print the shape of the X-test data
# print(f"X_test_1 shape: {X_test_1.shape}")

# Get y-test
y_test = test_data['Inflation rate']

# print the shape of the y-test data
# print(f"y_test shape: {y_test.shape}")

In [22]:
# Calculate metrics for the naive baseline
metrics_test_1 = calculate_metrics(y_test, X_test_1)

# Create a DataFrame to store the metrics
metrics_test_1_df = pd.DataFrame({
    'Model': ['Naive (Full Test Set)'],
    'R-squared': [metrics_test_1[0]],
    'MSE': [metrics_test_1[1]],
    'RMSE': [metrics_test_1[2]],
    'RMSPE': [metrics_test_1[3]],
    'MAE': [metrics_test_1[4]],
    'MAPE': [metrics_test_1[-2]],
    'SMAPE': [metrics_test_1[-1]]
})

# Print the metrics DataFrame
metrics_test_1_df = metrics_test_1_df.round(2)
display(metrics_test_1_df)



Unnamed: 0,Model,R-squared,MSE,RMSE,RMSPE,MAE,MAPE,SMAPE
0,Naive (Full Test Set),0.99,0.81,0.9,3.58,0.53,2.43,2.44


In [23]:
# PERFORMANCE BEFORE REBASE (Jan 2024 - Dec 2024)
# first 12 months in the test set

# Get the first 12 months of the test set
metrics_test_before_rebase_1 = calculate_metrics(y_test[:12], X_test_1[:12])

# Create a DataFrame to store the metrics
metrics_test_before_rebase_1_df = pd.DataFrame({
    'Model': ['Naive (Test Before Rebase)'],
    'R-squared': [metrics_test_before_rebase_1[0]],
    'MSE': [metrics_test_before_rebase_1[1]],
    'RMSE': [metrics_test_before_rebase_1[2]],
    'RMSPE': [metrics_test_before_rebase_1[3]],
    'MAE': [metrics_test_before_rebase_1[4]],
    'MAPE': [metrics_test_before_rebase_1[-2]],
    'SMAPE': [metrics_test_before_rebase_1[-1]]
})

# Print the metrics DataFrame
metrics_test_before_rebase_1_df = metrics_test_before_rebase_1_df.round(2)
display(metrics_test_before_rebase_1_df)



Unnamed: 0,Model,R-squared,MSE,RMSE,RMSPE,MAE,MAPE,SMAPE
0,Naive (Test Before Rebase),-0.29,0.01,0.09,0.83,0.08,0.7,0.7


In [24]:
# PERFORMANCE AFTER REBASE
# calculate the metrics for each model for y_test and y_pred last 3 months (Jan - Mar 2025)

# Now calculate metrics using the Series objects
metrics_test_after_rebase_1 = calculate_metrics(y_test.iloc[-3:], X_test_1.iloc[-3:])

# create a DataFrame to store the metrics
metrics_test_after_rebase_1_df = pd.DataFrame({
    'Model': ['Naive (Test After Rebase)'],
    'R-squared': [metrics_test_after_rebase_1[0]],
    'MSE': [metrics_test_after_rebase_1[1]],
    'RMSE': [metrics_test_after_rebase_1[2]],
    'RMSPE': [metrics_test_after_rebase_1[3]],
    'MAE': [metrics_test_after_rebase_1[4]],
    'MAPE': [metrics_test_after_rebase_1[-2]],
    'SMAPE': [metrics_test_after_rebase_1[-1]]
})

# display the metrics DataFrame
metrics_test_after_rebase_1_df = metrics_test_after_rebase_1_df.round(2)
display(metrics_test_after_rebase_1_df)

Unnamed: 0,Model,R-squared,MSE,RMSE,RMSPE,MAE,MAPE,SMAPE
0,Naive (Test After Rebase),-20.44,12.27,3.5,12.16,2.59,9.09,8.41


In [25]:
list_of_dfs = [
    metrics_test_1_df,
    metrics_test_before_rebase_1_df,
    metrics_test_after_rebase_1_df,
]

result_df = pd.concat(list_of_dfs, axis=0)
result_df

Unnamed: 0,Model,R-squared,MSE,RMSE,RMSPE,MAE,MAPE,SMAPE
0,Naive (Full Test Set),0.99,0.81,0.9,3.58,0.53,2.43,2.44
0,Naive (Test Before Rebase),-0.29,0.01,0.09,0.83,0.08,0.7,0.7
0,Naive (Test After Rebase),-20.44,12.27,3.5,12.16,2.59,9.09,8.41
