IMPORT


In [16]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from xgboost import XGBRegressor

import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = 2000
pd.options.display.max_rows = 2000


cpi_pivot = pd.read_csv('CPI_Historic_Values_Zindi_Latest.csv')
cpi_pivot['Month'] = pd.to_datetime(cpi_pivot['Month'])
petrol = pd.read_csv('petrolem.csv')
lending = pd.read_csv('historical_prime_lending_rates.csv')
lending = lending.ffill()
seed = 3

ADD THE november ROW


In [17]:
date_str = '2023-11-30'
date_obj = pd.to_datetime(date_str)
new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row]).reset_index(drop=True)

FEATURE ENGINNERING


In [18]:
feats_to_lag = cpi_pivot.columns[1:].to_list()
for col in feats_to_lag:
    for i in range(1,5):
        cpi_pivot[f'prev_{i}_month_{col}'] = cpi_pivot[col].shift(i)
columns = list(cpi_pivot.columns)
columns = [item for item in columns if "Transport" not in item]

In [19]:
petrol['Month'] = pd.to_datetime(petrol['Month'])
petrol = petrol.sort_values("Month").reset_index(drop=True)
feats_to_lag = petrol.columns[1:].to_list()
for col in feats_to_lag:
    for i in range(1,5):
        petrol[f'prev_{i}_month_{col}'] = petrol[col].shift(i)
start_date = pd.to_datetime('2022-01-31')
end_date = pd.to_datetime('2023-11-30')
petrol = petrol[(petrol['Month'] >= start_date) & (petrol['Month'] <= end_date)]
cpi_pivot = pd.merge(cpi_pivot, petrol, on='Month', suffixes=('_df1', '_df2'))

In [20]:
lending['Interest_per_annum'] = lending['Interest_per_annum'].str.rstrip('%').astype(float).astype(int)
lending['Month'] = pd.to_datetime(lending['Month'])
lending = lending.sort_values("Month").reset_index(drop=True)
feats_to_lag = lending.columns[1:].to_list()
for col in feats_to_lag:
    for i in range(1,5):
        lending[f'prev_{i}_month_{col}'] = lending[col].shift(i)
start_date = pd.to_datetime('2022-01-31')
end_date = pd.to_datetime('2023-11-30')
lending = lending[(lending['Month'] >= start_date) & (lending['Month'] <= end_date)]
cpi_pivot = pd.merge(cpi_pivot, lending, on='Month', suffixes=('_df1', '_df2'))

In [21]:
petcol = list(petrol.columns)
search_strings = ["petrol95" ,"randuS_exchenge" ]
petcol = [item for item in petcol if all(search_str not in item for search_str in search_strings)]
petcol = petcol[1:]

HANDLE MISSING DATA


In [22]:
cpi_pivot = cpi_pivot.drop(0)
cpi_pivot = cpi_pivot.bfill()

TRAIN AND VALIDATION


In [23]:
train = cpi_pivot[cpi_pivot['Month'] != "2023-11-30"]
test = cpi_pivot[cpi_pivot['Month'] == "2023-11-30"]

training_set = train[train['Month']!= '2023-10-31']
validation_set = train[train['Month']== '2023-10-31']

train.shape, test.shape, training_set.shape, validation_set.shape

((21, 101), (1, 101), (20, 101), (1, 101))

MODELING


In [24]:
target_cols = ['Headline_CPI', 'Food and non-alcoholic beverages', 'Alcoholic beverages and tobacco', 
               'Clothing and footwear', 'Housing and utilities', 'Household contents and services',
               'Health', 'Transport', 'Communication', 'Recreation and culture', 
               'Education', 'Restaurants and hotels ', 'Miscellaneous goods and services',]

#if you add additional data sources that have no value in the predicting month , drop it, now that you have their lags
features= [col for col in train.columns if col not in target_cols + ['Month']]
featurest= [col for col in train.columns if col not in columns + ['Transport']]

X_train = training_set[features]
X_traint = training_set[featurest]
y_train = training_set[target_cols]

X_val = validation_set[features]
X_valt = validation_set[featurest]
y_val = validation_set[target_cols]

l_models = {}

y_predl = []

rmsel_dict = {} 

scaler = MinMaxScaler()

#training
for target_col in target_cols:
    if target_col == "Transport":
        l_model = XGBRegressor(seed=seed)
        X_train_scaled = scaler.fit_transform(X_traint)

        l_model.fit(X_train_scaled, y_train[target_col])

        l_models[target_col] = l_model
        
        l_model = l_models[target_col]

        X_val_scaled = scaler.transform(X_valt)

        y_pred_coll = l_model.predict(X_val_scaled)

        rmsel_col = np.sqrt(mean_squared_error(y_pred_coll, y_val[target_col]))
        rmsel_dict[target_col] = rmsel_col 

        y_predl.append(y_pred_coll)
    else:
        l_model = Lasso(alpha=0.06, random_state=seed)
        
        X_train_scaled = scaler.fit_transform(X_train)

        l_model.fit(X_train_scaled, y_train[target_col])

        l_models[target_col] = l_model
        
        l_model = l_models[target_col]

        X_val_scaled = scaler.transform(X_val)

        y_pred_coll = l_model.predict(X_val_scaled)

        rmsel_col = np.sqrt(mean_squared_error(y_pred_coll, y_val[target_col]))
        rmsel_dict[target_col] = rmsel_col 

        y_predl.append(y_pred_coll)

# scoring
y_predl = np.array(y_predl).T
y_predl[:,0][0] = ((6.26 * y_predl[:,2][0]) + (3.65 * y_predl[:,3][0]) + (2.42 * y_predl[:,8][0]) 
                   + (2.62 * y_predl[:,10][0]) + (17.14 * y_predl[:,1][0]) + (1.44 * y_predl[:,6][0]) + 
                   (4.37 * y_predl[:,5][0]) 
                   + (24.49 * y_predl[:,4][0]) + (14.81 * y_predl[:,12][0]) + (5.2 * y_predl[:,9][0]) + 
                   (3.25 * y_predl[:,11][0]) + (14.35 * y_predl[:,7][0]))/100
dfl = pd.DataFrame({'y_pred': y_predl.flatten(), 'y_val': y_val.values.flatten()})

# Print RMSE for each target column
for target_col in target_cols:
    print(f'RMSE of Lasso Regression for {target_col}: {rmsel_dict[target_col]}')

# Calculate the average RMSE across all target columns
average_rmse = np.mean(list(rmsel_dict.values()))
print(f'Average RMSE of Lasso Regression: {average_rmse}')

RMSE of Lasso Regression for Headline_CPI: 1.6831534153047016
RMSE of Lasso Regression for Food and non-alcoholic beverages: 1.7989970499927352
RMSE of Lasso Regression for Alcoholic beverages and tobacco: 0.3481599043186634
RMSE of Lasso Regression for Clothing and footwear: 0.7298653053919253
RMSE of Lasso Regression for Housing and utilities: 0.8387270750065028
RMSE of Lasso Regression for Household contents and services: 0.286591957772103
RMSE of Lasso Regression for Health: 0.1541171258229923
RMSE of Lasso Regression for Transport: 5.1818695068359375
RMSE of Lasso Regression for Communication: 0.1599999999999966
RMSE of Lasso Regression for Recreation and culture: 0.6825067844873445
RMSE of Lasso Regression for Education: 0.3634361974726943
RMSE of Lasso Regression for Restaurants and hotels : 3.5604299072645915
RMSE of Lasso Regression for Miscellaneous goods and services: 0.0001276467562547623
Average RMSE of Lasso Regression: 1.2144601443404957


In [25]:
X_train = train[features]
X_traint = train[featurest]
y_train = train[target_cols]

X_val = test[features]
X_valt = test[featurest]
y_val = test[target_cols]

l_models = {}

y_predl = []

scaler = MinMaxScaler()

#training
for target_col in target_cols:
    if target_col == "Transport":
        l_model = XGBRegressor(seed=seed)
        X_train_scaled = scaler.fit_transform(X_traint)

        l_model.fit(X_train_scaled, y_train[target_col])

        l_models[target_col] = l_model
        l_model = l_models[target_col]

        X_val_scaled = scaler.transform(X_valt)

        y_pred_coll = l_model.predict(X_val_scaled)

        y_predl.append(y_pred_coll)
    else:
        l_model = Lasso(alpha=0.06, random_state=seed)

        X_train_scaled = scaler.fit_transform(X_train)

        l_model.fit(X_train_scaled, y_train[target_col])

        l_models[target_col] = l_model
        l_model = l_models[target_col]

        X_val_scaled = scaler.transform(X_val)

        y_pred_coll = l_model.predict(X_val_scaled)

        y_predl.append(y_pred_coll)

y_predl = np.array(y_predl).T
y_predl[:,0][0] = ((6.26 * y_predl[:,2][0]) + (3.65 * y_predl[:,3][0]) + (2.42 * y_predl[:,8][0]) 
                   + (2.62 * y_predl[:,10][0]) + (17.14 * y_predl[:,1][0]) + (1.44 * y_predl[:,6][0]) + 
                   (4.37 * y_predl[:,5][0]) 
                   + (24.49 * y_predl[:,4][0]) + (14.81 * y_predl[:,12][0]) + (5.2 * y_predl[:,9][0]) + 
                   (3.25 * y_predl[:,11][0]) + (14.35 * y_predl[:,7][0]))/100

print(f'prediction of Lasso Regression: {y_predl}') 

prediction of Lasso Regression: [[112.16440124 120.11415529 111.37430409 104.41025897 108.81943646
  108.4166872  111.20575045 118.17901611  99.65238095 105.78234986
  111.11017898 113.30460373 110.33310043]]


In [26]:
def prepSub(template_df, month_name, cpi_values):
    """
    Args:
        template_df (pd.DataFrame): The DataFrame template.
        month_name (str): The name of the month (e.g., "September").
        cpi_values (np.ndarray): Numpy ndarray containing CPI values.

    Returns:
        pd.DataFrame: The modified DataFrame with CPI values
    """
    # Find the starting row index for the given month
    month_index = template_df.index[template_df['ID'].str.startswith(month_name)].tolist()
    
    if not month_index:
        raise ValueError(f"Month '{month_name}' not found in the template.")

    start_row = month_index[0]
    cpi_values = cpi_values.squeeze()
    # Update CPI values iteratively in the DataFrame
    for i, value in enumerate(cpi_values):
        template_df.at[start_row + i, 'Value'] = value

    return template_df

In [27]:
ss = pd.read_csv("October.csv")

In [28]:
sub = prepSub(ss,"November", y_predl) 
sub.to_csv('out/November.csv', index=False)

In [29]:
sub

Unnamed: 0,ID,Value
0,September_headline CPI,110.867612
1,September_food and non-alcoholic beverages,118.96852
2,September_alcoholic beverages and tobacco,111.293235
3,September_clothing and footwear,104.144544
4,September_housing and utilities,108.071407
5,September_household contents and services,108.178395
6,September_health,110.739999
7,September_transport,112.895767
8,September_communication,99.668421
9,September_recreation and culture,105.208355
