IMPORT


In [138]:
import math
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from statsmodels.api import OLS
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')

pd.options.display.max_columns = 2000
pd.options.display.max_rows = 2000


# Seed handed to every estimator below that accepts one, so reruns are repeatable.
seed = 3

# Historic CPI observations; later cells read its Month, Category and Value
# columns. The file is resolved relative to the notebook's working directory.
cpi = pd.read_csv('CPI_Historic_Values_Zindi_May_23.csv')

# Disabled inputs from earlier experiments:
#path = '/Koding_With_Kolesh/challenges/RMB_NOWCAST_APRIL/'
#vehicles = pd.read_csv('Naamsa_Vehicle_Sales.csv')

PIVOTING THE DATA


In [139]:
# Reshape the long CPI table (one row per Month/Category pair) into a wide
# frame with one row per month and one column per category, parse the month
# labels as timestamps, and order the rows chronologically.
cpi_pivot = (
    cpi.pivot(index='Month', columns='Category', values='Value')
    .reset_index()
    .assign(Month=lambda wide: pd.to_datetime(wide['Month']))
    .sort_values("Month")
    .reset_index(drop=True)
)

ADDING JUNE'S DATA MANUALLY FROM https://www.statssa.gov.za/publications/P0141/P0141June2023.pdf


In [140]:
# Append the June 2023 observation, which is not part of the CSV. The figures
# were typed in by hand from the Stats SA P0141 release linked in the heading.
date_str = '2023-06-30'
date_obj = pd.to_datetime(date_str)

# One value per category column, in the column order of the pivoted frame.
cpi_of_june = [110.9, 104.3, 99.6, 110.4, 118.3, 109.8, 110.8, 107.7, 105.4, 109.6, 105.3, 110.0, 112.3]

category_cols = cpi_pivot.columns[1:]  # every column after 'Month'
if len(cpi_of_june) != len(category_cols):
    # Fail loudly instead of silently leaving gaps or dropping values.
    raise ValueError(
        f'expected {len(category_cols)} June values, got {len(cpi_of_june)}'
    )

new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row]).reset_index(drop=True)

# Target the row that was just appended rather than a hard-coded label, so the
# cell keeps working when the CSV gains or loses months.
june_idx = cpi_pivot.index[-1]
for i, col in enumerate(category_cols):
    cpi_pivot.at[june_idx, col] = cpi_of_june[i]

ADD THE JULY ROW


In [141]:
# Add a placeholder row for July 2023, the month to forecast. Only 'Month' is
# set here; the category columns of this row start out missing.
date_str = '2023-07-31'
date_obj = pd.to_datetime(date_str)
new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row], ignore_index=True)

FEATURE ENGINEERING


In [142]:
# Lag features: for every category column, its value 1 to 5 rows (months)
# earlier, stored as prev_<k>_month_<category>.
feats_to_lag = list(cpi_pivot.columns[1:])  # snapshot taken before columns are added
for col in feats_to_lag:
    base_series = cpi_pivot[col]
    for i in (1, 2, 3, 4, 5):
        cpi_pivot[f'prev_{i}_month_{col}'] = base_series.shift(i)

HANDLE MISSING DATA


In [143]:
# Discard the first row (label 0), then back-fill what is still missing: each
# gap takes the next available value further down the same column. Gaps with
# nothing below them stay NaN.
# NOTE(review): back-filling the lag columns copies later observations into
# earlier rows - confirm this is acceptable for the first few months.
cpi_pivot = cpi_pivot.drop(index=0).bfill()

TRAIN AND VALIDATION


In [144]:
# Hold out the forecast month (July 2023) as `test`; every other row is `train`.
is_forecast_month = cpi_pivot['Month'] == "2023-07-31"
test = cpi_pivot[is_forecast_month]
train = cpi_pivot[~is_forecast_month]

# Inside `train`, June 2023 is set aside as the validation rows.
is_validation_month = train['Month'] == '2023-06-30'
validation_set = train[is_validation_month]
training_set = train[~is_validation_month]

train.shape, test.shape, training_set.shape, validation_set.shape

((17, 79), (1, 79), (16, 79), (1, 79))

MODELING


In [145]:
import optuna


# Columns to forecast. The labels must match the frame's columns exactly,
# including the trailing space in 'Restaurants and hotels '.
target_cols = [
    'Alcoholic beverages and tobacco',
    'Clothing and footwear',
    'Communication',
    'Education',
    'Food and non-alcoholic beverages',
    'Headline_CPI',
    'Health',
    'Household contents and services',
    'Housing and utilities',
    'Miscellaneous goods and services',
    'Recreation and culture',
    'Restaurants and hotels ',
    'Transport',
]

# Everything that is neither a target nor the month label is a feature.
# If extra data sources without a value in the forecast month are added, drop
# their raw columns here and keep only their lags.
_non_features = set(target_cols) | {'Month'}
features = [col for col in train.columns if col not in _non_features]


X_train, y_train = training_set[features], training_set[target_cols]
X_val, y_val = validation_set[features], validation_set[target_cols]

# Fitted estimators keyed by target column, one dict per model family.
x_models, lr_models, l_models, r_models, el_models = {}, {}, {}, {}, {}
bp = {}  # reserved for tuned parameters per target; nothing fills it while tuning is disabled

# Per-target prediction arrays, appended in target_cols order.
y_pred, y_predx, y_predl, y_predr, y_predel = [], [], [], [], []

scaler = MinMaxScaler()

def objective(trial):
    """Optuna objective: validation MSE of a Ridge model for one target.

    Draws Ridge hyper-parameters from ``trial``, fits on the notebook-level
    ``X_train`` / ``y_train[target_col]`` and scores on ``X_val`` /
    ``y_val[target_col]``. ``target_col`` is read from the enclosing scope, so
    it must be set (for example by the training loop) before a study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameter values.

    Returns
    -------
    float
        Mean squared error on the validation rows (lower is better).
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    alpha = trial.suggest_float('alpha', 1e-5, 100, log=True)  # Ridge regularization strength
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    copy_X = trial.suggest_categorical('copy_X', [True, False])
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    tol = trial.suggest_float('tol', 1e-6, 1e-3, log=True)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])

    r_model = Ridge(
        alpha=alpha,
        fit_intercept=fit_intercept,
        copy_X=copy_X,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        # 'sag' and 'saga' shuffle the data; seeding keeps trials repeatable.
        random_state=seed,
    )
    r_model.fit(X_train, y_train[target_col])
    y_pred_colr = r_model.predict(X_val)
    return mean_squared_error(y_val[target_col], y_pred_colr)

#training
# Ridge tuning with `objective` is switched off. To re-enable it, run the
# following inside the loop and store the result in bp[target_col]:
#study = optuna.create_study(direction='minimize')
#study.optimize(objective, n_trials=50)
#best_params = study.best_params

# Min-max scaling depends only on X_train, so fit it once rather than once per
# target column.
X_train_scaled = scaler.fit_transform(X_train)

# One independent model of each family per target column.
for target_col in target_cols:
    lr_model = LinearRegression()
    x_model = XGBRegressor(seed=seed)
    r_model = Ridge(random_state=seed)
    l_model = Lasso(alpha=0.1, random_state=seed)
    el_model = ElasticNet(alpha=0.1, l1_ratio=0.5, warm_start=True, random_state= seed)

    lr_model.fit(X_train_scaled, y_train[target_col])
    x_model.fit(X_train_scaled, y_train[target_col])
    # NOTE(review): Ridge is the only model fitted on the unscaled features -
    # confirm this is intentional. Prediction below uses unscaled input to match.
    r_model.fit(X_train, y_train[target_col])
    l_model.fit(X_train_scaled, y_train[target_col])
    el_model.fit(X_train_scaled, y_train[target_col])

    lr_models[target_col] = lr_model
    l_models[target_col] = l_model
    r_models[target_col] = r_model
    el_models[target_col] = el_model
    x_models[target_col] = x_model

#validation
# Reuse the scaler fitted on the training rows; transform the validation rows once.
X_val_scaled = scaler.transform(X_val)

for target_col in target_cols:
    lr_model = lr_models[target_col]
    x_model = x_models[target_col]
    l_model = l_models[target_col]
    r_model = r_models[target_col]
    el_model = el_models[target_col]

    y_pred_col = lr_model.predict(X_val_scaled)
    y_pred_colx = x_model.predict(X_val_scaled)
    y_pred_coll = l_model.predict(X_val_scaled)
    y_pred_colr = r_model.predict(X_val)
    y_pred_colel = el_model.predict(X_val_scaled)

    y_pred.append(y_pred_col)
    y_predx.append(y_pred_colx)
    y_predl.append(y_pred_coll)
    y_predr.append(y_pred_colr)
    y_predel.append(y_pred_colel)

#scoring
# Stack the per-target arrays and transpose to (validation rows, targets).
y_pred = np.array(y_pred).T
y_predx = np.array(y_predx).T
y_predl = np.array(y_predl).T
y_predr = np.array(y_predr).T
y_predel = np.array(y_predel).T

# Flatten predictions and actuals in the same order so they pair up element-wise.
y_val_flat = y_val.values.flatten()
df = pd.DataFrame({'y_pred': y_pred.flatten(), 'y_val': y_val_flat})
dfx = pd.DataFrame({'y_pred': y_predx.flatten(), 'y_val': y_val_flat})
dfl = pd.DataFrame({'y_pred': y_predl.flatten(), 'y_val': y_val_flat})
dfr = pd.DataFrame({'y_pred': y_predr.flatten(), 'y_val': y_val_flat})
dfel = pd.DataFrame({'y_pred': y_predel.flatten(), 'y_val': y_val_flat})

#calculate the rmse
# Each model is scored against the actuals held in its own frame.
rmse = np.sqrt(mean_squared_error(df['y_val'], df['y_pred']))
rmsex = np.sqrt(mean_squared_error(dfx['y_val'], dfx['y_pred']))
rmsel = np.sqrt(mean_squared_error(dfl['y_val'], dfl['y_pred']))
rmser = np.sqrt(mean_squared_error(dfr['y_val'], dfr['y_pred']))
rmseel = np.sqrt(mean_squared_error(dfel['y_val'], dfel['y_pred']))

print(f'RMSE of Liner Regression: {rmse}')
print(f'RMSE of XGB: {rmsex}')
print(f'RMSE of Lasso Regression: {rmsel}')
print(f'RMSE of Riged Regression: {rmser}')
print(f'RMSE of Elastic Regression: {rmseel}')

RMSE of Liner Regression: 1.3683359367578005
RMSE of XGB: 0.396467953394632
RMSE of Lasso Regression: 2.732853595795069
RMSE of Riged Regression: 1.1015477968778138
RMSE of Elastic Regression: 2.091601060380883


In [146]:
#best_params

In [147]:
# Final fit: train on every row of `train` (the validation month included)
# and predict the held-out `test` row.
X_train = train[features]
y_train = train[target_cols]

# The *_val names are reused for the forecast row; y_val is not used below.
X_val = test[features]
y_val = test[target_cols]

x_models = {}
lr_models = {}
l_models = {}
r_models = {}
el_models = {}

y_pred = []
y_predx = []
y_predl = []
y_predr = []
y_predel = []

scaler = MinMaxScaler()

# Scaling depends only on the feature matrices, so do it once up front instead
# of repeating it for every target column.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#training
for target_col in target_cols:
    lr_model = LinearRegression()
    x_model = XGBRegressor(seed=seed)
    r_model = Ridge(random_state=seed)
    l_model = Lasso(alpha=0.1, random_state=seed)
    el_model = ElasticNet(alpha=0.1, l1_ratio=0.5, warm_start=True, random_state= seed)

    lr_model.fit(X_train_scaled, y_train[target_col])
    x_model.fit(X_train_scaled, y_train[target_col])
    # NOTE(review): Ridge is the only model fitted on the unscaled features -
    # confirm this is intentional.
    r_model.fit(X_train, y_train[target_col])
    l_model.fit(X_train_scaled, y_train[target_col])
    el_model.fit(X_train_scaled, y_train[target_col])

    lr_models[target_col] = lr_model
    l_models[target_col] = l_model
    r_models[target_col] = r_model
    el_models[target_col] = el_model
    x_models[target_col] = x_model

#prediction
for target_col in target_cols:
    lr_model = lr_models[target_col]
    x_model = x_models[target_col]
    l_model = l_models[target_col]
    r_model = r_models[target_col]
    el_model = el_models[target_col]

    y_pred_col = lr_model.predict(X_val_scaled)
    y_pred_colx = x_model.predict(X_val_scaled)
    y_pred_coll = l_model.predict(X_val_scaled)
    y_pred_colr = r_model.predict(X_val)
    y_pred_colel = el_model.predict(X_val_scaled)

    y_pred.append(y_pred_col)
    y_predx.append(y_pred_colx)
    y_predl.append(y_pred_coll)
    y_predr.append(y_pred_colr)
    y_predel.append(y_pred_colel)

# Stack the per-target arrays and transpose to (forecast rows, targets).
y_pred = np.array(y_pred).T
y_predx = np.array(y_predx).T
y_predl = np.array(y_predl).T
y_predr = np.array(y_predr).T
y_predel = np.array(y_predel).T

print(f'pridiction of Liner Regression: {y_pred}')
print(f'prediction of XGB: {y_predx}')
print(f'pridiction of Riged Regression: {y_predr}')
print(f'prediction of Lasso Regression: {y_predl}')
print(f'pridiction of Elastic Net Regression: {y_predel}')

pridiction of Liner Regression: [[110.82532684 104.69229224  98.32687358 109.52677156 119.52832971
  111.04463952 111.03980081 106.92752362 107.41048622 110.68652394
  106.1430026  110.35507674 115.01430442]]
prediction of XGB: [[110.89876  104.29863   99.60103  110.39956  118.29879  109.79865
  110.79883  107.68839  105.398834 109.485954 105.29847  110.39875
  112.424164]]
pridiction of Riged Regression: [[110.43185883 104.7121366   98.60764598 109.36962369 118.3941571
  110.98307363 111.59116958 107.00552457 107.08670826 110.78975297
  106.0759272  111.03819868 116.46913728]]
prediction of Lasso Regression: [[110.67396924 104.12963235  99.68823529 110.82182279 118.23504053
  111.32732749 110.16780038 108.55788595 107.26327597 109.99559602
  105.77131931 109.74638073 116.62558234]]
pridiction of Elastic Net Regression: [[111.11808744 104.41682451  99.68823529 109.81979779 120.47484452
  111.5252384  109.62701959 108.78797567 106.54539057 109.74742597
  105.72663505 110.56060273 118.27

SUBMISSION


In [148]:
def prepSub(y_pred:list, target_cols: list, month=None):
    """Wrap prediction rows in a DataFrame and stamp them with a month.

    Parameters
    ----------
    y_pred : list
        Row-wise values, one inner list per row, matching ``target_cols``.
    target_cols : list
        Column labels for ``y_pred``. If it already contains 'Month', that
        column is overwritten with ``month``.
    month : optional
        Value written to the 'Month' column of every row. When omitted, the
        notebook-level ``date_obj`` is used, which is what the function did
        before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        ``y_pred`` under ``target_cols`` with a 'Month' column set to ``month``.
    """
    sub_df = pd.DataFrame(y_pred, columns=target_cols)
    # Falling back to the global keeps two-argument calls working, but the
    # result then depends on whichever cell last assigned date_obj.
    sub_df['Month'] = date_obj if month is None else month
    return sub_df

In [149]:
# Turn the Lasso forecast into a row shaped [month, value_1, ..., value_n] so it
# can be appended to the wide CPI frame as the July observation.
# NOTE(review): the validation printout above ranks Lasso last among the five
# models - confirm it is the intended choice here.
l = y_predl.tolist()
l[0] = [date_obj, *l[0]]
sub = prepSub(l, ['Month'] + target_cols)

In [150]:
# Second round (forecasting August 2023): rebuild the wide frame from the raw
# CSV, then append June (hand-entered), July (the forecast held in `sub`) and
# an empty August placeholder.
cpi_pivot = cpi.pivot(index = 'Month', columns = 'Category', values = 'Value').reset_index()#changing from a long format to a wide format,
cpi_pivot['Month'] = pd.to_datetime(cpi_pivot['Month'])
cpi_pivot = cpi_pivot.sort_values("Month").reset_index(drop=True)

# June 2023: one value per category column, in the pivoted column order.
date_str = '2023-06-30'
date_obj = pd.to_datetime(date_str)
cpi_of_june = [110.9, 104.3, 99.6, 110.4, 118.3, 109.8, 110.8, 107.7, 105.4, 109.6, 105.3, 110.0, 112.3]

category_cols = cpi_pivot.columns[1:]  # every column after 'Month'
if len(cpi_of_june) != len(category_cols):
    # Fail loudly instead of silently leaving gaps or dropping values.
    raise ValueError(
        f'expected {len(category_cols)} June values, got {len(cpi_of_june)}'
    )

new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row]).reset_index(drop=True)

# Target the row that was just appended rather than a hard-coded label, so the
# cell keeps working when the CSV gains or loses months.
june_idx = cpi_pivot.index[-1]
for i, col in enumerate(category_cols):
    cpi_pivot.at[june_idx, col] = cpi_of_june[i]

# July 2023: the forecast row prepared in the previous cell.
cpi_pivot = pd.concat([cpi_pivot, sub]).reset_index(drop=True)

# August 2023: placeholder row for the month to forecast next.
# Note that date_obj now points at August for any cell run after this one.
date_str = '2023-08-31'
date_obj = pd.to_datetime(date_str)
new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row]).reset_index(drop=True)

In [151]:
# Lag features for the rebuilt frame: for every category column, its value
# 1 to 5 rows (months) earlier, stored as prev_<k>_month_<category>.
feats_to_lag = list(cpi_pivot.columns[1:])  # snapshot taken before columns are added
for col in feats_to_lag:
    base_series = cpi_pivot[col]
    for i in (1, 2, 3, 4, 5):
        cpi_pivot[f'prev_{i}_month_{col}'] = base_series.shift(i)

In [152]:
# Discard the first row (label 0), then back-fill what is still missing from
# the next available value further down each column; gaps with nothing below
# them stay NaN.
cpi_pivot = cpi_pivot.drop(index=0).bfill()

In [153]:
# Hold out the forecast month (August 2023) as `test`; every other row is `train`.
is_forecast_month = cpi_pivot['Month'] == "2023-08-31"
test = cpi_pivot[is_forecast_month]
train = cpi_pivot[~is_forecast_month]

# Inside `train`, June 2023 is again set aside as the validation rows.
is_validation_month = train['Month'] == '2023-06-30'
validation_set = train[is_validation_month]
training_set = train[~is_validation_month]

train.shape, test.shape, training_set.shape, validation_set.shape

((18, 105), (1, 105), (17, 105), (1, 105))

In [154]:
import optuna


# Columns to forecast. The labels must match the frame's columns exactly,
# including the trailing space in 'Restaurants and hotels '.
target_cols = [
    'Alcoholic beverages and tobacco',
    'Clothing and footwear',
    'Communication',
    'Education',
    'Food and non-alcoholic beverages',
    'Headline_CPI',
    'Health',
    'Household contents and services',
    'Housing and utilities',
    'Miscellaneous goods and services',
    'Recreation and culture',
    'Restaurants and hotels ',
    'Transport',
]

# Everything that is neither a target nor the month label is a feature.
# If extra data sources without a value in the forecast month are added, drop
# their raw columns here and keep only their lags.
_non_features = set(target_cols) | {'Month'}
features = [col for col in train.columns if col not in _non_features]


X_train, y_train = training_set[features], training_set[target_cols]
X_val, y_val = validation_set[features], validation_set[target_cols]

# Fitted estimators keyed by target column, one dict per model family.
x_models, lr_models, l_models, r_models, el_models = {}, {}, {}, {}, {}
bp = {}  # reserved for tuned parameters per target; nothing fills it while tuning is disabled

# Per-target prediction arrays, appended in target_cols order.
y_pred, y_predx, y_predl, y_predr, y_predel = [], [], [], [], []

scaler = MinMaxScaler()

def objective(trial):
    """Optuna objective: validation MSE of a Ridge model for one target.

    Repeats the definition from the earlier modeling cell. Draws Ridge
    hyper-parameters from ``trial``, fits on the notebook-level ``X_train`` /
    ``y_train[target_col]`` and scores on ``X_val`` / ``y_val[target_col]``.
    ``target_col`` is read from the enclosing scope, so it must be set (for
    example by the training loop) before a study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyper-parameter values.

    Returns
    -------
    float
        Mean squared error on the validation rows (lower is better).
    """
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    alpha = trial.suggest_float('alpha', 1e-5, 100, log=True)  # Ridge regularization strength
    fit_intercept = trial.suggest_categorical('fit_intercept', [True, False])
    copy_X = trial.suggest_categorical('copy_X', [True, False])
    max_iter = trial.suggest_int('max_iter', 100, 1000)
    tol = trial.suggest_float('tol', 1e-6, 1e-3, log=True)
    solver = trial.suggest_categorical('solver', ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'])

    r_model = Ridge(
        alpha=alpha,
        fit_intercept=fit_intercept,
        copy_X=copy_X,
        max_iter=max_iter,
        tol=tol,
        solver=solver,
        # 'sag' and 'saga' shuffle the data; seeding keeps trials repeatable.
        random_state=seed,
    )
    r_model.fit(X_train, y_train[target_col])
    y_pred_colr = r_model.predict(X_val)
    return mean_squared_error(y_val[target_col], y_pred_colr)

#training
for target_col in target_cols:
    #study = optuna.create_study(direction='minimize')
    #study.optimize(objective, n_trials=50)
    #best_params = study.best_params
    
    lr_model = LinearRegression()
    x_model = XGBRegressor(seed=seed)
    r_model = Ridge(random_state=seed)
    l_model = Lasso(alpha=0.1, random_state=seed)
    el_model = ElasticNet(alpha=0.1, l1_ratio=0.5, warm_start=True, random_state= seed)
    
    X_train_scaled = scaler.fit_transform(X_train)
    
    lr_model.fit(X_train_scaled, y_train[target_col])
    x_model.fit(X_train_scaled, y_train[target_col])
    r_model.fit(X_train, y_train[target_col])
    l_model.fit(X_train_scaled, y_train[target_col])
    el_model.fit(X_train_scaled, y_train[target_col])
    
    lr_models[target_col] = lr_model
    l_models[target_col] = l_model
    r_models[target_col] = r_model
    el_models[target_col] = el_model
    x_models[target_col] = x_model
    #bp[target_col] = best_params
#validation
for target_col in target_cols:
    lr_model = lr_models[target_col]
    x_model = x_models[target_col]
    l_model = l_models[target_col]
    r_model = r_models[target_col]
    el_model = el_models[target_col]
    
    X_val_scaled = scaler.transform(X_val)
    
    y_pred_col = lr_model.predict(X_val_scaled)
    y_pred_colx = x_model.predict(X_val_scaled)
    y_pred_coll = l_model.predict(X_val_scaled)
    y_pred_colr = r_model.predict(X_val)
    y_pred_colel = el_model.predict(X_val_scaled)
    
    y_pred.append(y_pred_col)
    y_predx.append(y_pred_colx)
    y_predl.append(y_pred_coll)
    y_predr.append(y_pred_colr)
    y_predel.append(y_pred_colel)

#scoring

y_pred = np.array(y_pred).T
y_predx = np.array(y_predx).T
y_predl = np.array(y_predl).T
y_predr = np.array(y_predr).T
y_predel = np.array(y_predel).T

df = pd.DataFrame({'y_pred': y_pred.flatten(), 'y_val': y_val.values.flatten()})
dfx = pd.DataFrame({'y_pred': y_predx.flatten(), 'y_val': y_val.values.flatten()})
dfl = pd.DataFrame({'y_pred': y_predl.flatten(), 'y_val': y_val.values.flatten()})
dfr = pd.DataFrame({'y_pred': y_predr.flatten(), 'y_val': y_val.values.flatten()})
dfel = pd.DataFrame({'y_pred': y_predel.flatten(), 'y_val': y_val.values.flatten()})

#calculate the rmse
rmse = np.sqrt(mean_squared_error(df['y_pred'], df['y_val']))
rmsex = np.sqrt(mean_squared_error(dfx['y_pred'], df['y_val']))
rmsel = np.sqrt(mean_squared_error(dfl['y_pred'], df['y_val']))
rmser = np.sqrt(mean_squared_error(dfr['y_pred'], df['y_val']))
rmseel = np.sqrt(mean_squared_error(dfel['y_pred'], df['y_val']))

print(f'RMSE of Liner Regression: {rmse}') 
print(f'RMSE of XGB: {rmsex}') 
print(f'RMSE of Lasso Regression: {rmsel}') 
print(f'RMSE of Riged Regression: {rmser}') 
print(f'RMSE of Elastic Regression: {rmseel}') 

RMSE of Liner Regression: 1.1060831490148735
RMSE of XGB: 0.9811874514292074
RMSE of Lasso Regression: 1.459569323391948
RMSE of Riged Regression: 0.7713745422817264
RMSE of Elastic Regression: 1.0878555934917027


In [155]:
# Final fit for the second round: train on every row of `train` (the
# validation month included) and predict the held-out `test` row.
X_train = train[features]
y_train = train[target_cols]

# The *_val names are reused for the forecast row; y_val is not used below.
X_val = test[features]
y_val = test[target_cols]

x_models = {}
lr_models = {}
l_models = {}
r_models = {}
el_models = {}

y_pred = []
y_predx = []
y_predl = []
y_predr = []
y_predel = []

scaler = MinMaxScaler()

# Scaling depends only on the feature matrices, so do it once up front instead
# of repeating it for every target column.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

#training
for target_col in target_cols:
    lr_model = LinearRegression()
    x_model = XGBRegressor(seed=seed)
    r_model = Ridge(random_state=seed)
    l_model = Lasso(alpha=0.1, random_state=seed)
    el_model = ElasticNet(alpha=0.1, l1_ratio=0.5, warm_start=True, random_state= seed)

    lr_model.fit(X_train_scaled, y_train[target_col])
    x_model.fit(X_train_scaled, y_train[target_col])
    # NOTE(review): Ridge is the only model fitted on the unscaled features -
    # confirm this is intentional.
    r_model.fit(X_train, y_train[target_col])
    l_model.fit(X_train_scaled, y_train[target_col])
    el_model.fit(X_train_scaled, y_train[target_col])

    lr_models[target_col] = lr_model
    l_models[target_col] = l_model
    r_models[target_col] = r_model
    el_models[target_col] = el_model
    x_models[target_col] = x_model

#prediction
for target_col in target_cols:
    lr_model = lr_models[target_col]
    x_model = x_models[target_col]
    l_model = l_models[target_col]
    r_model = r_models[target_col]
    el_model = el_models[target_col]

    y_pred_col = lr_model.predict(X_val_scaled)
    y_pred_colx = x_model.predict(X_val_scaled)
    y_pred_coll = l_model.predict(X_val_scaled)
    y_pred_colr = r_model.predict(X_val)
    y_pred_colel = el_model.predict(X_val_scaled)

    y_pred.append(y_pred_col)
    y_predx.append(y_pred_colx)
    y_predl.append(y_pred_coll)
    y_predr.append(y_pred_colr)
    y_predel.append(y_pred_colel)

# Stack the per-target arrays and transpose to (forecast rows, targets).
y_pred = np.array(y_pred).T
y_predx = np.array(y_predx).T
y_predl = np.array(y_predl).T
y_predr = np.array(y_predr).T
y_predel = np.array(y_predel).T

print(f'pridiction of Liner Regression: {y_pred}')
print(f'prediction of XGB: {y_predx}')
print(f'pridiction of Riged Regression: {y_predr}')
print(f'prediction of Lasso Regression: {y_predl}')
print(f'pridiction of Elastic Net Regression: {y_predel}')

pridiction of Liner Regression: [[109.68086626 104.63300375 100.34640145 109.43057078 119.10994539
  111.42666845 110.93975953 109.35427711 107.04001575 110.2432487
  105.79826581 110.58114063 116.19449073]]
prediction of XGB: [[110.898705 104.267624  99.60883  110.39981  118.300095 109.70104
  110.66538  107.721375 105.39892  109.59975  105.291    109.66177
  113.56251 ]]
pridiction of Riged Regression: [[110.24604351 104.29520768 100.03375975 111.20390674 118.36890993
  110.83293393 109.72534257 108.51064098 107.13756078 109.34301569
  106.06153702 108.97545486 114.64190388]]
prediction of Lasso Regression: [[110.88854954 104.63289981  99.68823529 110.24042897 120.80892737
  110.77803646 110.05331894 108.70811138 107.02325571 109.783881
  105.06331432 109.6165788  117.85279225]]
pridiction of Elastic Net Regression: [[111.18971719 104.64136715  99.67209245 110.46934159 120.58062941
  111.46681003 110.46445007 108.90798335 106.97460717 110.13374722
  105.79788399 110.36149318 117.5702

In [156]:
def prepSub(y_pred:list, target_cols: list, test, prefix:str):
    """Convert a wide block of predictions into the long ID/Value submission table.

    This redefines the earlier two-argument ``prepSub``; once this cell has
    run, the earlier version can no longer be called.

    Parameters
    ----------
    y_pred : array-like
        Predictions, one row per forecast month and one column per entry of
        ``target_cols``.
    target_cols : list
        Target column labels, in the column order of ``y_pred``.
    test : pandas.DataFrame
        Frame with a 'Month' column holding one entry per row of ``y_pred``.
    prefix : str
        Text prepended to every ID, e.g. 'August'.

    Returns
    -------
    pandas.DataFrame
        Columns 'ID' and 'Value', one row per (forecast row, target). An ID is
        ``<prefix>_<target>`` with the target lower-cased, underscores turned
        into spaces and surrounding whitespace stripped; the headline series
        is spelled ``<prefix>_headline CPI``.

    Raises
    ------
    ValueError
        If ``test`` and ``y_pred`` have a different number of rows.
    """
    sub_df = pd.DataFrame(y_pred, columns=target_cols)
    # Assign by position: `test` keeps its original row labels, so assigning
    # the Series directly would align on the index and leave Month as NaT.
    sub_df['Month'] = test['Month'].to_numpy()

    sub_df = sub_df.set_index('Month')
    sub_df.columns = [prefix+'_' + col.lower().replace('_', ' ').strip() for col in sub_df.columns]
    # Lower-casing above also flattened "CPI"; restore the expected spelling.
    sub_df = sub_df.rename(columns={f"{prefix}_headline cpi": f"{prefix}_headline CPI"})

    sub_df = pd.melt(sub_df.reset_index(), id_vars=['Month'], var_name='ID', value_name='Value')

    return sub_df[['ID', 'Value']]

In [157]:
# Long-format submission built from the Lasso forecast for August.
sub = prepSub(y_predl, target_cols, test, 'August')

# to_csv does not create missing folders, so make sure the target one exists.
out_path = Path('out') / 'lasso_with_lag6_historic_base.csv'
out_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(out_path, index=False)