IMPORT

In [211]:
import datetime
import math
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBRegressor

# Suppress every warning category for the rest of the session.
warnings.filterwarnings(action='ignore')

# Let pandas render up to 2000 columns / rows before truncating the display.
pd.set_option('display.max_columns', 2000)
pd.set_option('display.max_rows', 2000)


# Read the five raw input tables from the notebook's working directory.
# The order of the file names matches the order of the names on the left.
cpi, vehicles, rental, jse, currency = (
    pd.read_csv(csv_name)
    for csv_name in (
        'CPI_Historic_Values_Zindi_May_23.csv',
        'Naamsa_Vehicle_Sales.csv',
        'PayProp_Rental_Index.csv',
        'jse_indices.csv',
        'currency_data.csv',
    )
)

PIVOTING THE DATA

In [212]:
# Reshape the CPI table from long format (one row per Month/Category) to wide
# format (one row per Month, one column per Category), parse the month into a
# datetime and order the rows chronologically with a fresh 0..n-1 index.
cpi_pivot = (
    cpi.pivot(index='Month', columns='Category', values='Value')
    .reset_index()
    .assign(Month=lambda wide: pd.to_datetime(wide['Month']))
    .sort_values('Month')
    .reset_index(drop=True)
)

ADDING JUNE'S DATA MANUALLY FROM https://www.statssa.gov.za/publications/P0141/P0141June2023.pdf

In [213]:
# Append a row for June 2023 and fill it with manually entered CPI values.
date_str = '2023-06-30'
date_obj = pd.to_datetime(date_str)
new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row]).reset_index(drop=True)

# One value per CPI category.
# NOTE(review): the values are assumed to be listed in the same order as the
# category columns of cpi_pivot — confirm against the published table.
cpi_of_june = [110.9, 104.3, 99.6, 110.4, 118.3, 109.8, 110.8, 107.7, 105.4, 109.6, 105.3, 110.0, 112.3]

# Every column after 'Month' receives one of the June values.
category_cols = cpi_pivot.columns[1:]
if len(category_cols) != len(cpi_of_june):
    raise ValueError(
        f"Expected {len(category_cols)} June CPI values, got {len(cpi_of_june)}"
    )

# Address the appended row by its real label instead of a hard-coded 17, so the
# cell keeps working if the history gains or loses months.
june_label = cpi_pivot.index[-1]
cpi_pivot.loc[june_label, category_cols] = cpi_of_june

ADDING ADDITIONAL DATA

* Vehicles https://naamsa.net/wp-content/uploads/2023/07/20230703-Flash-Report-Summary-June-2023.pdf

In [214]:
# 'YYYY-MM' key used to join the monthly vehicle sales onto the CPI table.
cpi_pivot['year_month'] = pd.to_datetime(cpi_pivot['Month']).dt.strftime('%Y-%m')

start_date = datetime.datetime.strptime("2020-12-31", "%Y-%m-%d")
end_date = datetime.datetime.strptime("2023-05-31", "%Y-%m-%d")

# Spacing between consecutive dates: 'M' means month end.
D = 'M'

# The sales file gets its dates assigned by position, newest month first.
# NOTE(review): this assumes the rows of `vehicles` run from May 2023 back to
# December 2020 with no gaps — confirm against the CSV.
date_list = pd.date_range(start_date, end_date, freq=D)[::-1]
vehicles['Date'] = date_list
vehicles['year_month'] = vehicles['Date'].dt.strftime('%Y-%m')

sales_cols = ['Total_Local Sales', 'Total_Export_Sales']
cpi_pivot = cpi_pivot.merge(vehicles[['year_month'] + sales_cols], on='year_month', how='left')

# June 2023 is not covered by the dates assigned above, so its sales figures
# (local, export) are entered by hand.
namsa_of_june = [46810, 27296]

# Locate the June row by its key rather than by the hard-coded label 17, and
# name the two columns explicitly rather than relying on them being last.
june_rows = cpi_pivot['year_month'] == '2023-06'
if not june_rows.any():
    raise ValueError("No 2023-06 row found in cpi_pivot to receive the June vehicle sales")
cpi_pivot.loc[june_rows, sales_cols] = namsa_of_june

* jse_indices

In [215]:
# Index the JSE table by its parsed date so it can be resampled by month.
jse = jse.assign(date=pd.to_datetime(jse['date'])).set_index('date')

# Month-end average of every index column.
jse_monthly_avg = jse.resample('M').mean()

# Keep only the months present in both tables, then discard the two indices
# that are not used further on.
cpi_pivot = cpi_pivot.merge(jse_monthly_avg, left_on='Month', right_index=True, how='inner')
cpi_pivot = cpi_pivot.drop(columns=['Mid Cap Index', 'Small Cap Index'])

ADD THE JULY ROW

In [216]:
# Append an empty row for July 2023 (only 'Month' is set); the index is
# renumbered 0..n-1 so the new row takes the last label.
date_str = '2023-07-31'
date_obj = pd.to_datetime(date_str)
new_row = pd.DataFrame({'Month': [date_obj]})
cpi_pivot = pd.concat([cpi_pivot, new_row], ignore_index=True)

FEATURE ENGINEERING

In [217]:
# Every column except the two date keys gets lagged copies.
feats_to_lag = [col for col in cpi_pivot.columns if col not in ['Month', 'year_month']]

# Build the 1- to 5-month lags of each feature in one pass (grouped by feature,
# then by lag) and attach them to the right of the existing columns.
lagged = {
    f"prev_{lag}_month_{name}": cpi_pivot[name].shift(lag)
    for name in feats_to_lag
    for lag in range(1, 6)
}
cpi_pivot = pd.concat([cpi_pivot, pd.DataFrame(lagged)], axis=1)

HANDLE MISSING DATA

In [218]:
# Columns derived from the JSE indices: the raw index columns and their lags.
jse_index_names = ['Top 40 Index', 'All-Share Index', 'Large Cap Index', 'Fin 15 Index']
target_columns = [col for col in cpi_pivot.columns if any(name in col for name in jse_index_names)]

# NOTE(review): this cell used to fit a KMeans model per column and pass
# fillna() a Series of cluster centres indexed only by the rows that were
# already non-null. fillna() aligns on the index, so the rows that actually
# held NaN never had a replacement value and nothing was imputed. The cell
# changed no data, and the missing lag values were (and still are) filled by
# the bfill() call that follows this cell.
# The dead KMeans machinery is removed so the notebook does not claim an
# imputation it never performed. If a model-based imputation is wanted it has
# to be designed explicitly; a 1-D clustering of the observed values cannot
# assign a cluster to a missing one.

# Report how many values are still missing per JSE-derived column.
missing_per_column = cpi_pivot[target_columns].isna().sum()
missing_per_column[missing_per_column > 0]


In [219]:
# Remove the row labelled 0, then fill each remaining gap with the next valid
# value further down the same column (gaps with no later value stay NaN).
cpi_pivot = cpi_pivot.drop(index=0).bfill()

TRAIN AND VALIDATION

In [220]:
# July 2023 is the month to predict; everything else is available for fitting.
is_july = cpi_pivot['Month'] == "2023-07-31"
train = cpi_pivot[~is_july]
test = cpi_pivot[is_july]

# Hold June 2023 out of `train` as a one-row validation set.
is_june = train['Month'] == '2023-06-30'
training_set = train[~is_june]
validation_set = train[is_june]

train.shape, test.shape, training_set.shape, validation_set.shape

((17, 116), (1, 116), (16, 116), (1, 116))

MODELING

In [221]:
# CPI categories to predict; one model of each kind is fitted per category.
target_cols = ['Alcoholic beverages and tobacco', 'Clothing and footwear',
       'Communication', 'Education', 'Food and non-alcoholic beverages',
       'Headline_CPI', 'Health', 'Household contents and services',
       'Housing and utilities', 'Miscellaneous goods and services',
       'Recreation and culture', 'Restaurants and hotels ', 'Transport']

# Same-month columns that must not be used as predictors (only their lags are).
not_included = ['Top 40 Index', 'All-Share Index',
       'Large Cap Index', 'Fin 15 Index', 'Month', 'year_month','Total_Local Sales', 'Total_Export_Sales']

# If you add data sources that have no value in the month being predicted,
# add them to `not_included` — their lags remain available as features.
excluded = target_cols + not_included
features = [col for col in train.columns if col not in excluded]

X_train = training_set[features]
y_train = training_set[target_cols]

X_val = validation_set[features]
y_val = validation_set[target_cols]

# The feature matrix is the same for every target, so scale it once instead of
# refitting the scaler inside the per-target loop.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# training: one linear model and one XGBoost model per target column
lr_models = {}
x_models = {}
for target_col in target_cols:
    lr_models[target_col] = LinearRegression().fit(X_train_scaled, y_train[target_col])
    x_models[target_col] = XGBRegressor().fit(X_train_scaled, y_train[target_col])

# validation: stack the per-target predictions into (n_rows, n_targets) arrays
y_pred = np.array([lr_models[target_col].predict(X_val_scaled) for target_col in target_cols]).T
y_predx = np.array([x_models[target_col].predict(X_val_scaled) for target_col in target_cols]).T

# scoring: flatten predictions and actuals so one RMSE covers all targets
df = pd.DataFrame({'y_pred': y_pred.flatten(), 'y_val': y_val.values.flatten()})
dfx = pd.DataFrame({'y_pred': y_predx.flatten(), 'y_val': y_val.values.flatten()})

# Each model is scored against the actuals stored in its own frame; the
# arguments follow scikit-learn's (y_true, y_pred) order.
rmse = np.sqrt(mean_squared_error(df['y_val'], df['y_pred']))
rmsex = np.sqrt(mean_squared_error(dfx['y_val'], dfx['y_pred']))
print(f'RMSE of Liner Regression: {rmse}')
print(f'RMSE of XGB: {rmsex}')

RMSE of Liner Regression: 1.2847860949796923
RMSE of XGB: 0.41593481771500934


In [222]:
# Refit on all of `train` and predict the `test` row(s).
X_train = train[features]
y_train = train[target_cols]

X_val = test[features]
# NOTE(review): the targets of the test rows are presumably all NaN, since the
# July row is created with only 'Month' set. `y_val` is kept for parity with
# the validation cell and is not used below.
y_val = test[target_cols]

# The feature matrix is shared by every target, so scale it once instead of
# refitting the scaler inside the per-target loop.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# training: one linear model and one XGBoost model per target column
lr_models = {}
x_models = {}
for target_col in target_cols:
    lr_models[target_col] = LinearRegression().fit(X_train_scaled, y_train[target_col])
    x_models[target_col] = XGBRegressor().fit(X_train_scaled, y_train[target_col])

# prediction: stack the per-target predictions into (n_rows, n_targets) arrays
y_pred = np.array([lr_models[target_col].predict(X_val_scaled) for target_col in target_cols]).T
y_predx = np.array([x_models[target_col].predict(X_val_scaled) for target_col in target_cols]).T
print(f'pridiction of Liner Regression: {y_pred}')
print(f'prediction of XGB: {y_predx}')

pridiction of Liner Regression: [[110.48965713 104.40039987  98.50114215 108.85978376 119.65346875
  110.61748385 110.44771364 107.77174776 106.82800545 110.26336741
  106.38961241 109.88281078 112.9697659 ]]
prediction of XGB: [[110.89848  104.29873   99.65686  110.39956  118.29866  109.79878
  110.79808  107.68143  105.39845  109.486275 105.29846  110.39842
  113.19369 ]]


SUBMISSION

In [223]:
def prepSub(y_pred, target_cols: list, test: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """Reshape a matrix of predictions into a long ID/Value submission table.

    Parameters
    ----------
    y_pred : 2-D array-like
        Predictions with one row per row of ``test`` and one column per entry
        of ``target_cols``.
    target_cols : list
        Target column names, in the same order as the columns of ``y_pred``.
    test : pandas.DataFrame
        Frame holding a ``'Month'`` column with one row per prediction row.
    prefix : str
        Text prepended (followed by ``'_'``) to every ID.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``'ID'`` (``prefix`` + ``'_'`` + the target name
        lower-cased, underscores replaced by spaces and stripped, with
        ``'headline cpi'`` restored to ``'headline CPI'``) and ``'Value'``.
        Rows are grouped by target in ``target_cols`` order.

    Raises
    ------
    ValueError
        If ``y_pred`` and ``test`` do not have the same number of rows.
    """
    sub_df = pd.DataFrame(y_pred, columns=target_cols)
    if len(sub_df) != len(test):
        raise ValueError(
            f"y_pred has {len(sub_df)} row(s) but test has {len(test)} row(s)"
        )

    sub_df.columns = [f"{prefix}_{col.lower().replace('_', ' ').strip()}" for col in sub_df.columns]
    sub_df = sub_df.rename(columns={f"{prefix}_headline cpi": f"{prefix}_headline CPI"})

    # Attach the months by position. Assigning the Series directly would align
    # on index labels, and `test` keeps the labels of the frame it was sliced
    # from while `sub_df` is labelled 0..n-1, so the months would come out as
    # NaT or be taken from the wrong rows.
    sub_df.insert(0, 'Month', test['Month'].to_numpy())

    long_df = sub_df.melt(id_vars=['Month'], var_name='ID', value_name='Value')

    return long_df[['ID', 'Value']]

In [225]:
# Build the submission from the linear-regression predictions (`y_pred`).
sub = prepSub(y_pred, target_cols, test, 'July')

# to_csv does not create missing directories, so make sure the output folder
# exists before writing.
submission_path = Path('out/liner_with_lag5_hvj_k.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
sub.to_csv(submission_path, index=False)