# Instructions

* This is a fill in the blanks challenge
* Basic Machine Learning functions have been written out for you
* You will have to change the hyper-parameters yourself

Feature Engineering
* Some basic feature engineering has been done for you
* If you would like to do more, please feel free to do so
* Some additional "Options" have been provided, simply copy & paste or uncomment to use the code

# Import Dataset

In [None]:
import kagglehub

# Download the Walmart sales dataset through kagglehub. The returned value is
# kept in `path`; the CSV-reading cell builds the file name from it.
path = kagglehub.dataset_download("mikhail1681/walmart-sales")

# Print the location so the downloaded files can be inspected manually.
print("Path to dataset files:", path)

# Data Engineering and Cleaning

### Pro-Tip: Read Through the Lines in the code (& Explanation) before running the code block.

#### If you want to back-track after doing feature engineering, re-run the cell that reads the CSV first, so that `df` is restored to the original dataset.

Reading the CSV into a Pandas Dataframe

In [None]:
import pandas as pd
import numpy as np

# Load the sales table from the downloaded dataset folder. `path` is defined by
# the download cell, so that cell has to be run first.
df = pd.read_csv(path + "/Walmart_Sales.csv")
# Preview the first rows to confirm the file was parsed as expected.
display(df.head())

Basic Feature Engineering

In [None]:
# Parse the day-month-year strings into timestamps so the `.dt` accessors work.
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')
df['Month'] = df['Date'].dt.month  # numeric month (1-12)

# Cyclical encoding: place the month on a circle so that December (12) and
# January (1) end up next to each other instead of 11 units apart.
df['Month_sin'] = np.sin(2 * np.pi * df['Month'] / 12)
df['Month_cos'] = np.cos(2 * np.pi * df['Month'] / 12)

## Other Options (Uncomment or Copy & Paste to use the code):
## These lines read `Date`, so they must stay ABOVE the drop below;
## running them after `Date` has been removed raises a KeyError.
## To keep the raw `Month` column as a feature, also remove "Month" from the drop list.

## df['Year'] = df['Date'].dt.year
## df['Month'] = df['Date'].dt.month
## df['Day'] = df['Date'].dt.day
## df['DayOfWeek'] = df['Date'].dt.dayofweek

# Remove the helper columns: `Month` is now represented by the sin/cos pair and
# the raw `Date` is not used as a feature.
# NOTE: after this line the cell cannot be re-run without re-reading the CSV,
# because `Date` no longer exists.
df = df.drop(columns=["Month", "Date"])

df.head()

Splitting into X, y and Train Test (80-20 split)



*   DO NOT Change this train-test split. It will be used for evaluation



In [None]:
# Evaluation split -- the fraction (0.8) and the seed (1000) must stay as they are.
train = df.sample(frac=0.8, random_state=1000)   # reproducible random 80% of the rows
test = df.drop(index=train.index)                # every row that was not sampled above

# Separate the target (`Weekly_Sales`) from the predictors in each partition.
X_train, y_train = train.drop(columns=["Weekly_Sales"]), train["Weekly_Sales"]
X_test, y_test = test.drop(columns=["Weekly_Sales"]), test["Weekly_Sales"]

Standardisation of the Dataset

In [None]:
from sklearn.preprocessing import StandardScaler

# One scaler for the features and one for the target, so predictions can later be
# mapped back to the original Weekly_Sales scale with `scaler_y.inverse_transform`.
scaler_X, scaler_y = StandardScaler(), StandardScaler()

# Fit the scalers on the TRAINING data only, then wrap the resulting array in a
# DataFrame again so columns can still be selected by name.
# Note: the rebuilt frames carry a fresh 0..n-1 index, not the original row labels.
X_train_standardised = pd.DataFrame(
    scaler_X.fit_transform(X_train),
    columns=X_train.columns,
)
# StandardScaler expects 2-D input, hence the reshape to a single column.
y_train_standardised = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1))

# The test data is only transformed, reusing the statistics learned from the
# training data.
X_test_standardised = pd.DataFrame(
    scaler_X.transform(X_test),
    columns=X_test.columns,
)
y_test_standardised = scaler_y.transform(y_test.to_numpy().reshape(-1, 1))

# Unsupervised Learning

(a) Fitting PCA

In [None]:
from sklearn.decomposition import PCA

# Keep as many components as there are feature columns, so the variance of every
# component is available afterwards (used by the scree plot).
pca = PCA(n_components = X_train_standardised.shape[1], random_state= 1000)
# Fit on the standardised training features only.
pca.fit(X_train_standardised)

(a) Scree Plot

In [None]:
import matplotlib.pyplot as plt

# Derive the x positions from the fitted PCA instead of hard-coding 8 components,
# so both plots keep working when feature engineering adds or removes columns.
explained = pca.explained_variance_ratio_
component_numbers = np.arange(1, len(explained) + 1)

# Scree plot: share of the total variance captured by each individual component.
plt.plot(component_numbers, explained, marker='o', linestyle='-', linewidth=2, markersize=6, color='steelblue')
plt.xticks(component_numbers)  # component numbers are integers; avoid fractional ticks
plt.xlabel('Principal Component Number', fontsize=12, fontweight='bold')
plt.ylabel('Explained Variance Ratio', fontsize=12, fontweight='bold')
plt.title('Scree Plot: Variance Explained by Each PC',
              fontsize=14, fontweight='bold')
plt.show()

# Cumulative plot: variance captured by the first k components together.
plt.plot(component_numbers, np.cumsum(explained), marker='o', linestyle='-', linewidth=2, markersize=6, color='steelblue')
plt.xticks(component_numbers)
plt.xlabel('Principal Component Number', fontsize=12, fontweight='bold')
plt.ylabel('Cumulative Explained Variance', fontsize=12, fontweight='bold')
plt.title('Cumulative Plot: Sum of Each Individual PC',
              fontsize=14, fontweight='bold')
plt.show()

(b) Biplot

In [None]:
pcaX = pca.transform(X_train_standardised)[:, :2]   # scores on the first two principal components
loadings = pca.components_.T[:, :2]                 # (n_features, 2): weight of each feature in PC1 / PC2
print("Raw loadings values: \n", loadings)
feature_names = X_train_standardised.columns

fig, ax1 = plt.subplots(figsize=(10, 8)) # Create main figure and axis

# Scatter the training points in PC1/PC2 space, coloured by the standardised target.
sc = ax1.scatter(
    pcaX[:, 0], pcaX[:, 1],
    c=y_train_standardised, cmap='coolwarm',
    s=10, alpha=0.6, linewidth=0.3
)

# Primary (black) axes -> PCA scores
ax1.set_xlabel("PC1 scores", fontsize=12, fontweight='bold', labelpad=10)
ax1.set_ylabel("PC2 scores", fontsize=12, fontweight='bold', labelpad=10)
ax1.set_title("PCA Biplot with Score and Loading Axes", fontsize=14, fontweight='bold', pad=25)

# Secondary (red) axes -> loadings of each original feature.
# twinx() provides the visible right-hand y axis; twiny() on top of it provides the
# top x axis but hides its own y axis. The y label and y tick colours therefore have
# to be set on the twinx axes, otherwise they are never drawn.
ax_right = ax1.twinx()
ax2 = ax_right.twiny()          # shares its y range with ax_right
ax2.set_xlim(-1, 1)
ax2.set_ylim(-1, 1)
ax2.set_xlabel("PC1 loadings", fontsize=12, fontweight='bold', color='red', labelpad=10)
ax_right.set_ylabel("PC2 loadings", fontsize=12, fontweight='bold', color='red', labelpad=10)
ax2.tick_params(axis='x', colors='red')
ax_right.tick_params(axis='y', colors='red')

# Draw the loading arrows directly in loading coordinates (on the red axes), so the
# tip of every arrow can be read off the red tick marks without any rescaling.
for i, feature in enumerate(feature_names):
    ax2.arrow(0, 0, loadings[i, 0], loadings[i, 1],
              color='red', alpha=0.7, head_width=0.03, length_includes_head=True)
    # Place the label slightly beyond the arrow tip so it does not cover the head.
    ax2.text(loadings[i, 0]*1.1, loadings[i, 1]*1.1, feature,
             color='red', ha='center', va='center', fontsize=10, fontweight='bold')

# Add reference axis lines
ax1.axhline(0, color="gray", linestyle="--", linewidth=1)
ax1.axvline(0, color="gray", linestyle="--", linewidth=1)

# Formatting and Layout Code
# Both axis pairs use limits that are symmetric around zero, which keeps the score
# origin and the loading origin on the same spot. Points outside this window are
# not shown.
ax1.set_xlim(-4,4)
ax1.set_ylim(-5,5)
plt.tight_layout(rect=[0, 0, 1, 0.96]) # leave room at the top so the title stays visible

plt.show()

# UMAP

(a) Fitting

In [None]:
from umap import UMAP

# from umap.umap_ import UMAP # uncomment if the above import does not work for you

# Two-component UMAP embedding of the standardised training features.
# n_neighbors is set to 100 here; random_state is fixed so reruns use the same seed.
umap = UMAP(n_components=2, n_neighbors = 100, random_state=1000)
# Learn the embedding and return the 2-D coordinates of every training row.
umap_train = umap.fit_transform(X_train_standardised)

(b) Visualisation

Note: This code block uses KMeans Clustering because the y_variable is continuous. KMeans was not one of the unsupervised learning models in this Workshop, but remains extremely useful

In its core essence, KMeans Clustering helps you identify clusters in your data by grouping data closest together.

The main hyperparameter for KMeans is n_clusters, which is the number of clusters you would like to see

Usually the number of clusters is eyeballed, but there are also metrics that can guide you towards a good number of clusters, such as the elbow point (similar to the scree plot used for PCA), the Bayesian Information Criterion (BIC), the Akaike Information Criterion (AIC) or the corrected Akaike Information Criterion (AICc).

However, these metrics are simply a guide and you should mainly rely on your own intuition.

For this dataset, it is quite clear that there are 3 main clusters

In [None]:
# 2-D UMAP embedding (100 neighbours), coloured by KMeans cluster.
# The target is continuous, so cluster labels are used to make the groups visible.
from sklearn.cluster import KMeans

# feel free to toggle the number of clusters as you see fit
kmeans = KMeans(n_clusters=3, random_state=1000)
# fit() returns the fitted estimator itself, so `labels` is the KMeans object;
# the per-point cluster ids live in its `labels_` attribute.
labels = kmeans.fit(umap_train)

fig, ax = plt.subplots()
ax.scatter(umap_train[:, 0], umap_train[:, 1], c=labels.labels_, s=5)
ax.set_title("2D UMAP Visualization - 100 Neighbours")
ax.set_xlabel("Component 1")
ax.set_ylabel("Component 2")
plt.show()

(c) 3D Visualisation

In [None]:
# Same settings as the 2-D embedding, but with three output components so the
# result can be shown in a 3-D scatter plot.
umap2 = UMAP(n_components=3, n_neighbors = 100, random_state=1000)
umap_train2 = umap2.fit_transform(X_train_standardised)

In [None]:
from mpl_toolkits.mplot3d import Axes3D

# Cluster the 3-D embedding first, then draw it.
# feel free to toggle the number of clusters as you see fit
kmeans = KMeans(n_clusters=3, random_state=1000)
labels = kmeans.fit(umap_train2)  # fitted estimator; cluster ids are in `labels.labels_`

fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(projection='3d')

# umap_train2 has exactly three columns (n_components=3), so the transposed array
# unpacks into the x, y and z coordinates.
scatter = ax.scatter(*umap_train2.T, c=labels.labels_, s=5, alpha=0.7)

ax.set_title("3D UMAP Visualization - 100 Neighbours", fontsize=14)
for set_label, text in (
    (ax.set_xlabel, "UMAP Dimension 1"),
    (ax.set_ylabel, "UMAP Dimension 2"),
    (ax.set_zlabel, "UMAP Dimension 3"),
):
    set_label(text)

plt.show()

Unsupervised Learning Comments:
1. This dataset is a case where unsupervised learning is very informative: the clusters are clearly defined. The structure is likely non-linear, because UMAP produced a much clearer separation than PCA.
2. The next step would be to think about comparing cluster characteristics to understand what is driving this divergence.
3. There are a number of ways that you can do this, however, it is out of the scope of this workshop because there is simply not enough time and understanding the subsequent models will take up a whole workshop on its own.
4. However, we strongly encourage you to read up on the subsequent steps to further understand your dataset! Listed below are some suggestions:
* KMeans Group By Cluster
* ANOVA / Kruskal–Wallis tests --> Understand which features are driving the difference (These methods have strong foundations in Statistics)
* Decision Trees with cluster as a target variable | This becomes a classification task | Important! Do not get the impression that models can only fall in the groups of supervised learning and unsupervised learning. In some cases (such as this one) supervised learning models can help with unsupervised learning tasks.
* Principal Component Regression / Regression after UMAP

# Supervised Learning:

### Use the provided code templates as a starting point. Experiment by modifying the model parameters and selected features to achieve the lowest possible RMSE.

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# --- Choose regressors here(exclude Weekly_Sales) ---
# Fill the list with column names (as strings) taken from X_train.columns.
# It is intentionally empty in the template and must be filled in before running.
feature_cols = [

]

# Unscaled features are used here; the model is fitted on the raw Weekly_Sales
# target, so predictions are already in the original units.
X_train_lr = X_train[feature_cols]
X_test_lr = X_test[feature_cols]

# --- Fit model ---
model = LinearRegression()
model.fit(X_train_lr, y_train)

# --- Predict on test set ---
y_pred_lr = model.predict(X_test_lr)

# --- Calculate Mean Squared Error, then take the square root to get RMSE ---
mse = mean_squared_error(y_test, y_pred_lr)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

# --- Choose regressors here(exclude Weekly_Sales) ---
# Fill the list with column names (as strings) taken from X_train_standardised.columns.
# It is intentionally empty in the template and must be filled in before running.
feature_cols = [

]

# KNN is fitted on the standardised features and the standardised target.
X_train_knn = X_train_standardised[feature_cols]
X_test_knn = X_test_standardised[feature_cols]

# --- Fit KNN Regressor ---
knn = KNeighborsRegressor(n_neighbors = ____)  # Replace ____ with your chosen k (number of neighbours)
# ravel() flattens the (n, 1) target array into the 1-D shape passed to fit().
knn.fit(X_train_knn, y_train_standardised.ravel())

# --- Predict on test set ---
y_pred_standardised = knn.predict(X_test_knn)

# --- Inverse transform predictions to original Weekly_Sales scale ---
# inverse_transform is given 2-D input, hence the reshape of the 1-D predictions.
y_pred = scaler_y.inverse_transform(y_pred_standardised.reshape(-1, 1))
y_test_original = scaler_y.inverse_transform(y_test_standardised)

# --- Compute metrics (RMSE on the original Weekly_Sales scale) ---
mse = mean_squared_error(y_test_original, y_pred)
rmse = np.sqrt(mse)
print(f"KNN Root Mean Squared Error (RMSE): {rmse:.2f}")


## XGBoost

In [None]:
from xgboost import XGBRegressor
# Imported here as well (like the KNN cell does) so this cell does not depend on
# one of the earlier model cells having been run first.
from sklearn.metrics import mean_squared_error

# --- Choose regressors here(exclude Weekly_Sales) ---
# Fill the list with column names (as strings) taken from X_train.columns.
# It is intentionally empty in the template and must be filled in before running.
feature_cols = [

]

# Unscaled features and the raw Weekly_Sales target are used for this model.
X_train_xgb = X_train[feature_cols]
X_test_xgb = X_test[feature_cols]

# --- Fit XGBoost Regressor (You may change the all the parameters except random_state within XGBRegressor() )---
# Replace every ___ placeholder with a value of your choice.
xgb_model = XGBRegressor(
    n_estimators = ___,     # number of boosting rounds (trees)
    max_depth = ___,        # tree depth (controls model complexity)
    learning_rate = ___,    # step size shrinkage
    random_state = 1000     # Please don't change the random state for reproducibility
)
xgb_model.fit(X_train_xgb, y_train)

# --- Predict on test set ---
y_pred_xgb = xgb_model.predict(X_test_xgb)

# --- Compute metrics (RMSE in the original Weekly_Sales units) ---
mse = mean_squared_error(y_test, y_pred_xgb)
rmse = np.sqrt(mse)

print(f"XGBoost Root Mean Squared Error (RMSE): {rmse:.2f}")


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Imported here as well (like the KNN cell does) so this cell does not depend on
# one of the earlier model cells having been run first.
from sklearn.metrics import mean_squared_error

# --- Choose regressors here(exclude Weekly_Sales) ---
# Fill the list with column names (as strings) taken from X_train.columns.
# It is intentionally empty in the template and must be filled in before running.
feature_cols = [

]

# Unscaled features and the raw Weekly_Sales target are used for this model.
X_train_rf = X_train[feature_cols]
X_test_rf = X_test[feature_cols]

# --- Fit Random Forest Regressor (You may change all the parameters except random_state within RandomForestRegressor()) ---
# Replace every ___ placeholder with a value of your choice.
rf_model = RandomForestRegressor(
    n_estimators = ___,     # number of trees in the forest
    max_depth = ___,        # maximum depth of each tree (None lets trees grow until all leaves are pure)
    random_state = 1000     # Please don't change the random state for reproducibility
)
rf_model.fit(X_train_rf, y_train)

# --- Predict on test set ---
y_pred_rf = rf_model.predict(X_test_rf)

# --- Compute metrics (RMSE in the original Weekly_Sales units) ---
mse = mean_squared_error(y_test, y_pred_rf)
rmse = np.sqrt(mse)

print(f"Random Forest Root Mean Squared Error (RMSE): {rmse:.2f}")
