In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from skrebate import ReliefF
from scipy.stats import chi2_contingency
%matplotlib inline

In [2]:
# Load the cleaned inventory data set (path is relative to this notebook's folder).
df = pd.read_csv(
    filepath_or_buffer="../../Data/Transformed/inventory_management_cleaned_jaron.csv"
)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Category,Region,Inventory Level,Price,Discount,Weather Condition,Promotion,Competitor Pricing,Product_UID,Weekday
0,Electronics,North,195,72.72,5,Snowy,0,85.73,S001_P0001,Saturday
1,Clothing,North,117,80.16,15,Snowy,1,92.02,S001_P0002,Saturday
2,Clothing,North,247,62.94,10,Snowy,1,60.08,S001_P0003,Saturday
3,Electronics,North,139,87.63,10,Snowy,0,85.19,S001_P0004,Saturday
4,Groceries,North,152,54.41,0,Snowy,0,51.63,S001_P0005,Saturday


In [4]:
# Turn the categorical text columns into integer codes so that downstream
# ML algorithms can operate on purely numeric data.
# NOTE: `df` is overwritten in place. Re-running this cell without reloading
# `df` would refit the encoders on the integer codes and lose the original
# label names stored in `label_encoders`.

# Columns that hold categorical labels and therefore need encoding
cols_to_encode = ["Region", "Weather Condition", "Category", "Weekday"]

# Fit one encoder per column first and keep it, so the code -> label mapping
# can be inspected (or inverted) later on
label_encoders = {col: LabelEncoder().fit(df[col]) for col in cols_to_encode}

# Replace every categorical column by its integer codes
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

# Inspect the encoded dataframe
df.head()

Unnamed: 0,Category,Region,Inventory Level,Price,Discount,Weather Condition,Promotion,Competitor Pricing,Product_UID,Weekday
0,1,1,195,72.72,5,1,0,85.73,S001_P0001,2
1,0,1,117,80.16,15,1,1,92.02,S001_P0002,2
2,0,1,247,62.94,10,1,1,60.08,S001_P0003,2
3,1,1,139,87.63,10,1,0,85.19,S001_P0004,2
4,3,1,152,54.41,0,1,0,51.63,S001_P0005,2


In [5]:
# Persist the label-encoded frame before any scaling is applied;
# index=False keeps the row index out of the CSV file.
df.to_csv(
    path_or_buf="../../Data/Transformed/inventory_management_cleaned_transformed_unscaled_jaron.csv",
    index=False,
)

In [6]:
# Continuous numeric columns that get rescaled; the encoded categorical
# columns and the Promotion flag are left as they are.
scale_columns = ["Inventory Level", "Price", "Discount", "Competitor Pricing"]

# NOTE(review): despite its name, `df_min_max_scaled` holds z-score
# standardized values (StandardScaler: zero mean, unit variance), not
# min-max scaled ones. The name is kept because later cells reference it.
scaler = StandardScaler()
df_min_max_scaled = df.copy()  # work on a copy so `df` keeps the unscaled values
df_min_max_scaled[scale_columns] = scaler.fit_transform(
    df_min_max_scaled[scale_columns]
)

In [7]:
# Show, for every encoded column, which integer code each original label
# received. pandas is already imported in the notebook's first cell, so it is
# not imported again here.
for col, le in label_encoders.items():
    # le.classes_ holds the original labels; transforming them yields the
    # integer code assigned to each one.
    mapping = pd.DataFrame({
        'Original': le.classes_,
        'Encoded': le.transform(le.classes_)
    })
    print(f"Mapping for {col}:")
    display(mapping)

Mapping for Region:


Unnamed: 0,Original,Encoded
0,East,0
1,North,1
2,South,2
3,West,3


Mapping for Weather Condition:


Unnamed: 0,Original,Encoded
0,Cloudy,0
1,Snowy,1
2,Sunny,2


Mapping for Category:


Unnamed: 0,Original,Encoded
0,Clothing,0
1,Electronics,1
2,Furniture,2
3,Groceries,3
4,Toys,4


Mapping for Weekday:


Unnamed: 0,Original,Encoded
0,Friday,0
1,Monday,1
2,Saturday,2
3,Sunday,3
4,Thursday,4
5,Tuesday,5
6,Wednesday,6


In [8]:
# Estimate feature importance with the ReliefF algorithm. It is run on the
# encoded and scaled frame because it needs all feature values to be numeric.
# The target is `Promotion`, which the original analysis treats as a binary
# classification label.

# Target column, and the feature matrix without the target and the
# `Product_UID` identifier column
y = df_min_max_scaled['Promotion']
X = df_min_max_scaled.drop(columns=['Promotion', 'Product_UID'])

# Fit ReliefF while keeping every feature, so each one receives a score
relieff = ReliefF(n_neighbors=100, n_features_to_select=len(X.columns))
relieff.fit(X.to_numpy(), y.to_numpy())

# Label the scores with the feature names and print them, highest first
scores = pd.Series(data=relieff.feature_importances_, index=X.columns)
print("Feature Importance Scores (sorted):")
print(scores.sort_values(ascending=False))

Feature Importance Scores (sorted):
Discount              0.439715
Weekday               0.017474
Weather Condition     0.009553
Inventory Level      -0.003388
Region               -0.008124
Competitor Pricing   -0.009923
Price                -0.013206
Category             -0.017590
dtype: float64


In [9]:
# Persist the cleaned, encoded frame with its rescaled numeric columns for the
# later k-means step; index=False keeps the row index out of the CSV file.
df_min_max_scaled.to_csv(
    path_or_buf="../../Data/Transformed/inventory_management_cleaned_transformed_jaron.csv",
    index=False,
)