In [1]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
import mlflow
import mlflow.sklearn
import mlflow.sklearn

In [3]:
mlflow.set_tracking_uri("http://127.0.0.1:5000")

mlflow.set_experiment("RandomForest")

2025/11/26 15:57:59 INFO mlflow.tracking.fluent: Experiment with name 'RandomForest' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/649018504180659342', creation_time=1764165479494, experiment_id='649018504180659342', last_update_time=1764165479494, lifecycle_stage='active', name='RandomForest', tags={}>

In [4]:
import pandas as pd
import numpy as np

df = pd.read_csv("F:/DEPI Graduation Project/FMCG_2022_2024.csv")

In [5]:
df.duplicated().sum()

np.int64(0)

In [6]:
upper_cap = df['stock_available'].quantile(0.99)
df['stock_available_capped'] = np.where(df['stock_available'] > upper_cap, upper_cap, df['stock_available'])
lower_cap = df['delivered_qty'].quantile(0.01)  # 1st percentile
upper_cap = df['delivered_qty'].quantile(0.99)  # 99th percentile

df['delivered_qty_capped'] = df['delivered_qty'].clip(lower=lower_cap, upper=upper_cap)

In [7]:
df['units_sold_log'] = np.log1p(df['units_sold'])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
df['Total_price per transaction'] = df['price_unit'] * df['units_sold']
df['Total_price per transaction'] = df['Total_price per transaction'].astype(float)

In [9]:
df["sell_through_rate"] = (df["units_sold"] / df["stock_available"]) * 100

In [10]:
df['units_sold_lag_1'] = df['units_sold'].shift(1)
df['units_sold_lag_7'] = df['units_sold'].shift(7)
df['units_sold_lag_30'] = df['units_sold'].shift(30)
df['units_sold_log_lag_1'] = df['units_sold_log'].shift(1)
df['units_sold_log_lag_7'] = df['units_sold_log'].shift(7)
df['units_sold_log_lag_30'] = df['units_sold_log'].shift(30)
df = df.dropna().reset_index(drop=True)
df["units_sold_roll7"] = df.groupby("sku")["units_sold"].transform(lambda x: x.rolling(7, min_periods=1).mean())
df["units_sold_roll30"] = df.groupby("sku")["units_sold"].transform(lambda x: x.rolling(30, min_periods=1).mean())
df["units_sold_log_roll7"]  = df.groupby("sku")["units_sold_log"].transform(lambda x: x.rolling(7, min_periods=1).mean())
df["units_sold_log_roll30"] = df.groupby("sku")["units_sold_log"].transform(lambda x: x.rolling(30, min_periods=1).mean())
df["promo_lag1"] = df["promotion_flag"] * df["units_sold_lag_1"]
df["promo_lag7"] = df["promotion_flag"] * df["units_sold_lag_7"]
df["promo_price_interaction"] = df["promotion_flag"] * df["price_unit"]
df["promo_roll7"] = df["promotion_flag"] * df["units_sold_roll7"]
df["promo_roll30"] = df["promotion_flag"] * df["units_sold_roll30"]

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# -----------------------------
# 1. DROP LEAKAGE COLUMNS
# -----------------------------
leak_columns = [
    "units_sold",
    "Total_price per transaction",
    "sell_through_rate"
]

df = df.drop(columns=leak_columns)


In [12]:
# -----------------------------
# 2. FEATURE LISTS
# -----------------------------
categorical_cols = ["sku", "brand", "segment", "category", "channel", "region", "pack_type"]

numeric_cols = [
    "price_unit", "promotion_flag", "delivery_days",
    "units_sold_lag_1", "units_sold_lag_7", "units_sold_lag_30",
    "units_sold_log_lag_1", "units_sold_log_lag_7", "units_sold_log_lag_30",
    "units_sold_roll7", "units_sold_roll30",
    "units_sold_log_roll7", "units_sold_log_roll30",
    "promo_lag1", "promo_lag7", "promo_price_interaction",
    "promo_roll7", "promo_roll30","delivered_qty",
    "stock_available",
    "delivered_qty_capped",
    "stock_available_capped"
]


In [13]:
# -----------------------------
# 3. DEFINE X AND y
# -----------------------------
X = df[categorical_cols + numeric_cols]
y = df["units_sold_log"]

# HANDLE MISSING VALUES
X = X.fillna(0)


In [14]:
# -----------------------------
# 4. TIME-BASED SPLIT (NO SHUFFLE)
# -----------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=False
)


In [15]:
# -----------------------------
# 5. PREPROCESSOR
# -----------------------------
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ("num", StandardScaler(), numeric_cols)
], remainder='drop')


In [16]:
# -----------------------------
# 6. PIPELINE
# -----------------------------
pipeline = Pipeline([
    ("preprocess", preprocessor),

    ("model", RandomForestRegressor(
        n_estimators=300,
        max_depth=13,
        min_samples_split=3,
        min_samples_leaf=3,
        random_state=42,
        n_jobs=-1
    ))
])

In [17]:
with mlflow.start_run(run_name="RandomForest_pipeline"):
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    y_test_real = np.exp(y_test)
    y_pred_real = np.exp(y_pred)
    
    mae = mean_absolute_error(y_test_real, y_pred_real)
    rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_real))
    r2 = r2_score(y_test_real, y_pred_real)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("R2", r2)
    input_example = X_train.head(1)
    mlflow.sklearn.log_model(pipeline, name="RandomForest_pipeline")




üèÉ View run RandomForest_pipeline at: http://127.0.0.1:5000/#/experiments/649018504180659342/runs/ce374cfc562f4850aa7d2760ee945176
üß™ View experiment at: http://127.0.0.1:5000/#/experiments/649018504180659342


In [18]:
# -----------------------------
# 8. EVALUATE
# -----------------------------
y_pred = pipeline.predict(X_test)
y_test_real = np.exp(y_test)
y_pred_real = np.exp(y_pred)

mae = mean_absolute_error(y_test_real, y_pred_real)
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred_real))
r2 = r2_score(y_test_real, y_pred_real)

print("FINAL Pipeline MAE:", mae)
print("FINAL Pipeline RMSE:", rmse)
print("FINAL Pipeline R2:", r2)

FINAL Pipeline MAE: 3.0371212431948895
FINAL Pipeline RMSE: 4.170919626678901
FINAL Pipeline R2: 0.8126223023830561
