In [1]:
from src.data_ingestion.data_loader import IngestionFactory,DataLoader
from src.data_ingestion.data_validator import ValidationFactory,DataValidator
from src.data_ingestion.data_preprocessor import PreprocessingFactory
from src.features.customer_features import CustomerFeatureExtractor
from src.features.product_features import ProductFeatureExtractor
from src.features.training_data_builder import TrainingDataBuilder
from src.training.data_splitter import TemporalDataSplitter
from src.models.baseline_models import PopularityRecommender,PersonalFrequencyRecommender
from src.evaluation.metrics import RankingMetrics
from src.models.lightgbm_ranker import LightGBMRanker
from src.tuning.hyperparameter_tuning import HyperparameterTuner
from collections import Counter

In [18]:
# Ad-hoc inspection: read the raw orders CSV directly with pandas, bypassing
# the project's ingestion pipeline.
# NOTE(review): `df` is rebound to `raw.transactions` by the "Load CSV" cell,
# so this frame is only available until that cell runs.
import pandas as pd

df = pd.read_csv("./data/raw/data_raw.csv")

In [24]:
# Ad-hoc check for customer 5: rows grouped by product_name, with count()
# giving the number of non-null cells per column for each product. A column
# whose count is lower than its neighbours therefore contains missing values.
df[df['customer_id'] == 5].groupby('product_name').count()

Unnamed: 0_level_0,Unnamed: 0,order_id,order_item_id,order_date,order_time,order_total_price,order_gst,cart_order_time,product_variant,product_category,customer_id,first_order_date,last_order_date,total_orders,total_lifetime_value,customer_segment
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
(Entree) Pasta,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
(Main) Pasta,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Affogato,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Babychino,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16
Banana Bread,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
Big Breakfast,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Brownie,3,3,3,3,3,3,3,3,2,3,3,3,3,3,3,3
Build Your Own Breakfast,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
Burger,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
Cappuccino,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19,19


In [2]:
# Ingest the raw orders CSV through the project's loader abstraction.
RAW_CSV_PATH = 'data/raw/data_raw.csv'
DATE_COLUMNS = ["order_date", "first_order_date", "last_order_date"]

ingestion = IngestionFactory.create(
    source_type="csv",
    file_path=RAW_CSV_PATH,
    date_columns=DATE_COLUMNS,
)
loader = DataLoader(ingestion)

# The loader result exposes the order lines as `.transactions`; everything
# downstream works on that frame.
raw = loader.load()
df = raw.transactions

In [3]:
# Validation gate: the cells below must only run when report.is_valid is True.
validator = DataValidator(
    rules=ValidationFactory.default_rules(),   # Using ValidationFactory
    strict_mode=False                          # False = allow WARNING, fail only ERROR/CRITICAL
)
report = validator.validate(df)

# Enforce the gate explicitly so that "Run All" stops here on invalid data
# instead of silently continuing into preprocessing and training.
if not report.is_valid:
    raise ValueError(
        "Data validation failed (report.is_valid is False); "
        "fix the input data before running the remaining cells."
    )

# Keep the flag as the cell's displayed output.
report.is_valid

True

In [4]:
# Preprocessing: build the "sequence" preprocessor and apply it to the
# validated transactions.
# NOTE(review): min_orders=2 presumably drops customers with fewer than two
# orders — confirm against PreprocessingFactory.
preprocessor = PreprocessingFactory.create(method="sequence", min_orders=2)
prepared = preprocessor.transform(df)


In [5]:
# Customer feature extraction: derive customer profiles from the prepared data.
cust_ext = CustomerFeatureExtractor()
customer_profiles = cust_ext.extract(prepared)

In [6]:
# Product feature extraction: derive product features from the prepared data.
prod_ext = ProductFeatureExtractor()
product_features = prod_ext.extract(prepared)

In [7]:
# Build the training table from the prepared data plus both feature sets.
# NOTE(review): negative_ratio=5 presumably means five sampled negatives per
# positive example — confirm against TrainingDataBuilder.
builder = TrainingDataBuilder(negative_ratio=5)
training_data = builder.build(
    prepared_data=prepared,
    product_features=product_features,
    customer_profiles=customer_profiles,
)

In [8]:
# Temporal train/test split on order_date.
# Known issue: a customer_id present in the training set may be absent from
# the test set.
# NOTE(review): only test_ratio is set here, yet the result also exposes a
# validation frame (valid_df) — its size presumably comes from a splitter
# default; confirm.
splitter = TemporalDataSplitter(test_ratio=0.2)
split = splitter.split(training_data=training_data, date_column="order_date")

In [9]:
# Pull the three frames and the feature list out of the split result.
train_df, valid_df, test_df = split.train_df, split.valid_df, split.test_df
feature_names = split.feature_names

# Report the row count of each split, then a peek at the first five features.
for _label, _frame in (("Train:", train_df), ("Valid:", valid_df), ("Test:", test_df)):
    print(_label, len(_frame))
print("Feature_names", feature_names[:5])

Train: 251376
Valid: 36132
Test: 71958
Feature_names ['in_history', 'history_count', 'history_freq', 'orders_since_last_purchase', 'time_decay_score']


In [10]:
# Baselines: fit the popularity and personal-frequency recommenders on the
# training split.
# Each name is bound to whatever .fit() returns — presumably the fitted
# recommender itself; confirm.
pop_model = PopularityRecommender().fit(train_df, feature_names)
pf_model = PersonalFrequencyRecommender(smoothing=0.3).fit(train_df, feature_names)


In [11]:
# Score the test split with both baseline recommenders.
# NOTE(review): neither result is referenced by the other cells visible
# here — confirm they are needed.
test_scores_pop = pop_model.predict_df(test_df)
test_scores_pf = pf_model.predict_df(test_df)

In [12]:
# Evaluate the popularity baseline on the test split at cut-offs k = 1, 3, 5.
# NOTE(review): pf_model is fitted in an earlier cell but is not evaluated in
# the cells visible here.
evaluator = RankingMetrics(k_values=[1, 3, 5])

result_pop = evaluator.evaluate(pop_model, test_df, feature_names)
print(result_pop.metrics)

{'ndcg@1': 0.32479836128536677, 'ndcg@3': 0.48971438752825314, 'ndcg@5': 0.5613271110993573, 'hit_rate@1': 0.32479836128536677, 'hit_rate@3': 0.7349891179106388, 'hit_rate@5': 0.8973242862629625, 'precision@1': 0.32479836128536677, 'precision@3': 0.25404344300772413, 'precision@5': 0.19976955575470495, 'mrr': 0.5521613820650605}


In [None]:
# LightGBM ranker with a fixed, hand-picked configuration.
lgbm_params = {
    "num_leaves": 31,
    "learning_rate": 0.05,
    "num_boost_round": 300,
    "early_stopping_rounds": 30,
}
lgbm_model = LightGBMRanker(**lgbm_params)

# Fit on the training split (train_df must contain label, customer_id, order_idx).
# NOTE(review): the recorded log for this cell reports only `train's` metrics
# and runs all 300 rounds, so no validation split appears to reach the
# booster and early stopping may be ineffective. Check whether fit() accepts
# the validation frame (valid_df) — confirm against LightGBMRanker.
lgbm_model.fit(train_df=train_df, feature_names=feature_names)


Training until validation scores don't improve for 30 rounds
[100]	train's ndcg@1: 0.923496	train's ndcg@3: 0.917572	train's ndcg@5: 0.931921	train's ndcg@10: 0.944544
[200]	train's ndcg@1: 0.934574	train's ndcg@3: 0.924672	train's ndcg@5: 0.93832	train's ndcg@10: 0.950363
[300]	train's ndcg@1: 0.941238	train's ndcg@3: 0.929512	train's ndcg@5: 0.94276	train's ndcg@10: 0.954407


<src.models.lightgbm_ranker.LightGBMRanker at 0x1ac9940dca0>

In [14]:
from src.training.data_splitter import TemporalDataSplitter

from src.tuning.model_tuning_strategy import LightGBMTuningStrategy
from src.models.lightgbm_ranker import LightGBMRanker
from src.evaluation.metrics import RankingMetrics

In [15]:
# Hyper-parameter search for the LightGBM ranker: maximise ndcg@3 over at
# most 25 trials, with timeout=600 (presumably seconds — confirm).
tuner_settings = {
    "metric": "ndcg@3",
    "direction": "maximize",
    "n_trials": 25,
    "timeout": 600,
}
tuner = HyperparameterTuner(**tuner_settings)
strategy = LightGBMTuningStrategy()

# The tuner receives the whole split object together with the model strategy.
tuning_result = tuner.tune(split, strategy)

# NOTE(review): in the recorded log the trial values (~0.29) differ from the
# `valid's ndcg@3` printed during training (~0.94), so the reported best
# score may be computed differently from the booster's own validation
# metric — confirm before relying on the label below.
print("Best LightGBM params:", tuning_result.best_params)
print("Best Validation NDCG@3:", tuning_result.best_score)


[I 2025-12-06 13:04:44,885] A new study created in memory with name: no-name-09864180-cbbb-4468-b42b-aba8a60632ef


  0%|          | 0/25 [00:00<?, ?it/s]

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]	train's ndcg@1: 0.9164	train's ndcg@3: 0.91419	train's ndcg@5: 0.928724	train's ndcg@10: 0.941608	valid's ndcg@1: 0.934811	valid's ndcg@3: 0.935803	valid's ndcg@5: 0.948351	valid's ndcg@10: 0.955722
[I 2025-12-06 13:04:46,196] Trial 0 finished with value: 0.29209941588917426 and parameters: {'num_leaves': 23, 'learning_rate': 0.18227472944261908, 'feature_fraction': 0.6276054697470599, 'bagging_fraction': 0.9662338942474383, 'min_data_in_leaf': 17, 'lambda_l1': 0.9420264637121939, 'lambda_l2': 0.39487256988864083}. Best is trial 0 with value: 0.29209941588917426.
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	train's ndcg@1: 0.920338	train's ndcg@3: 0.916159	train's ndcg@5: 0.930982	train's ndcg@10: 0.943653	valid's ndcg@1: 0.937903	valid's ndcg@3: 0.937888	valid's ndcg@5: 0.949568	valid's ndcg@10: 0.957414
[I 2025-12-06 13:04:47,625

In [16]:
# Display the best hyper-parameter set found by the tuning run.
tuning_result.best_params

{'num_leaves': 56,
 'learning_rate': 0.0787818480520034,
 'feature_fraction': 0.8083715157277301,
 'bagging_fraction': 0.69608932398183,
 'min_data_in_leaf': 32,
 'lambda_l1': 0.9584683140057209,
 'lambda_l2': 0.21403171721001724}