In [4]:
!pip uninstall -y torch torchvision torchaudio
!pip install --upgrade \
    torch torchvision torchaudio \
    --index-url https://download.pytorch.org/whl/cu118

!pip install lightfm implicit
!pip install --upgrade sentence-transformers
!pip install --upgrade tensorflow

Found existing installation: torch 2.7.0
Uninstalling torch-2.7.0:
  Successfully uninstalled torch-2.7.0
Found existing installation: torchvision 0.21.0+cu124
Uninstalling torchvision-0.21.0+cu124:
  Successfully uninstalled torchvision-0.21.0+cu124
Found existing installation: torchaudio 2.6.0+cu124
Uninstalling torchaudio-2.6.0+cu124:
  Successfully uninstalled torchaudio-2.6.0+cu124
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu118/torchvision-0.22.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu118/torchaudio-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.6 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pyto

^C
^C
^C


In [2]:
# in a Colab cell, *before* any imports:
!pip install --upgrade implicit lightfm




In [6]:
# === 1. INSTALL & IMPORTS ===
# In Colab, run before any imports:
# !pip install --upgrade implicit lightfm sentence-transformers tensorflow torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel

from implicit.als import AlternatingLeastSquares
from lightfm import LightFM

from sentence_transformers import SentenceTransformer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Embedding
from scipy.sparse import csr_matrix
import joblib
from datetime import datetime

# === 2. DATA LOADING ===
def load_data():
    """Load all datasets and parse timestamps"""
    products = pd.read_csv('products.csv')
    variants = pd.read_csv('variants.csv')
    orders = pd.read_csv('orders.csv')
    order_items = pd.read_csv('order_items.csv')
    cart_events = pd.read_csv('cart_events.csv')
    product_categories = pd.read_csv('product_categories.csv')

    # convert timestamps
    orders['created_at'] = pd.to_datetime(orders['created_at'])
    cart_events['created_at'] = pd.to_datetime(cart_events['created_at'])

    # Merge product data
    product_variants = (
        pd.merge(pd.merge(products, variants, on='product_id'),
                 product_categories, on='product_id')
        .groupby('variant_id')
        .agg({
            'name': 'first',
            'description': 'first',
            'color': 'first',
            'size': 'first',
            'price': 'first',
            'stock': 'first',
            'category_id': lambda x: list(x.unique())
        })
        .reset_index()
    )

    # Create interaction datasets
    user_purchases = pd.merge(
        pd.merge(orders, order_items, on='order_id'),
        product_variants,
        on='variant_id'
    )

    user_carts = pd.merge(
        cart_events,
        product_variants,
        on='variant_id'
    )

    return product_variants, user_purchases, user_carts

# === 3. CONTENT-BASED RECOMMENDER ===
class ContentRecommender:
    def __init__(self, product_variants):
        self.products = product_variants
        self.encoder = OneHotEncoder(handle_unknown='ignore')
        self.scaler = MinMaxScaler()
        self.text_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

        cat = self.encoder.fit_transform(product_variants[['color', 'size']])
        num = self.scaler.fit_transform(product_variants[['price']])
        txt = self.text_model.encode(product_variants['description'].fillna(''))

        # store for profile building
        self.text_embeddings = txt
        self.variant_to_index = { vid: idx for idx, vid in enumerate(product_variants['variant_id']) }
        self.feature_matrix = np.hstack([cat.toarray(), num, txt])

    def recommend(self, user_profile, n=50):
        sims = linear_kernel([user_profile], self.feature_matrix)[0]
        idx = np.argsort(sims)[-n:][::-1]
        return self.products.iloc[idx]['variant_id'].tolist()

# === 4. COLLABORATIVE FILTERING ===
class CollaborativeFiltering:
    def __init__(self, user_purchases):
        cats_u = user_purchases['user_id'].astype('category')
        cats_i = user_purchases['variant_id'].astype('category')
        u_codes = cats_u.cat.codes.values
        i_codes = cats_i.cat.codes.values
        self.user_index = cats_u.cat.categories
        self.item_index = cats_i.cat.categories
        n_users = len(self.user_index)
        n_items = len(self.item_index)

        interactions = csr_matrix(
            (user_purchases['quantity'].values, (u_codes, i_codes)),
            shape=(n_users, n_items)
        )
        self.interactions = interactions

        # Implicit ALS
        self.als = AlternatingLeastSquares(factors=64, regularization=0.01)
        self.als.fit(interactions)

        # LightFM (WARP loss)
        self.lfm = LightFM(loss='warp')
        self.lfm.fit(interactions, epochs=20, num_threads=4)

    def als_recommend(self, user_id, n=50):
        try:
            uidx = self.user_index.get_loc(user_id)
            user_vec = self.interactions[uidx]
            item_codes, _ = self.als.recommend(uidx, user_vec, N=n)
            return self.item_index[item_codes].tolist()
        except KeyError:
            return []

    def lfm_recommend(self, user_id, n=50):
        try:
            uidx = self.user_index.get_loc(user_id)
            scores = self.lfm.predict(uidx, np.arange(self.interactions.shape[1]))
            top = np.argsort(-scores)[:n]
            return self.item_index[top].tolist()
        except KeyError:
            return []

# === 5. SESSION-BASED RECOMMENDER ===
class SessionRecommender:
    def __init__(self, cart_events):
        self.model = Sequential([
            Embedding(input_dim=len(cart_events['variant_id'].unique())+1, output_dim=64),
            GRU(128),
            Dense(len(cart_events['variant_id'].unique()), activation='softmax')
        ])
        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        # TODO: prepare session sequences

    def recommend(self, session_history, n=10):
        return []

# === 6. HYBRID RECOMMENDER ===
class HybridRecommender:
    def __init__(self, product_variants, user_purchases, user_carts):
        self.products = product_variants
        self.user_purchases = user_purchases
        self.user_carts = user_carts
        self.content_model = ContentRecommender(product_variants)
        self.collab_model = CollaborativeFiltering(user_purchases)
        self.session_model = SessionRecommender(user_carts)

    def create_user_profile(self, user_id):
        bought = self.user_purchases[self.user_purchases['user_id'] == user_id].copy()
        carted = self.user_carts[self.user_carts['user_id'] == user_id].copy()
        interactions = pd.concat([bought, carted], ignore_index=True)
        if interactions.empty:
            return np.zeros(self.content_model.feature_matrix.shape[1])

        now = datetime.now()
        bought.loc[:, 'weight'] = np.exp(-(now - bought['created_at']).dt.days / 14)
        carted.loc[:, 'weight'] = 0.7 * np.exp(-(now - carted['created_at']).dt.days / 7)
        interactions = pd.concat([bought, carted], ignore_index=True)

        feats_cat = self.content_model.encoder.transform(interactions[['color', 'size']]).toarray()
        feats_num = self.content_model.scaler.transform(interactions[['price']])
        idxs = interactions['variant_id'].map(self.content_model.variant_to_index).values
        feats_txt = self.content_model.text_embeddings[idxs]

        feats = np.hstack([feats_cat, feats_num, feats_txt])
        return np.average(feats, axis=0, weights=interactions['weight'])

    def recommend(self, user_id, session_history=None,
                 content_weight=0.4, als_weight=0.3, lfm_weight=0.2, session_weight=0.1,
                 diversity_penalty=0.3, price_tier_penalty=0.5):
        profile = self.create_user_profile(user_id)
        recs = {
            'content': self.content_model.recommend(profile),
            'als': self.collab_model.als_recommend(user_id),
            'lfm': self.collab_model.lfm_recommend(user_id),
            'session': self.session_model.recommend(session_history or [])
        }
        scores = defaultdict(float)
        for source, weight in [('content', content_weight), ('als', als_weight), ('lfm', lfm_weight), ('session', session_weight)]:
            for i, vid in enumerate(recs[source]):
                scores[vid] += weight * (1.0 / (i + 1))

        sorted_scores = sorted(scores.items(), key=lambda x: -x[1])
        return [vid for vid, _ in sorted_scores][:10]

# === 7. EVALUATION & COMPARISON ===

def evaluate_recommendations(model, test_users, k=10):
    metrics = {'hit_rate': 0, 'precision@k': [], 'recall@k': [], 'coverage': set(), 'diversity': [], 'conversion_rate': []}
    for uid in test_users:
        bought = model.user_purchases[model.user_purchases['user_id'] == uid]['variant_id'].tolist()
        if not bought:
            continue
        recs = model.recommend(uid)[:k]
        hit = set(recs) & set(bought)
        metrics['hit_rate'] += int(bool(hit))
        metrics['precision@k'].append(len(hit) / k)
        metrics['recall@k'].append(len(hit) / len(bought))
        metrics['coverage'].update(recs)
        cats = model.products[model.products['variant_id'].isin(recs)]['category_id'].explode().nunique()
        metrics['diversity'].append(cats / k)
        metrics['conversion_rate'].append(len(hit) / k)
    return {
        'users_tested': len(test_users),
        'hit_rate': metrics['hit_rate'] / len(test_users),
        'precision@k': np.nanmean(metrics['precision@k']),
        'recall@k': np.nanmean(metrics['recall@k']),
        'coverage': len(metrics['coverage']) / model.products['variant_id'].nunique(),
        'diversity': np.nanmean(metrics['diversity']),
        'conversion_rate': np.nanmean(metrics['conversion_rate'])
    }

class ModelComparator:
    def __init__(self): self.results = {}
    def add_model(self, name, model): self.results[name] = model
    def compare(self, users): return {n: evaluate_recommendations(m, users) for n, m in self.results.items()}

# === 8. TESTING ===
if __name__ == "__main__":
    pv, up, uc = load_data()
    base = HybridRecommender(pv, up, uc)
    enh = HybridRecommender(pv, up, uc)
    comp = ModelComparator()
    comp.add_model('Baseline', base)
    comp.add_model('Enhanced', enh)
    test_users = up['user_id'].drop_duplicates().sample(100, random_state=42)
    report = comp.compare(test_users)
    print("=== MODEL COMPARISON ===")
    for name, m in report.items():
        print(f"\n{name}:")
        print(f"Hit Rate:      {m['hit_rate']:.2%}")
        print(f"Precision@10:  {m['precision@k']:.2%}")
        print(f"Recall@10:     {m['recall@k']:.2%}")
        print(f"Coverage:      {m['coverage']:.2%}")
        print(f"Diversity:     {m['diversity']:.2%}")
        print(f"Conv. Rate:    {m['conversion_rate']:.2%}")


  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

=== MODEL COMPARISON ===

Baseline:
Hit Rate:      78.00%
Precision@10:  11.10%
Recall@10:     8.01%
Coverage:      44.18%
Diversity:     77.00%
Conv. Rate:    11.10%

Enhanced:
Hit Rate:      76.00%
Precision@10:  11.10%
Recall@10:     8.02%
Coverage:      45.20%
Diversity:     75.90%
Conv. Rate:    11.10%
