In [None]:
!pip install scikit-surprise  # Correct package name
!pip install --upgrade numpy  # Ensure numpy compatibility

In [27]:
# === 2. IMPORTS ===
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics.pairwise import linear_kernel
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
import joblib
from datetime import datetime

# === 3. DATA LOADING & PREPROCESSING ===
# Load all datasets
products = pd.read_csv('products.csv')
variants = pd.read_csv('variants.csv')
orders = pd.read_csv('orders.csv')
order_items = pd.read_csv('order_items.csv')
cart_events = pd.read_csv('cart_events.csv')
product_categories = pd.read_csv('product_categories.csv')

# Convert date columns
orders['created_at'] = pd.to_datetime(orders['created_at'])
cart_events['created_at'] = pd.to_datetime(cart_events['created_at'])

# Merge product data
product_variants = pd.merge(
    pd.merge(products, variants, on='product_id'),
    product_categories,
    on='product_id'
).groupby('variant_id').agg({
    'color': 'first',
    'size': 'first',
    'price': 'first',
    'stock': 'first',
    'category_id': lambda x: list(x.unique())
}).reset_index()

# Create user interaction datasets
user_purchases = pd.merge(
    pd.merge(orders, order_items, on='order_id'),
    product_variants,
    on='variant_id'
)

user_carts = pd.merge(
    cart_events,
    product_variants,
    on='variant_id'
)

# === 4. GLOBAL FEATURE ENGINEERING ===
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(product_variants[['color', 'size']])

scaler = MinMaxScaler()
scaler.fit(product_variants[['price']])

# Create feature matrix
categorical_features = encoder.transform(product_variants[['color', 'size']])
numerical_features = scaler.transform(product_variants[['price']])
feature_matrix = np.hstack([categorical_features, numerical_features])

# === 5. CONTENT-BASED COMPONENTS (UPDATED) ===
def create_user_profile(user_id):
    """Create temporal-aware user profile"""
    purchased = user_purchases[user_purchases['user_id'] == user_id].copy()
    carted = user_carts[user_carts['user_id'] == user_id].copy()

    if purchased.empty and carted.empty:
        return np.zeros(feature_matrix.shape[1])

    # Temporal weighting
    current_date = datetime.now()

    # Purchase recency (14-day half-life)
    purchased['days_ago'] = (current_date - purchased['created_at']).dt.days
    purchased_weights = np.exp(-purchased['days_ago']/14)

    # Cart recency (7-day half-life)
    carted['days_ago'] = (current_date - carted['created_at']).dt.days
    carted_weights = 0.7 * np.exp(-carted['days_ago']/7)

    interactions = pd.concat([purchased, carted])
    weights = np.concatenate([purchased_weights, carted_weights])

    # Transform features
    categorical = encoder.transform(interactions[['color', 'size']])
    numerical = scaler.transform(interactions[['price']])
    features = np.hstack([categorical, numerical])

    return np.average(features, axis=0, weights=weights)

def recommend_content(user_profile, n=50):
    """Content-based recommendations using linear kernel"""
    if np.all(user_profile == 0):
        return []

    similarities = linear_kernel([user_profile], feature_matrix)[0]
    return product_variants.iloc[np.argsort(similarities)[::-1][:n]]['variant_id'].tolist()

# === 6. COLLABORATIVE FILTERING (ENHANCED) ===
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(
    user_purchases[['user_id', 'variant_id', 'quantity']],
    reader
)

trainset, testset = train_test_split(data, test_size=0.25)
algo = SVD(
    n_factors=100,
    n_epochs=30,
    lr_all=0.007,
    reg_all=0.1,
    random_state=42
)
algo.fit(trainset)

def recommend_collab(user_id, n=50):
    """Collaborative filtering recommendations with fallback"""
    try:
        testset = [[user_id, variant, 0] for variant in product_variants['variant_id']]
        predictions = algo.test(testset)
        return [pred.iid for pred in sorted(predictions, key=lambda x: x.est, reverse=True)[:n]]
    except Exception as e:
        return []

# === 7. HYBRID RECOMMENDATION SYSTEM (FINAL) ===
def hybrid_recommendation(user_id,
                         content_weight=0.6,
                         collab_weight=0.25,
                         base_diversity_penalty=0.4,
                         novelty_strength=0.35,
                         exploration_rate=0.15,
                         size_penalty=0.2,
                         color_penalty=0.3):
    """Enhanced hybrid recommendations with multi-factor diversity"""
    # Get user data
    user_history = user_purchases[user_purchases['user_id'] == user_id]
    user_cart_history = user_carts[user_carts['user_id'] == user_id]

    # ===== 1. COLD-START HANDLING =====
    if len(user_history) < 3:
        # Use cart data for better cold-start
        cart_cats = user_cart_history['category_id'].explode().unique()
        user_cats = list(user_history['category_id'].explode().unique()) + list(cart_cats)
        user_price = user_history['price'].mean() if not user_history.empty else None

        # Price-aware cold-start
        price_filter = product_variants['price'] <= (user_price * 1.5 if user_price else 100)
        relevant_items = product_variants[
            (product_variants['category_id'].apply(lambda x: any(c in user_cats for c in x))) &
            price_filter
        ]

        return list(set(
            user_purchases['variant_id'].value_counts().head(5).index.tolist() +
            relevant_items.sample(5)['variant_id'].tolist()
        ))[:10]

    # ===== 2. USER CONTEXT =====
    profile = create_user_profile(user_id)
    content_recs = recommend_content(profile)
    collab_recs = recommend_collab(user_id)

    # Calculate user's price tier
    user_avg_price = user_history['price'].mean()
    price_tiers = [
        (0, 50),
        (50, 150),
        (150, float('inf'))
    ]
    user_tier = next(i for i, (low, high) in enumerate(price_tiers) if low <= user_avg_price < high)

    # ===== 3. SCORING WITH ENHANCED DIVERSITY =====
    variant_data = product_variants.set_index('variant_id')
    scores = defaultdict(float)
    seen_categories = set()
    seen_sizes = set()
    seen_colors = set()

    # Process content recommendations
    for i, variant in enumerate(content_recs):
        # Adaptive diversity penalties
        diversity_penalty = min(
            base_diversity_penalty * (i/len(content_recs)),
            0.6  # Max penalty
        )

        categories = variant_data.loc[variant, 'category_id']
        price = variant_data.loc[variant, 'price']
        size = variant_data.loc[variant, 'size']
        color = variant_data.loc[variant, 'color']

        score = (len(content_recs) - i) * content_weight

        # Apply diversity penalties
        if any(c in seen_categories for c in categories):
            score *= (1 - diversity_penalty)
        if size in seen_sizes:
            score *= (1 - size_penalty)
        if color in seen_colors:
            score *= (1 - color_penalty)

        # Price tier penalty
        variant_tier = next(i for i, (low, high) in enumerate(price_tiers) if low <= price < high)
        if variant_tier != user_tier:
            score *= 0.7

        # Novelty boost
        popularity = user_purchases['variant_id'].value_counts(normalize=True).get(variant, 0)
        score *= (1 + (1 - popularity) * novelty_strength)

        scores[variant] += score
        seen_categories.update(categories)
        seen_sizes.add(size)
        seen_colors.add(color)

    # Process collaborative recommendations
    seen_categories = set()
    seen_sizes = set()
    seen_colors = set()

    for i, variant in enumerate(collab_recs):
        diversity_penalty = min(
            base_diversity_penalty * (i/len(collab_recs)),
            0.6
        )

        categories = variant_data.loc[variant, 'category_id']
        price = variant_data.loc[variant, 'price']
        size = variant_data.loc[variant, 'size']
        color = variant_data.loc[variant, 'color']

        score = (len(collab_recs) - i) * collab_weight

        if any(c in seen_categories for c in categories):
            score *= (1 - diversity_penalty)
        if size in seen_sizes:
            score *= (1 - size_penalty)
        if color in seen_colors:
            score *= (1 - color_penalty)

        variant_tier = next(i for i, (low, high) in enumerate(price_tiers) if low <= price < high)
        if variant_tier != user_tier:
            score *= 0.7

        popularity = user_purchases['variant_id'].value_counts(normalize=True).get(variant, 0)
        score *= (1 + (1 - popularity) * novelty_strength)

        scores[variant] += score
        seen_categories.update(categories)
        seen_sizes.add(size)
        seen_colors.add(color)

    # ===== 4. RE-RANKING & EXPLORATION =====
    # Get top candidates
    candidates = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:25]

    # Re-rank with diversity
    final_recs = []
    seen_cats = set()
    seen_sizes = set()
    seen_colors = set()

    for variant, score in candidates:
        cats = variant_data.loc[variant, 'category_id']
        size = variant_data.loc[variant, 'size']
        color = variant_data.loc[variant, 'color']

        diversity_score = (
            0.4 * len(set(cats) - seen_cats) +
            0.3 * (size not in seen_sizes) +
            0.3 * (color not in seen_colors)
        )

        final_recs.append((variant, score + diversity_score))

    # Sort by combined score
    final_recs = sorted(final_recs, key=lambda x: x[1], reverse=True)[:8]
    final_recs = [v[0] for v in final_recs]

    # Add exploration
    exploration_candidates = product_variants[
        (product_variants['stock'] > 0) &
        (product_variants['price'].between(user_avg_price*0.8, user_avg_price*1.2))
    ].sample(frac=0.1)

    if not exploration_candidates.empty:
        exploration_items = exploration_candidates['variant_id'].tolist()[:2]
        final_recs += exploration_items

    return final_recs[:10]

# === 8. HYPERPARAMETER TUNING ===
def tune_parameters(test_users, param_grid, num_combinations=50):
    """Grid search for optimal parameters"""
    best_score = -np.inf
    best_params = {}
    results = []

    # Create smaller grid if needed
    grid = list(ParameterGrid(param_grid))[:num_combinations]

    for i, params in enumerate(grid):
        print(f"\nTesting combination {i+1}/{len(grid)}: {params}")

        # Temporary recommendation function with current params
        def tuned_recommender(user_id):
            return hybrid_recommendation(
                user_id,
                content_weight=params.get('content_weight', 0.7),
                collab_weight=params.get('collab_weight', 0.3),
                base_diversity_penalty=params.get('base_diversity_penalty', 0.4),
                novelty_strength=params.get('novelty_strength', 0.3),
                exploration_rate=params.get('exploration_rate', 0.1)
            )

        # Evaluate with current parameters
        performance = evaluate_recommendations(test_users, recommender=tuned_recommender)

        # Calculate composite score (adjust weights as needed)
        composite_score = (
            0.4 * performance['precision@k'] +
            0.3 * performance['recall@k'] +
            0.2 * performance['coverage'] +
            0.1 * performance['diversity']
        )

        results.append({
            'params': params,
            'performance': performance,
            'score': composite_score
        })

        if composite_score > best_score:
            best_score = composite_score
            best_params = params
            print(f"New best score: {best_score:.3f}")

    # Save results
    joblib.dump(results, 'tuning_results.pkl')
    joblib.dump(best_params, 'best_params.pkl')

    return best_params, results

# Define parameter grid
param_grid = {
    'content_weight': [0.6, 0.7, 0.8],
    'collab_weight': [0.2, 0.25, 0.3],
    'base_diversity_penalty': [0.3, 0.4, 0.5],
    'novelty_strength': [0.25, 0.3, 0.35],
    'exploration_rate': [0.1, 0.15, 0.2]
}

# Get active users for tuning
tune_users = user_purchases['user_id'].value_counts().index.tolist()[:200]

# === 9. EVALUATION & TESTING ===
def evaluate_recommendations(test_users, k=10, recommender=None):
    """Calculate performance metrics"""
    metrics = {
        'hit_rate': 0,
        'precision@k': [],
        'recall@k': [],
        'coverage': set(),
        'diversity': []
    }

    total_variants = product_variants['variant_id'].nunique()

    # Use custom recommender if provided
    recommend_func = recommender or hybrid_recommendation

    for user_id in test_users:
        purchased = user_purchases[user_purchases['user_id'] == user_id]['variant_id'].tolist()
        if not purchased:
            continue

        recs = recommend_func(user_id)[:k]
        relevant = set(recs) & set(purchased)

        # Basic metrics
        metrics['hit_rate'] += 1 if relevant else 0
        metrics['precision@k'].append(len(relevant)/k)
        metrics['recall@k'].append(len(relevant)/len(purchased))

        # Coverage
        metrics['coverage'].update(recs)

        # Diversity
        categories = product_variants[product_variants['variant_id'].isin(recs)]['category_id'].explode().nunique()
        metrics['diversity'].append(categories/k)

    return {
        'users_tested': len(test_users),
        'hit_rate': metrics['hit_rate']/len(test_users),
        'precision@k': np.nanmean(metrics['precision@k']),
        'recall@k': np.nanmean(metrics['recall@k']),
        'coverage': len(metrics['coverage'])/total_variants,
        'diversity': np.nanmean(metrics['diversity'])
    }

def test_system(use_tuned_params=True):
    """Run full test and display results"""
    # Load best parameters
    if use_tuned_params:
        try:
            best_params = joblib.load('best_params.pkl')
            print("Using tuned parameters:", best_params)
        except:
            print("No tuned parameters found, using defaults")
            best_params = {}
    else:
        best_params = {}

    # Get active users with purchase history
    active_users = user_purchases['user_id'].value_counts().index.tolist()[:100]

    def tuned_recommender(user_id):
      return hybrid_recommendation(
          user_id,
          **best_params
      )

    # Run evaluation
    performance = evaluate_recommendations(active_users, recommender=tuned_recommender)

    # Print metrics
    print("\n=== PERFORMANCE REPORT ===")
    print(f"Users Tested: {performance['users_tested']}")
    print(f"Hit Rate: {performance['hit_rate']:.2%}")
    print(f"Precision@10: {performance['precision@k']:.2%}")
    print(f"Recall@10: {performance['recall@k']:.2%}")
    print(f"Catalog Coverage: {performance['coverage']:.2%}")
    print(f"Category Diversity: {performance['diversity']:.2f}/1.0")

    # Show sample recommendations
    print("\n=== SAMPLE RECOMMENDATIONS ===")
    for user_id in active_users[:3]:
        recs = hybrid_recommendation(user_id)[:10]
        details = product_variants[product_variants['variant_id'].isin(recs)][['variant_id', 'color', 'size', 'price']]
        print(f"\nUser {user_id} recommendations:")
        print(details.to_string(index=False))

# === 10. RUN THE TEST ===
if __name__ == "__main__":
    # # 1. Initial quick search
    # tune_parameters(tune_users, param_grid, num_combinations=20)

    # # 2. Refine grid around promising values
    # refined_grid = {
    #     'content_weight': [0.68, 0.7, 0.72],
    #     'base_diversity_penalty': [0.38, 0.4, 0.42],
    #     'exploration_rate': [0.12, 0.15, 0.18]
    # }
    # tune_parameters(tune_users, refined_grid, num_combinations=30)
    test_system(use_tuned_params=True)

Using tuned parameters: {'base_diversity_penalty': 0.38, 'content_weight': 0.72, 'exploration_rate': 0.12}

=== PERFORMANCE REPORT ===
Users Tested: 100
Hit Rate: 54.00%
Precision@10: 7.50%
Recall@10: 3.46%
Catalog Coverage: 39.49%
Category Diversity: 0.74/1.0

=== SAMPLE RECOMMENDATIONS ===

User 433c6a36-1ada-40ea-a8a1-8ed56faba54f recommendations:
                          variant_id color size  price
22a3a856-63b4-48ba-8864-8e5f4ed425c8  Grey    S  34.99
2b9995a3-05b6-41cb-b935-92ede22c2846  Grey   2L  10.99
2d1efbb9-7a7a-4ca9-b766-4dcc8fffd32e  Grey   2L  29.99
3ba3727a-c4e3-438f-9e17-a5de4981d840  Grey    S  39.99
51ed9279-1aec-4bff-af22-74af285a7309 Black   2L  69.99
803a6f4b-e30b-41e3-8dbe-a0f4607bb262 Black   2L  15.99
b9d12bb9-40d6-4d6f-91f3-8bd49156c99b Black   2L  13.99
be06fd03-469f-48d3-b0ba-759238cd79ef  Grey   2L  12.99
d3d83d15-8be1-447b-bae5-9bd96bcc7208  Grey  XXL  29.99
d56099a4-7de9-4185-85c9-78b95fb11007  Grey   2L  21.99

User 865014d0-2c41-4099-831b-918bda466a4d