
# Offer Engagement Prediction

## 1. Business Objective (The "Why")
In a high-volume e-commerce environment, users are bombarded with offers. Indiscriminate targeting leads to:
*   **User Fatigue**: Users ignore notifications or churn.
*   **Wasted Budget**: Incentives are given to users who wouldn't convert or aren't interested.
*   **Missed Revenue**: High-intent users miss relevant offers buried in noise.

**The Goal**: Move from "Broadcast" to "Precision" targeting. We aim to build an AI model that predicts the **exact probability** (`0.0` to `1.0`) that a specific user will click on a specific offer.

## 2. Technical Approach (The "How")
To solve this, we treat the problem as a **Binary Classification** task (`is_clicked` = 1 vs 0).

*   **Session-Awareness**: We don't just look at "who the user is" (demographics), but "what they are doing right now". We link **Retail** behavior (browsing items) with **Offer** impressions in real-time.
*   **Point-in-Time Correctness**: We strictly observe the timeline. We only use information available *before* the offer was shown to prevent data leakage.
*   **Gradient Boosting**: We use **LightGBM**, a state-of-the-art algorithm for tabular data, optimized for speed and accuracy on large datasets.

## 3. Success Metrics
We will evaluate the model not just on accuracy, but on **Business Impact**:
*   **Lift at Top 10%**: If we only target the top 10% of user–offer impressions ranked by the model, how many times higher is their click rate than the overall (random-targeting) click rate?
*   **Calibration (Log Loss)**: Can the predicted probabilities be trusted at face value? (e.g., of the impressions we score at a 20% click probability, are roughly 20% actually clicked?)


In [None]:

# Install dependencies if needed
# !pip install lightgbm shap pandas pyarrow scikit-learn matplotlib seaborn ipywidgets
import os
import gc
import pandas as pd
import numpy as np
import lightgbm as lgb
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from sklearn.metrics import log_loss, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

# Configuration
# Directory the cleaned parquet inputs are read from (see load_data).
CLEANED_DATA_DIR = "cleaned_data"
# Output directory for model artifacts; created eagerly, and exist_ok=True keeps
# notebook re-runs from failing when it already exists.
OUTPUT_DIR = "models/offer_ctr_model"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# Single seed reused for pandas sampling and the LightGBM 'seed' parameter.
SEED = 42

# Plotting style
# Applied globally: seaborn grid theme plus a default size for figures that do
# not pass their own figsize.
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)


In [None]:

def load_data():
    """Load the three cleaned parquet tables used by the notebook.

    Files are read from ``CLEANED_DATA_DIR``.

    Returns:
        tuple: ``(users, retail_events, offers_events)`` DataFrames. Both event
        tables are sorted ascending by ``timestamp``; ``retail_events`` holds
        only ``user_id``, ``timestamp``, ``item_id`` and ``subdomain``.
    """
    print("Loading datasets...")
    # Load Users
    users = pd.read_parquet(os.path.join(CLEANED_DATA_DIR, 'users_clean.parquet'))

    # Load Retail Events (Context)
    # We need this to understand what the user was doing *before* the offer.
    # Only four columns are ever used, so project them at read time instead of
    # materialising the full event log and discarding the rest afterwards.
    retail_columns = ['user_id', 'timestamp', 'item_id', 'subdomain']
    retail_events = pd.read_parquet(
        os.path.join(CLEANED_DATA_DIR, 'retail_events_clean.parquet'),
        columns=retail_columns,
    ).sort_values('timestamp')

    # Load Offers Events (Target)
    offers_events = pd.read_parquet(os.path.join(CLEANED_DATA_DIR, 'offers_events_clean.parquet'))
    offers_events = offers_events.sort_values('timestamp')

    print(f"Users: {len(users):,}")
    print(f"Retail Events: {len(retail_events):,}")
    print(f"Offers Events: {len(offers_events):,}")

    return users, retail_events, offers_events

# Load all three tables once at notebook scope; later cells consume these frames.
users, retail_events, offers_events = load_data()


In [None]:

def define_target(df):
    """Add the binary ``is_clicked`` label to ``df`` in place and return it.

    A row is positive (1) when its ``action_type`` is an explicit engagement
    ('redirect_to_partner' or 'like'). Every other action, including plain
    impressions such as 'seen' and 'offer_shown', is negative (0), because the
    model is meant to predict who will actually CLICK.

    Args:
        df: Offer events with an ``action_type`` column.

    Returns:
        The same DataFrame object, now carrying an integer ``is_clicked`` column.
    """
    print("Defining Target Variable...")

    # Actions that count as a real click-through.
    click_actions = ['redirect_to_partner', 'like']
    clicked_mask = df['action_type'].isin(click_actions)
    df['is_clicked'] = clicked_mask.astype(int)

    # The mean of a 0/1 label is the click-through rate.
    print(f"Global CTR (Strict Click-Through): {df['is_clicked'].mean():.2%}")
    return df

# Attach the binary is_clicked label; define_target returns the frame it was given.
offers_events = define_target(offers_events)


In [None]:

def build_features(offers, retail, users):
    """Assemble the modelling table: one row per offer event plus features.

    All history-based features are point-in-time: each row only sees events
    that appear earlier in ``offers`` / ``retail``.

    Args:
        offers: Offer events with ``user_id``, ``item_id``, ``timestamp`` (a
            Timedelta) and ``is_clicked``. Must already be in chronological
            order, because the running counts below follow row order. Feature
            columns are added to this frame in place.
        retail: Retail events with ``user_id``, ``timestamp`` and ``subdomain``.
        users: User table with ``user_id``, ``socdem_cluster`` and ``region``.

    Returns:
        DataFrame ordered by ``timestamp`` containing the offer columns plus
        ``hour``, ``day_of_week``, ``offer_popularity``, ``user_hist_ctr``,
        ``timestamp_retail``, ``seconds_since_retail``, ``has_retail_activity``,
        ``last_retail_subdomain``, ``socdem_cluster`` and ``region``.
    """
    print("Building Features...")

    # 1. Temporal Features
    # Note: 'timestamp' is a Timedelta (duration), not a Datetime.
    # We extract the hour of the day (0-23) and a cyclic day index (0-6).
    offers['hour'] = (offers['timestamp'].dt.seconds // 3600).astype(int)
    offers['day_of_week'] = (offers['timestamp'].dt.days % 7).astype(int)

    # 2. Item Features (CRITICAL: The model needs to know WHICH offer is shown)
    # Convert item_id to categorical for LightGBM
    print("  Encoding Offer IDs...")
    offers['item_id'] = offers['item_id'].astype('category')

    # Offer Popularity: running number of events for this offer so far,
    # counting the current row (first event of an offer -> 1.0).
    # cumcount is the vectorised equivalent of a per-group expanding count.
    print("  Computing Offer Popularity...")
    offers['offer_popularity'] = (
        offers.groupby('item_id', observed=True).cumcount() + 1
    ).astype(float)

    # 3. Historical CTR: the user's click rate over their *previous* events
    # only (the current row is excluded to avoid leaking the label).
    print("  Computing Historical CTR...")
    clicks_by_user = offers.groupby('user_id')['is_clicked']
    prior_events = clicks_by_user.cumcount()
    prior_clicks = clicks_by_user.cumsum() - offers['is_clicked']
    # A user's first event has no history: 0/0 -> NaN -> 0.
    offers['user_hist_ctr'] = (prior_clicks / prior_events).fillna(0)

    # 4. Retail Context (Merge Asof)
    print("  Merging Retail Context (asof)...")
    # merge_asof keeps a single copy of the `on` key, so the matched retail
    # event's own time would be lost. Carry it through under an explicit name.
    retail_context = (
        retail.sort_values('timestamp')[['user_id', 'timestamp', 'subdomain']]
        .assign(timestamp_retail=lambda frame: frame['timestamp'])
    )
    offers_sorted = offers.sort_values('timestamp')

    merged = pd.merge_asof(
        offers_sorted,
        retail_context,
        on='timestamp',
        by='user_id',
        direction='backward',  # latest retail event at or before the offer
        suffixes=('', '_retail')
    )

    # Feature: How recently did they interact with retail?
    # Rows without any earlier retail event get the sentinel -1.
    merged['seconds_since_retail'] = (
        merged['timestamp'] - merged['timestamp_retail']
    ).dt.total_seconds().fillna(-1)
    merged['has_retail_activity'] = merged['timestamp_retail'].notna().astype(int)

    # Feature: What category did they see last?
    # Encoded as integer category codes; 'none' marks "no retail history".
    merged['last_retail_subdomain'] = merged['subdomain'].fillna('none')
    merged['last_retail_subdomain'] = merged['last_retail_subdomain'].astype('category').cat.codes

    # 5. Demographics
    print("  Merging Demographics...")
    final_df = merged.merge(users[['user_id', 'socdem_cluster', 'region']], on='user_id', how='left')
    # Users missing from the user table get -1.
    # NOTE(review): assumes both columns are numeric - confirm against the schema.
    final_df['socdem_cluster'] = final_df['socdem_cluster'].fillna(-1)
    final_df['region'] = final_df['region'].fillna(-1)

    return final_df

# Build the modelling table once from the three source frames.
full_dataset = build_features(offers_events, retail_events, users)

# Garbage Collection
# Drop the notebook-level references to the raw event tables and run a
# collection pass so their memory can be reclaimed before splitting/training.
del offers_events, retail_events
gc.collect()


In [None]:

def split_and_downsample(df, neg_frac=0.2, seed=None):
    """Split chronologically into train/val/test and downsample train negatives.

    The first 70% of rows by ``timestamp`` form the training window, the next
    15% validation and the last 15% test. Only the training window is
    resampled; validation and test keep the real-world class mix.

    Args:
        df: Feature table with ``timestamp`` and ``is_clicked`` columns.
        neg_frac: Fraction of training negatives to keep, in ``(0, 1]``.
        seed: Random seed for sampling/shuffling; defaults to the notebook-wide
            ``SEED`` when omitted.

    Returns:
        tuple: ``(train_balanced, val, test)``. ``train_balanced`` carries an
        extra ``sample_weight`` column (1.0 for positives, ``1 / neg_frac`` for
        kept negatives). Passing it as the training weight restores the
        original class prior, so predicted probabilities are not inflated by
        the downsampling.

    Raises:
        ValueError: If ``df`` is empty or ``neg_frac`` is outside ``(0, 1]``.
    """
    print("Splitting and Downsampling...")

    if df.empty:
        raise ValueError("split_and_downsample() received an empty DataFrame; nothing to split.")
    if not 0 < neg_frac <= 1:
        raise ValueError(f"neg_frac must be in (0, 1], got {neg_frac!r}")
    if seed is None:
        seed = SEED

    # Strict Time-Based Split to simulate production reality
    times = df['timestamp'].sort_values()
    n = len(times)
    train_cutoff = times.iloc[int(0.7 * n)]
    val_cutoff = times.iloc[int(0.85 * n)]

    # Half-open windows: rows sharing a cutoff timestamp fall in the later split.
    train = df[df['timestamp'] < train_cutoff]
    val = df[(df['timestamp'] >= train_cutoff) & (df['timestamp'] < val_cutoff)]
    test = df[df['timestamp'] >= val_cutoff]

    # Downsample Negatives in TRAIN only
    # This shrinks/balances the training set without biasing the
    # Validation/Test sets (which must remain real-world).
    print("  Downsampling Train Negatives...")
    train_pos = train[train['is_clicked'] == 1]
    train_neg = train[train['is_clicked'] == 0]

    train_neg_sampled = train_neg.sample(frac=neg_frac, random_state=seed)
    train_balanced = pd.concat([train_pos, train_neg_sampled]).sample(frac=1, random_state=seed)

    # Each kept negative stands in for 1/neg_frac original negatives.
    train_balanced = train_balanced.assign(
        sample_weight=train_balanced['is_clicked'].map({1: 1.0, 0: 1.0 / neg_frac})
    )

    print(f"Train Balanced: {len(train_balanced):,} rows")
    return train_balanced, val, test

# Chronological split; only train_df has its negatives downsampled.
train_df, val_df, test_df = split_and_downsample(full_dataset)


In [None]:

def train_model(train, val):
    """Fit a LightGBM binary classifier with early stopping on validation.

    Args:
        train: Training rows containing the feature columns and ``is_clicked``.
            If a ``sample_weight`` column is present it is used as the per-row
            training weight (e.g. to compensate for negative downsampling so
            predicted probabilities stay calibrated); otherwise rows are
            weighted equally.
        val: Validation rows with the same columns, used only for monitoring
            and early stopping.

    Returns:
        tuple: ``(model, features)`` - the trained ``lgb.Booster`` and the
        exact list of feature column names it was trained on.
    """
    print("Training LightGBM Model...")

    features = [
        'item_id', 'offer_popularity',  # Item features
        'hour', 'day_of_week', 'user_hist_ctr',
        'seconds_since_retail', 'has_retail_activity', 'last_retail_subdomain',
        'socdem_cluster', 'region'
    ]
    target = 'is_clicked'

    # Optional per-row weights; absent column -> unweighted training as before.
    train_weight = train['sample_weight'] if 'sample_weight' in train.columns else None

    dtrain = lgb.Dataset(train[features], label=train[target], weight=train_weight)
    # reference=dtrain makes validation reuse the training set's binning.
    dval = lgb.Dataset(val[features], label=val[target], reference=dtrain)

    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'verbose': -1,
        'seed': SEED
    }

    # Up to 1000 rounds; stop once validation log loss has not improved for 50
    # rounds. The training set is listed too so its loss shows up in the log.
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=1000,
        valid_sets=[dtrain, dval],
        callbacks=[lgb.early_stopping(stopping_rounds=50), lgb.log_evaluation(50)]
    )

    return model, features

# Fit the model; feature_names is the column list reused for scoring, SHAP and the demo below.
model, feature_names = train_model(train_df, val_df)


In [None]:

def plot_lift_curve(y_true, y_pred, step=0.1):
    """Plot cumulative lift against the share of rows targeted.

    Rows are ranked by predicted score, best first. Lift at a given depth is
    the click rate among the rows ranked so far divided by the overall click
    rate, so 1.0 means "no better than random".

    Args:
        y_true: Observed 0/1 labels.
        y_pred: Predicted scores, same length as ``y_true``.
        step: Currently unused; kept for interface compatibility.

    Returns:
        DataFrame sorted by descending ``y_pred`` with columns ``y_true``,
        ``y_pred``, ``cum_users``, ``cum_pos``, ``percentile``, ``cum_ctr``
        and ``lift``.
    """
    # Rank every row by model score, highest first.
    ranked = pd.DataFrame({'y_true': y_true, 'y_pred': y_pred}).sort_values(
        'y_pred', ascending=False
    )
    total_rows = len(ranked)

    # Running totals down the ranking.
    ranked['cum_users'] = np.arange(1, total_rows + 1)
    ranked['cum_pos'] = ranked['y_true'].cumsum()
    ranked['percentile'] = ranked['cum_users'] / total_rows

    # Baseline: click rate when targeting everyone.
    baseline_ctr = ranked['y_true'].mean()

    # Lift = (cumulative CTR at this depth) / baseline CTR
    ranked['cum_ctr'] = ranked['cum_pos'] / ranked['cum_users']
    ranked['lift'] = ranked['cum_ctr'] / baseline_ctr

    # Draw the curve with the random-targeting reference line.
    _, axis = plt.subplots(figsize=(10, 6))
    axis.plot(ranked['percentile'], ranked['lift'], label='Model Lift')
    axis.axhline(1.0, color='r', linestyle='--', label='Random Guessing (Lift=1.0)')
    axis.set_xlabel('Percentile of Users Targeted (Top X%)')
    axis.set_ylabel('Lift (x times better than random)')
    axis.set_title('Lift Curve: Business Impact of Targeting')
    axis.legend()
    axis.grid(True)
    plt.show()

    return ranked

def evaluate_and_visualize(model, test_df, features):
    """Score the test split, report calibration, and draw ROC and lift curves.

    Args:
        model: Trained model exposing ``predict(X)`` that returns click
            probabilities.
        test_df: Held-out rows containing ``features`` and ``is_clicked``.
        features: Feature column names, in the order used for training.

    Returns:
        tuple: ``(y_pred, auc, top_10_lift, top_20_lift)`` - predicted
        probabilities, ROC AUC, and cumulative lift at the first ranked row
        whose depth reaches 10% / 20% of the test set.
    """
    print("Evaluating Model...")

    X_test = test_df[features]
    y_test = test_df['is_clicked']
    y_pred = model.predict(X_test)

    # 1. Calibration (Log Loss) - one of the stated success metrics.
    # labels=[0, 1] keeps the metric defined even if a class is missing from
    # the test slice.
    test_log_loss = log_loss(y_test, y_pred, labels=[0, 1])
    print(f"Test Log Loss: {test_log_loss:.4f}")

    # 2. ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred)

    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr, label=f'AUC = {auc:.3f}')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend()
    plt.show()

    # 3. Lift Curve (Business Metric)
    lift_data = plot_lift_curve(y_test, y_pred)

    # 4. Key Metrics for Conclusion
    # lift_data is ranked best-first, so the first row at or past a depth gives
    # the cumulative lift of targeting exactly that share.
    top_10_lift = lift_data[lift_data['percentile'] >= 0.1].iloc[0]['lift']
    top_20_lift = lift_data[lift_data['percentile'] >= 0.2].iloc[0]['lift']

    return y_pred, auc, top_10_lift, top_20_lift

# Score the held-out test split; the AUC and lift values feed print_conclusion below.
y_pred, auc_score, lift_10, lift_20 = evaluate_and_visualize(model, test_df, feature_names)


In [None]:

def explain_model(model, val_df, features):
    """Draw SHAP summary plots (bar and beeswarm) for the positive class.

    Args:
        model: Trained tree model accepted by ``shap.TreeExplainer``.
        val_df: Validation rows; up to 1000 are sampled for the explanation.
        features: Feature column names used by the model.
    """
    print("Generating SHAP Explanations...")
    # Cap the sample at the available rows; sample(1000) raises on smaller frames.
    sample_size = min(1000, len(val_df))
    X_sample = val_df[features].sample(sample_size, random_state=SEED)
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_sample)

    # The return shape for binary classifiers depends on the SHAP version:
    # a list with one array per class, one (rows, features) array, or a
    # (rows, features, classes) array. Normalise to the positive-class matrix;
    # indexing [1] on a plain 2-D array would select one sample, not a class.
    if isinstance(shap_values, list):
        positive_shap = shap_values[-1]
    else:
        positive_shap = np.asarray(shap_values)
        if positive_shap.ndim == 3:
            positive_shap = positive_shap[:, :, -1]

    # Global importance: mean |SHAP| per feature.
    plt.figure()
    shap.summary_plot(positive_shap, X_sample, plot_type="bar")
    plt.show()

    # Per-row effects: direction and magnitude for each feature.
    plt.figure()
    shap.summary_plot(positive_shap, X_sample)
    plt.show()

# Explain on the validation split (explain_model draws its own random sample of rows).
explain_model(model, val_df, feature_names)


In [None]:

def print_conclusion(auc, lift_10, lift_20):
    """Print a plain-text verdict from the headline test metrics.

    Args:
        auc: ROC AUC; above 0.7 is reported as strong, above 0.6 as moderate,
            anything else as weak.
        lift_10: Lift when targeting the top 10%; above 2.0 counts as success.
        lift_20: Lift when targeting the top 20% (reported only).
    """
    rule = "=" * 60

    # Grade discriminative power from the AUC thresholds.
    if auc > 0.7:
        power_line = "   -> The model has STRONG predictive power."
    elif auc > 0.6:
        power_line = "   -> The model has MODERATE predictive power."
    else:
        power_line = "   -> The model is WEAK (close to random guessing)."

    # The go/no-go decision rests on top-decile lift alone.
    if lift_10 > 2.0:
        verdict_lines = (
            "   SUCCESS. The AI significantly outperforms random guessing.",
            "   Recommendation: Deploy to production for A/B testing on a small traffic slice.",
        )
    else:
        verdict_lines = (
            "   UNCERTAIN. Lift is marginal.",
            "   Recommendation: Investigate more features (e.g., item embeddings, deeper history).",
        )

    print(rule)
    print("FINAL CONCLUSION & BUSINESS IMPACT")
    print(rule)

    print(f"1. Model Performance (AUC: {auc:.3f})")
    print(power_line)

    print("\n2. Business Impact (Lift)")
    print(f"   -> Top 10% Targeting: {lift_10:.2f}x more effective than random.")
    print(f"   -> Top 20% Targeting: {lift_20:.2f}x more effective than random.")

    print("\n3. Verdict")
    for line in verdict_lines:
        print(line)

# Summarise the test-set metrics computed above as a plain-text verdict.
print_conclusion(auc_score, lift_10, lift_20)


In [None]:

import ipywidgets as widgets
from IPython.display import display, clear_output

def interactive_demo(model, test_df, features):
    """Show a dropdown-driven view of per-user predictions on the test split.

    Selecting a user prints a short profile and a table of that user's test
    rows with the model's predicted click probability next to the real outcome.

    Args:
        model: Trained model exposing ``predict(X)``.
        test_df: Test rows with ``user_id``, the feature columns, ``timestamp``,
            ``is_clicked``, ``region`` and ``socdem_cluster``.
        features: Feature column names used by the model.
    """
    print("="*60)
    print("INTERACTIVE PRODUCTION SIMULATION")
    print("="*60)
    print("Select a User ID to see how the model predicts their behavior on 'Live' (Test) data.")

    # First 100 distinct users in the test set, as a plain list so the widget
    # receives ordinary Python values rather than a NumPy array.
    sample_users = test_df['user_id'].unique()[:100].tolist()

    user_dropdown = widgets.Dropdown(
        options=sample_users,
        description='User ID:',
        disabled=False,
    )

    output = widgets.Output()

    def render_user(user_id):
        """Redraw the output area for one user."""
        with output:
            clear_output()

            # Get User Data
            user_data = test_df[test_df['user_id'] == user_id].copy()

            if user_data.empty:
                print("No data found for this user.")
                return

            # User Profile (Static)
            print(f"--- User Profile ({user_id}) ---")
            print(f"Region: {user_data['region'].iloc[0]}")
            print(f"SocDem Cluster: {user_data['socdem_cluster'].iloc[0]}")
            # The last row holds the most recent running CTR for this user.
            print(f"Historical CTR: {user_data['user_hist_ctr'].iloc[-1]:.2%}")
            print("-" * 30)

            # Predict
            X_user = user_data[features]
            user_data['predicted_prob'] = model.predict(X_user)

            # Format for Display
            display_cols = [
                'timestamp', 'item_id', 'offer_popularity',  # Item info
                'hour', 'last_retail_subdomain',
                'seconds_since_retail', 'is_clicked', 'predicted_prob'
            ]

            display_df = user_data[display_cols].copy()
            display_df['is_clicked'] = display_df['is_clicked'].map({1: 'CLICKED', 0: 'Ignored'})
            display_df['predicted_prob'] = display_df['predicted_prob'].map('{:.1%}'.format)
            # -1 is the "no prior retail event" sentinel.
            display_df['seconds_since_retail'] = display_df['seconds_since_retail'].apply(lambda x: f"{x:.0f}s" if x != -1 else "N/A")
            display_df['offer_popularity'] = display_df['offer_popularity'].astype(int)

            print(f"\nRecent Offer Interactions ({len(display_df)}):")
            display(display_df)

    def on_user_change(change):
        if change['type'] == 'change' and change['name'] == 'value':
            render_user(change['new'])

    user_dropdown.observe(on_user_change, names='value')
    display(user_dropdown, output)

    # Trigger first load
    # The dropdown already starts on its first option, so assigning that same
    # value again would fire no change event. Render the initial user directly.
    if sample_users:
        render_user(user_dropdown.value)

# Launch the widget against the held-out test split.
interactive_demo(model, test_df, feature_names)
