In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score, log_loss, average_precision_score,
    accuracy_score, f1_score
)
import warnings
import math
warnings.filterwarnings('ignore')


In [2]:
# =========================
# Data Loading and Feature Engineering (No Leakage)
# =========================
def load_data(raw_path, user_path, ad_path):
    """Read the three CSV inputs and left-join them into a single frame.

    Args:
        raw_path: CSV joined on its 'user' and 'adgroup_id' columns.
        user_path: CSV of user profiles keyed by 'userid'; header names are
            stripped of surrounding whitespace before the join.
        ad_path: CSV of ad attributes keyed by 'adgroup_id'.

    Returns:
        DataFrame with every row of the raw file, extended with the matching
        user-profile and ad columns (NaN where no match exists).
    """
    impressions = pd.read_csv(raw_path)
    users = pd.read_csv(user_path)
    ads = pd.read_csv(ad_path)

    # Header cleanup so the 'userid' join key is found even if padded.
    users.columns = users.columns.str.strip()

    with_users = impressions.merge(users, how='left', left_on='user', right_on='userid')
    return with_users.merge(ads, how='left', on='adgroup_id')


In [3]:
def add_ctr_stats_no_leakage(df):
    """Add running per-user and per-ad click statistics that exclude the current row.

    Rows are ordered by 'time_stamp'; for every row the statistics count only
    rows of the same user / ad that come earlier in that order.

    Args:
        df: DataFrame with 'time_stamp', 'user', 'adgroup_id' and 'clk' columns.

    Returns:
        A new DataFrame sorted by 'time_stamp' (the input is not modified) with
        the added columns 'user_total_clicks', 'user_total_interactions',
        'user_ctr', 'ad_total_clicks', 'ad_total_impressions' and 'ad_ctr'.
    """
    # mergesort is stable: rows sharing a time_stamp keep their input order, so
    # the "earlier rows" seen by each row do not depend on the sort algorithm.
    df = df.sort_values('time_stamp', kind='mergesort')

    def _clicks_before(key):
        # Shift within the group so each row sees only previous clicks, then
        # accumulate; the first row of every group gets 0.
        previous = df.groupby(key)['clk'].shift().fillna(0)
        return previous.groupby(df[key]).cumsum()

    df['user_total_clicks'] = _clicks_before('user')
    df['user_total_interactions'] = df.groupby('user').cumcount()
    # The epsilon keeps the first interaction (0 / 0) at a CTR of 0.
    df['user_ctr'] = df['user_total_clicks'] / (df['user_total_interactions'] + 1e-6)

    df['ad_total_clicks'] = _clicks_before('adgroup_id')
    df['ad_total_impressions'] = df.groupby('adgroup_id').cumcount()
    df['ad_ctr'] = df['ad_total_clicks'] / (df['ad_total_impressions'] + 1e-6)
    return df

In [4]:

# =========================
# Behavior-log loading: builds a fixed-length (cate, brand) history sequence per user
# =========================
def load_behavior_data(behavior_path, user_list, seq_len=50):  # Increased sequence length
    behavior_df = pd.read_csv(behavior_path)
    behavior_df['time_stamp'] = pd.to_datetime(behavior_df['time_stamp'], unit='s')
    behavior_df = behavior_df[behavior_df['user'].isin(user_list)]
    
    # Enhanced categorical encoding
    cate2idx = {c: i+1 for i, c in enumerate(behavior_df['cate'].dropna().unique())}
    brand2idx = {b: i+1 for i, b in enumerate(behavior_df['brand'].dropna().unique())}
    cate2idx['<PAD>'] = 0
    brand2idx['<PAD>'] = 0
    
    behavior_df = behavior_df.sort_values(['user', 'time_stamp'])
    user2seq = {}
    
    for uid, group in behavior_df.groupby('user'):
        seq = [(cate2idx.get(row['cate'], 0), brand2idx.get(row['brand'], 0)) for _, row in group.iterrows()]
        seq = seq[-seq_len:]  # Take last seq_len items
        while len(seq) < seq_len:
            seq.insert(0, (0, 0))  # Pad at beginning
        user2seq[uid] = seq
    
    behavior_sequences = [user2seq.get(uid, [(0, 0)] * seq_len) for uid in user_list]
    return torch.tensor(behavior_sequences, dtype=torch.long), len(cate2idx), len(brand2idx)


In [5]:
def feature_engineering(df):
    """Derive time features and running CTR statistics, then clip and impute.

    Steps, in order:
      1. Convert 'time_stamp' from Unix seconds to datetime and derive
         hour / day-of-week / weekend / part-of-day indicator columns.
      2. Add the running click statistics from add_ctr_stats_no_leakage
         (which also sorts the rows by 'time_stamp').
      3. Clip 'price', 'user_total_interactions' and 'ad_total_impressions'
         at their 0.99 quantile.
      4. Fill missing numeric values with the column median and missing object
         values with the column mode ('unknown' if the column has no mode).

    Args:
        df: merged impression frame; it is left unmodified.

    Returns:
        A new, time-sorted DataFrame with the added and cleaned columns.

    NOTE(review): the quantile, median and mode are computed over every row
    passed in, so a train/test split made afterwards shares these statistics
    across both splits.
    """
    # Shallow copy: the columns added or replaced below live on this frame only,
    # so the caller's frame is not left half-processed, and no data is duplicated.
    df = df.copy(deep=False)

    df['time_stamp'] = pd.to_datetime(df['time_stamp'], unit='s')
    hour = df['time_stamp'].dt.hour
    df['hour'] = hour
    df['day_of_week'] = df['time_stamp'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

    # Part-of-day indicators: four 6-hour buckets covering the whole day.
    df['is_morning'] = ((hour >= 6) & (hour < 12)).astype(int)
    df['is_afternoon'] = ((hour >= 12) & (hour < 18)).astype(int)
    df['is_evening'] = ((hour >= 18) & (hour < 24)).astype(int)
    df['is_night'] = ((hour >= 0) & (hour < 6)).astype(int)

    df = add_ctr_stats_no_leakage(df)

    # Cap heavy right tails at the 99th percentile.
    for col in ['price', 'user_total_interactions', 'ad_total_impressions']:
        if col in df.columns:
            df[col] = df[col].clip(upper=df[col].quantile(0.99))

    # Numeric gaps -> column median.
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isna().any():
            df[col] = df[col].fillna(df[col].median())

    # Object gaps -> most frequent value, or 'unknown' when the column is all-NaN.
    for col in df.select_dtypes(include=['object']).columns:
        if df[col].isna().any():
            modes = df[col].mode()
            df[col] = df[col].fillna(modes[0] if len(modes) > 0 else 'unknown')

    return df

In [6]:
def prepare_features(df):
    """Select model inputs, label-encode categoricals, split 80/20 and scale numerics.

    Args:
        df: engineered frame containing 'clk' plus every feature column listed below.

    Returns:
        (X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test,
         cat_dims, num_feature_count) where the X_* values are DataFrames that keep
        df's index labels, cat_dims holds the number of distinct encoded values per
        categorical column (same order as the X_cat columns), and
        num_feature_count is the number of numeric columns.

    NOTE(review): the split is a random stratified split, while several
    features are running statistics over time-ordered rows; confirm a
    time-based split is not required for the intended evaluation.
    """
    user_profile_features = [
        'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
        'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level']
    time_features = ['hour', 'day_of_week', 'is_weekend', 'is_morning', 'is_afternoon', 'is_evening', 'is_night']
    user_behavior_features = ['user_total_interactions', 'user_ctr']
    ad_features = ['adgroup_id', 'cate_id', 'campaign_id', 'customer', 'brand']
    ad_performance_features = ['ad_total_impressions', 'ad_ctr']

    cat_features = ad_features + user_profile_features
    num_features = time_features + user_behavior_features + ad_performance_features + ['price']

    y = df['clk']
    X_cat = df[cat_features].copy()
    X_num = df[num_features]

    # Integer-encode every categorical column on its string form. The encoder is
    # fit on all rows, so every value present in df has a code in both splits.
    cat_dims = []
    for col in cat_features:
        le = LabelEncoder()
        X_cat[col] = le.fit_transform(X_cat[col].astype(str))
        cat_dims.append(len(le.classes_))

    X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
        X_cat, X_num, y, test_size=0.2, random_state=42, stratify=y)

    # Fit the scaler on the training rows only, then apply it to the held-out
    # rows, so test-set means/variances never influence the training features.
    scaler = StandardScaler()
    X_num_train = pd.DataFrame(
        scaler.fit_transform(X_num_train), columns=num_features, index=X_num_train.index)
    X_num_test = pd.DataFrame(
        scaler.transform(X_num_test), columns=num_features, index=X_num_test.index)

    return X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test, cat_dims, len(num_features)


In [7]:
# =========================
# Enhanced Transformer-based CTR Model
# =========================
class MultiHeadAttention(nn.Module):
    """Scaled dot-product attention split across n_heads heads.

    Args:
        d_model: width of the query/key/value inputs and of the output.
        n_heads: number of heads; must divide d_model.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if d_model is not divisible by n_heads.
    """

    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        # Fail here with a clear message instead of later inside forward's view().
        if d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by n_heads ({n_heads})")
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, query, key, value, mask=None):
        """Attend from query to key/value.

        Args:
            query, key, value: tensors whose first dimension is the batch and
                whose last dimension is d_model.
            mask: optional tensor broadcastable to the
                (batch, n_heads, query_len, key_len) score tensor; positions
                where mask == 0 are excluded from attention.

        Returns:
            (output, attention_weights): output has shape
            (batch, query_len, d_model); attention_weights has shape
            (batch, n_heads, query_len, key_len) and is returned after dropout.
        """
        batch_size = query.size(0)
        
        # Project, then reshape to (batch, n_heads, seq_len, d_k).
        Q = self.w_q(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.w_k(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = self.w_v(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        
        if mask is not None:
            # Use the dtype's own minimum rather than a fixed -1e9 so the fill
            # value is representable whatever floating dtype the scores have.
            # A fully masked row still softmaxes to uniform weights.
            fill_value = torch.finfo(attention_scores.dtype).min
            attention_scores = attention_scores.masked_fill(mask == 0, fill_value)
        
        attention_weights = F.softmax(attention_scores, dim=-1)
        attention_weights = self.dropout(attention_weights)
        
        # Merge the heads back into a single d_model-wide vector per position.
        context = torch.matmul(attention_weights, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        output = self.w_o(context)
        return output, attention_weights


In [8]:
class TransformerBlock(nn.Module):
    """Post-norm encoder block: self-attention, then a GELU feed-forward net.

    Each sub-layer is wrapped as LayerNorm(input + sublayer(input)).

    Args:
        d_model: feature width of the block's input and output.
        n_heads: number of attention heads.
        d_ff: hidden width of the feed-forward network.
        dropout: dropout probability used in attention, after attention and
            inside the feed-forward network.
    """

    def __init__(self, d_model, n_heads, d_ff, dropout=0.1):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)

        # Expand to d_ff, apply the non-linearity, project back to d_model.
        ff_stack = [
            nn.Linear(d_model, d_ff),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, d_model),
            nn.Dropout(dropout),
        ]
        self.feed_forward = nn.Sequential(*ff_stack)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to x; mask is passed through to the attention layer."""
        # Residual connection around self-attention (attention weights discarded).
        attended, _ = self.attention(x, x, x, mask)
        hidden = self.norm1(x + self.dropout(attended))

        # Residual connection around the position-wise feed-forward network.
        return self.norm2(hidden + self.feed_forward(hidden))


In [9]:
class CTRModelWithTransformer(nn.Module):
    """CTR model combining feature embeddings, a behavior-sequence encoder,
    a cross network and a deep MLP head.

    Args:
        cat_dims: number of distinct values of each categorical feature.
        emb_dim: embedding width of each categorical feature; also the width of
            the numeric projection and of the behavior summary.
        num_features: number of numeric input features.
        hidden_dims: widths of the MLP hidden layers.
        cate_vocab_size: size of the behavior category vocabulary.
        brand_vocab_size: size of the behavior brand vocabulary.
        behavior_emb_dim: embedding width of one behavior category / brand; the
            sequence encoder works at twice this width.
        seq_len: length of the behavior sequences passed to forward().
        n_heads: attention heads per transformer block.
        n_layers: number of transformer blocks.
        dropout: dropout probability.
    """

    def __init__(self, cat_dims, emb_dim, num_features, hidden_dims, 
                 cate_vocab_size, brand_vocab_size, behavior_emb_dim=32, 
                 seq_len=50, n_heads=8, n_layers=3, dropout=0.2):
        super().__init__()

        # Embedding for categorical features (two spare rows per table).
        self.embeddings = nn.ModuleList([nn.Embedding(dim + 2, emb_dim) for dim in cat_dims])
        self.num_linear = nn.Linear(num_features, emb_dim)

        # Embedding for behavior sequence (cate, brand)
        self.cate_emb = nn.Embedding(cate_vocab_size, behavior_emb_dim)
        self.brand_emb = nn.Embedding(brand_vocab_size, behavior_emb_dim)
        self.seq_len = seq_len
        # Learned positional term, one vector per sequence position.
        self.pos_encoding = nn.Parameter(torch.randn(seq_len, behavior_emb_dim * 2))

        # Transformer layers
        self.transformer_layers = nn.ModuleList([
            TransformerBlock(behavior_emb_dim * 2, n_heads, behavior_emb_dim * 4, dropout)
            for _ in range(n_layers)
        ])
        self.behavior_proj = nn.Linear(behavior_emb_dim * 2, emb_dim)

        # Feature dimensions (categorical + numerical + behavior)
        total_input_dim = emb_dim * (len(cat_dims) + 2)  # +2 for num and behavior
        mlp_input_dim = total_input_dim * 2  # concat(x0, xl)

        # Cross Network
        self.cross_net = nn.ModuleList([
            nn.Linear(total_input_dim, total_input_dim) for _ in range(2)
        ])

        # Deep MLP
        layers = []
        input_dim = mlp_input_dim
        for h in hidden_dims:
            layers.extend([
                nn.Linear(input_dim, h),
                nn.BatchNorm1d(h),
                nn.GELU(),
                nn.Dropout(dropout)
            ])
            input_dim = h

        # Final prediction layer (produces one logit per sample)
        layers.extend([
            nn.Linear(input_dim, input_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout / 2),
            nn.Linear(input_dim // 2, 1)
        ])
        self.mlp = nn.Sequential(*layers)

        self._init_weights()

    def _init_weights(self):
        """Xavier-uniform weights / zero biases for Linear layers, N(0, 0.1) for
        Embedding tables. pos_encoding is a bare Parameter and keeps its
        torch.randn initialisation."""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, 0, 0.1)

    def forward(self, x_cat, x_num, behavior_seq):
        """Compute one logit per sample.

        Args:
            x_cat: integer tensor (batch, len(cat_dims)) of encoded categoricals.
            x_num: float tensor (batch, num_features).
            behavior_seq: integer tensor (batch, seq_len, 2) holding
                (category index, brand index) pairs; category index 0 marks padding.

        Returns:
            Tensor of shape (batch,) with raw logits (no sigmoid applied).
        """
        # Categorical feature embedding
        cat_embeds = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        cat_emb = torch.cat(cat_embeds, dim=1)

        # Numeric feature projection
        num_emb = self.num_linear(x_num)

        # Behavior sequence embedding + position
        cate_seq = self.cate_emb(behavior_seq[:, :, 0])
        brand_seq = self.brand_emb(behavior_seq[:, :, 1])
        behavior_emb = torch.cat([cate_seq, brand_seq], dim=-1)
        behavior_emb = behavior_emb + self.pos_encoding.unsqueeze(0)

        # Attention mask: (batch, 1, 1, seq_len), True on non-padding positions.
        mask = (behavior_seq[:, :, 0] != 0).unsqueeze(1).unsqueeze(1)

        for transformer in self.transformer_layers:
            behavior_emb = transformer(behavior_emb, mask)

        # Mean over non-padding positions; the epsilon keeps an all-padding
        # sequence at a zero vector instead of dividing by zero.
        seq_mask = (behavior_seq[:, :, 0] != 0).float().unsqueeze(-1)
        behavior_repr = (behavior_emb * seq_mask).sum(dim=1) / (seq_mask.sum(dim=1) + 1e-8)
        behavior_repr = self.behavior_proj(behavior_repr)

        # Combine all features
        all_features = torch.cat([cat_emb, num_emb, behavior_repr], dim=1)

        # Cross Network: each layer multiplies the input x0 element-wise with a
        # linear map of the running state and adds the state back (residual).
        x0 = all_features
        xl = all_features
        for cross_layer in self.cross_net:
            xl = x0 * cross_layer(xl) + xl

        # Combine cross and raw
        final_features = torch.cat([x0, xl], dim=1)

        # Prediction. Squeeze only the trailing size-1 dim: a bare squeeze()
        # would also drop the batch dim for a single-sample batch and return a
        # 0-d tensor that no longer matches the (batch,) target shape.
        output = self.mlp(final_features)
        return output.squeeze(-1)

In [10]:
# =========================
# Enhanced Training with Advanced Techniques
# =========================
class CTRDataset(Dataset):
    """Map-style dataset over categorical, numeric, behavior and label data.

    Args:
        X_cat: object exposing `.values` with the encoded categoricals
            (stored as torch.long).
        X_num: object exposing `.values` with the numeric features
            (stored as torch.float32).
        behavior_seq: indexable container of behavior sequences, stored as-is.
        y: object exposing `.values` with the labels (stored as torch.float32).
    """

    def __init__(self, X_cat, X_num, behavior_seq, y):
        conversions = (
            (X_cat, torch.long),
            (X_num, torch.float32),
            (y, torch.float32),
        )
        self.X_cat, self.X_num, self.y = (
            torch.tensor(source.values, dtype=kind) for source, kind in conversions
        )
        self.behavior_seq = behavior_seq

    def __len__(self):
        # The label tensor defines the number of samples.
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (x_cat, x_num, behavior_seq, y) tuple stored at idx."""
        parts = (self.X_cat, self.X_num, self.behavior_seq, self.y)
        return tuple(part[idx] for part in parts)


In [11]:
def train_with_behavior_advanced(X_cat_train, X_cat_test, X_num_train, X_num_test, 
                               behavior_train, behavior_test, y_train, y_test, 
                               cat_dims, num_features, cate_vocab_size, brand_vocab_size,
                               emb_dim=64, hidden_dims=[512, 256, 128], batch_size=1024, 
                               epochs=50, lr=0.001, weight_decay=1e-5):
    """Train CTRModelWithTransformer with early stopping on validation AUC.

    Args:
        X_cat_train, X_cat_test: encoded categorical features (expose `.values`).
        X_num_train, X_num_test: numeric features (expose `.values`).
        behavior_train, behavior_test: tensors of behavior sequences, row-aligned
            with the feature frames; dimension 1 is the sequence length.
        y_train, y_test: binary click labels (pandas Series).
        cat_dims, num_features, cate_vocab_size, brand_vocab_size, emb_dim,
            hidden_dims: forwarded to CTRModelWithTransformer. hidden_dims is only
            read, never mutated, so the shared list default is safe.
        batch_size, epochs, lr, weight_decay: optimisation settings.

    Returns:
        (model, best_auc, logloss): the model with the weights of the epoch that
        reached the best validation AUC, that AUC, and the log-loss of that same
        epoch (NaN if no epoch improved on the initial state).

    Raises:
        ValueError: if y_train contains no positive sample.

    NOTE(review): the *_test split is used both for early stopping / model
    selection and for the reported metrics, so those metrics are presumably
    optimistic. The loss is pos_weight-ed, so the sigmoid outputs are
    presumably not calibrated to the raw click rate; read LogLoss accordingly.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")
    
    # Take the sequence length from the data instead of assuming 50, so the
    # positional encoding always matches the sequences actually passed in.
    model = CTRModelWithTransformer(
        cat_dims, emb_dim, num_features, hidden_dims, 
        cate_vocab_size, brand_vocab_size, 
        behavior_emb_dim=48, seq_len=behavior_train.shape[1],
        n_heads=8, n_layers=4, dropout=0.15
    ).to(device)
    
    # Datasets and loaders
    train_dataset = CTRDataset(X_cat_train, X_num_train, behavior_train, y_train)
    test_dataset = CTRDataset(X_cat_test, X_num_test, behavior_test, y_test)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    
    # Re-weight positives by the negative/positive ratio of the training labels.
    n_pos = int((y_train == 1).sum())
    n_neg = int((y_train == 0).sum())
    if n_pos == 0:
        raise ValueError("y_train contains no positive samples; cannot compute pos_weight")
    pos_weight = torch.tensor([n_neg / n_pos], dtype=torch.float32).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
    
    # AdamW optimizer with weight decay
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    
    # Learning rate scheduler (stepped once per epoch below)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=10, T_mult=2, eta_min=lr/10
    )

    def _snapshot():
        # state_dict() hands out references to the live tensors, and a dict
        # .copy() keeps them; cloning freezes the weights as they are now.
        return {name: value.detach().cpu().clone()
                for name, value in model.state_dict().items()}
    
    # Early stopping
    best_auc = 0
    best_logloss = float('nan')
    best_model_state = _snapshot()  # defined even if no epoch ever improves
    patience = 10
    patience_counter = 0
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        total_loss = 0
        train_pred_chunks = []
        train_target_chunks = []
        
        for x_cat, x_num, beh, y in train_loader:
            x_cat, x_num, beh, y = x_cat.to(device), x_num.to(device), beh.to(device), y.to(device)
            
            optimizer.zero_grad()
            # reshape keeps logits and targets the same shape even if the model
            # returns a 0-d tensor for a single-sample batch.
            logits = model(x_cat, x_num, beh).reshape(y.shape)
            loss = criterion(logits, y)
            loss.backward()
            
            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            
            optimizer.step()
            total_loss += loss.item()
            
            # Collect whole arrays; concatenated once per epoch below.
            train_pred_chunks.append(torch.sigmoid(logits).detach().cpu().numpy())
            train_target_chunks.append(y.cpu().numpy())
        
        # Validation phase
        model.eval()
        val_pred_chunks = []
        val_loss = 0
        
        with torch.no_grad():
            for x_cat, x_num, beh, y in test_loader:
                x_cat, x_num, beh = x_cat.to(device), x_num.to(device), beh.to(device)
                y = y.to(device)
                logits = model(x_cat, x_num, beh).reshape(y.shape)
                val_loss += criterion(logits, y).item()
                val_pred_chunks.append(torch.sigmoid(logits).cpu().numpy())
        
        # Calculate metrics
        train_preds = np.concatenate(train_pred_chunks)
        train_targets = np.concatenate(train_target_chunks)
        y_pred = np.concatenate(val_pred_chunks)
        train_auc = roc_auc_score(train_targets, train_preds)
        auc = roc_auc_score(y_test, y_pred)
        logloss = log_loss(y_test, y_pred)
        avg_precision = average_precision_score(y_test, y_pred)
        
        # Learning rate scheduling
        scheduler.step()
        current_lr = optimizer.param_groups[0]['lr']
        
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {total_loss/len(train_loader):.4f} - "
              f"Val Loss: {val_loss/len(test_loader):.4f} - "
              f"Train AUC: {train_auc:.4f} - "
              f"Val AUC: {auc:.4f} - "
              f"AP: {avg_precision:.4f} - "
              f"LogLoss: {logloss:.4f} - "
              f"LR: {current_lr:.6f}")
        
        # Early stopping and best model saving
        if auc > best_auc:
            best_auc = auc
            best_logloss = logloss
            best_model_state = _snapshot()
            patience_counter = 0
            print(f"★ New best AUC: {best_auc:.4f}")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"Early stopping triggered after {epoch+1} epochs")
                break
    
    # Restore the weights of the best epoch; the returned metrics describe it.
    model.load_state_dict(best_model_state)
    return model, best_auc, best_logloss


In [None]:
# =========================
# Main Function with Enhanced Configuration
# =========================
def main():
    """Run the full pipeline: load, engineer features, build sequences, train.

    Reads 'raw_sample.csv', 'user_profile.csv', 'ad_feature.csv' and
    'behavior_log.csv' from the working directory.

    Returns:
        (model, auc, logloss) as returned by train_with_behavior_advanced.

    Raises:
        ValueError: if the engineered frame's index is not unique, because the
            behavior sequences could then not be aligned to the split rows.
    """
    seq_len = 50

    # Seed the generators used for weight initialisation and shuffling so
    # repeated runs start from the same state.
    torch.manual_seed(42)
    np.random.seed(42)

    print(" Loading data...")
    df = load_data('raw_sample.csv', 'user_profile.csv', 'ad_feature.csv')
    df = feature_engineering(df)
    
    print(" Preparing features...")
    X_cat_train, X_cat_test, X_num_train, X_num_test, y_train, y_test, cat_dims, num_features = prepare_features(df)
    
    print(" Dataset statistics:")
    print(f"  Training samples: {len(X_cat_train):,}")
    print(f"  Test samples: {len(X_cat_test):,}")
    print(f"  Positive rate: {y_train.mean():.4f}")
    print(f"  Categorical features: {len(cat_dims)}")
    print(f"  Numerical features: {num_features}")
    
    # Build each user's sequence once (not once per impression row), then
    # gather the per-row sequences by user code.
    print(" Loading behavior data...")
    user_codes, unique_users = pd.factorize(df['user'])
    behavior_by_user, cate_vocab_size, brand_vocab_size = load_behavior_data(
        'behavior_log.csv', unique_users.tolist(), seq_len=seq_len)
    
    print(f"  Behavior sequence length: {seq_len}")
    print(f"  Category vocabulary: {cate_vocab_size}")
    print(f"  Brand vocabulary: {brand_vocab_size}")
    
    # Align sequences to the split rows: index label -> row position in df ->
    # user code -> row of behavior_by_user.
    if not df.index.is_unique:
        raise ValueError("df index must be unique to align behavior sequences with the split")

    def _sequences_for(split_index):
        positions = df.index.get_indexer(split_index)
        return behavior_by_user[torch.as_tensor(user_codes[positions], dtype=torch.long)]

    behavior_train = _sequences_for(X_cat_train.index)
    behavior_test = _sequences_for(X_cat_test.index)
    
    print(" Training Enhanced Transformer CTR model...")
    model, auc, logloss = train_with_behavior_advanced(
        X_cat_train, X_cat_test, X_num_train, X_num_test,
        behavior_train, behavior_test,
        y_train, y_test,
        cat_dims, num_features,
        cate_vocab_size, brand_vocab_size,
        emb_dim=80,  # wider than the function default (64)
        hidden_dims=[1024, 512, 256, 128],  # one more hidden layer than the default
        batch_size=512,  # half the default batch size
        epochs=20,  # upper bound; early stopping may end training sooner
        lr=0.002,  # twice the default learning rate
        weight_decay=1e-4  # ten times the default weight decay
    )
    
    print("\n Training completed!")
    print(f" Final AUC: {auc:.6f}")
    print(f" Final LogLoss: {logloss:.6f}")
    
    return model, auc, logloss

# Run the pipeline when this cell / script is executed directly.
if __name__ == "__main__":
    main()

 Loading data...
 Preparing features...
 Dataset statistics:
  Training samples: 21,246,368
  Test samples: 5,311,593
  Positive rate: 0.0514
  Categorical features: 13
  Numerical features: 12
 Loading behavior data...
  Behavior sequence length: 50
  Category vocabulary: 9874
  Brand vocabulary: 203528
 Training Enhanced Transformer CTR model...
Using device: cuda
Model parameters: 147,227,201
Epoch 1/20 - Train Loss: 1.2445 - Val Loss: 1.2367 - Train AUC: 0.6530 - Val AUC: 0.6632 - AP: 0.0959 - LogLoss: 0.6061 - LR: 0.001956
★ New best AUC: 0.6632
Epoch 2/20 - Train Loss: 1.1981 - Val Loss: 1.2118 - Train AUC: 0.6935 - Val AUC: 0.6820 - AP: 0.1022 - LogLoss: 0.6129 - LR: 0.001828
★ New best AUC: 0.6820
Epoch 3/20 - Train Loss: 1.1331 - Val Loss: 1.2145 - Train AUC: 0.7354 - Val AUC: 0.6863 - AP: 0.1012 - LogLoss: 0.5981 - LR: 0.001629
★ New best AUC: 0.6863
Epoch 4/20 - Train Loss: 1.0839 - Val Loss: 1.2286 - Train AUC: 0.7637 - Val AUC: 0.6828 - AP: 0.0973 - LogLoss: 0.6041 - LR: 0

In [13]:
%whos

Variable                       Type        Data/Info
----------------------------------------------------
CTRDataset                     type        <class '__main__.CTRDataset'>
CTRModelWithTransformer        type        <class '__main__.CTRModelWithTransformer'>
DataLoader                     type        <class 'torch.utils.data.dataloader.DataLoader'>
Dataset                        type        <class 'torch.utils.data.dataset.Dataset'>
F                              module      <module 'torch.nn.functio<...>/torch/nn/functional.py'>
LabelEncoder                   type        <class 'sklearn.preproces<...>ing._label.LabelEncoder'>
MultiHeadAttention             type        <class '__main__.MultiHeadAttention'>
StandardScaler                 type        <class 'sklearn.preproces<...>ng._data.StandardScaler'>
TransformerBlock               type        <class '__main__.TransformerBlock'>
accuracy_score                 function    <function accuracy_score at 0x7f01cf5bdbd0>
add_ctr_stats