In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subdirs, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/adobe-devcraft/dataset/bid.12.txt
/kaggle/input/adobe-devcraft/dataset/conv.10.txt
/kaggle/input/adobe-devcraft/dataset/bid.09.txt
/kaggle/input/adobe-devcraft/dataset/conv.07.txt
/kaggle/input/adobe-devcraft/dataset/imp.08.txt
/kaggle/input/adobe-devcraft/dataset/conv.06.txt
/kaggle/input/adobe-devcraft/dataset/conv.08.txt
/kaggle/input/adobe-devcraft/dataset/imp.10.txt
/kaggle/input/adobe-devcraft/dataset/clk.09.txt
/kaggle/input/adobe-devcraft/dataset/bid.10.txt
/kaggle/input/adobe-devcraft/dataset/imp.09.txt
/kaggle/input/adobe-devcraft/dataset/imp.07.txt
/kaggle/input/adobe-devcraft/dataset/conv.12.txt
/kaggle/input/adobe-devcraft/dataset/bid.08.txt
/kaggle/input/adobe-devcraft/dataset/conv.09.txt
/kaggle/input/adobe-devcraft/dataset/clk.07.txt
/kaggle/input/adobe-devcraft/dataset/imp.12.txt
/kaggle/input/adobe-devcraft/dataset/imp.06.txt
/kaggle/input/adobe-devcraft/dataset/imp.11.txt
/kaggle/input/adobe-devcraft/dataset/bid.06.txt
/kaggle/input/adobe-devcraft/datas

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score, mean_squared_error
import warnings
warnings.filterwarnings('ignore')
import gc
from tqdm import tqdm

class RTBDataProcessor:
    """Load tab-separated bid/impression/click/conversion logs and build model features.

    Attributes:
        label_encoders: dict mapping a categorical column name to its fitted
            ``LabelEncoder``.
        scaler: ``StandardScaler`` applied to the numeric ad-slot columns.
        column_names: column layout used for impression, click and conversion logs.
        column_names_bid: column layout used for bid logs.
    """

    # Categorical columns that are label-encoded when present in the input frame.
    _CATEGORICAL_COLUMNS = (
        'Region', 'City', 'Adexchange', 'Domain', 'URL', 'AdslotID',
        'Adslotvisibility', 'Adslotformat', 'CreativeID', 'AdvertiserID',
    )
    # Numeric columns that are standardised after all chunks are assembled.
    _NUMERIC_COLUMNS = ('Adslotwidth', 'Adslotheight', 'ad_area', 'Adslotfloorprice')
    # Raw columns removed from the feature frame once derived features exist.
    _DROPPED_COLUMNS = (
        'Timestamp', 'User-Agent', 'IP', 'AnonymousURLID', 'KeypageURL', 'BidID', 'VisitorID',
    )

    def __init__(self):
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.column_names = [
            "BidID", "Timestamp", "Logtype", "VisitorID", "User-Agent", "IP", "Region", "City",
            "Adexchange", "Domain", "URL", "AnonymousURLID", "AdslotID", "Adslotwidth", "Adslotheight",
            "Adslotvisibility", "Adslotformat", "Adslotfloorprice", "CreativeID", "Biddingprice",
            "Payingprice", "KeypageURL", "AdvertiserID"
        ]
        self.column_names_bid = [
            "BidID", "Timestamp", "VisitorID", "User-Agent", "IP", "Region", "City",
            "Adexchange", "Domain", "URL", "AnonymousURLID", "AdslotID", "Adslotwidth", "Adslotheight",
            "Adslotvisibility", "Adslotformat", "Adslotfloorprice", "CreativeID", "Biddingprice",
            "AdvertiserID"
        ]

    @staticmethod
    def _read_columns(file_path, names_by_position, chunksize):
        """Read only the requested columns of a header-less TSV log, chunk by chunk.

        Args:
            file_path: path of the tab-separated file.
            names_by_position: dict mapping 0-based column position -> column name.
            chunksize: number of rows parsed per chunk.

        Returns:
            DataFrame with one column per requested position (in file order).
            An empty file yields an empty frame instead of raising.
        """
        positions = sorted(names_by_position)
        names = [names_by_position[pos] for pos in positions]
        try:
            reader = pd.read_csv(file_path, sep='\t', header=None,
                                 usecols=positions, chunksize=chunksize)
            chunks = list(reader)
        except pd.errors.EmptyDataError:
            # A log with no events (e.g. no conversions) is valid input.
            chunks = []
        if not chunks:
            return pd.DataFrame(columns=names)
        frame = pd.concat(chunks, ignore_index=True)
        # read_csv returns the selected columns in file order, matching `names`.
        frame.columns = names
        return frame

    def _read_bid_log(self, bid_path, chunksize):
        """Read the full bid log and keep the first row of every BidID.

        The last column of the file is discarded and the remaining ones are
        named after ``self.column_names_bid``. The result has a fresh
        0..n-1 index.
        """
        chunks = []
        try:
            for chunk in pd.read_csv(bid_path, sep='\t', header=None, chunksize=chunksize):
                chunk = chunk.iloc[:, :-1]
                chunk.columns = self.column_names_bid
                # Per-chunk de-duplication keeps memory low before the global pass.
                chunks.append(chunk.drop_duplicates(subset='BidID', keep='first'))
        except pd.errors.EmptyDataError:
            chunks = []
        if not chunks:
            return pd.DataFrame(columns=self.column_names_bid)
        bid_df = pd.concat(chunks, ignore_index=True)
        # Duplicates that straddle a chunk boundary are only visible here.
        return bid_df.drop_duplicates(subset='BidID', keep='first').reset_index(drop=True)

    def load_data(self, bid_path, imp_path, clk_path, conv_path, chunksize=100000):
        """Load the four logs and derive one label row per unique BidID.

        Args:
            bid_path, imp_path, clk_path, conv_path: paths of the header-less,
                tab-separated bid, impression, click and conversion logs.
            chunksize: number of rows parsed per chunk.

        Returns:
            (bid_df, labels_df):
              * bid_df - bid log rows, one per BidID (first occurrence), with a
                0..n-1 index and the columns of ``self.column_names_bid``.
              * labels_df - columns ``BidID``, ``is_impression``, ``is_click``,
                ``is_conversion`` (int64 0/1) and ``Payingprice`` (float64, 0.0
                for BidIDs without an impression). BidIDs appear in order of
                first occurrence across bid, impression, click and conversion
                logs, so its first ``len(bid_df)`` rows line up with ``bid_df``.
        """
        print("Loading data files in chunks...")
        bid_pos = self.column_names.index("BidID")
        price_pos = self.column_names.index("Payingprice")

        print("Processing impression data...")
        impressions = self._read_columns(
            imp_path, {bid_pos: "BidID", price_pos: "Payingprice"}, chunksize
        )
        # The first impression row of a BidID provides its paying price.
        impressions = impressions.drop_duplicates(subset="BidID", keep="first")

        print("Processing click data...")
        click_ids = self._read_columns(clk_path, {bid_pos: "BidID"}, chunksize)["BidID"]

        print("Processing conversion data...")
        conv_ids = self._read_columns(conv_path, {bid_pos: "BidID"}, chunksize)["BidID"]

        print("Processing bid data...")
        bid_df = self._read_bid_log(bid_path, chunksize)

        # Unique BidIDs over all logs; bid log first so labels follow bid order.
        id_parts = [
            ids for ids in (bid_df["BidID"], impressions["BidID"], click_ids, conv_ids)
            if len(ids)
        ]
        unique_ids = pd.concat(id_parts, ignore_index=True).unique() if id_parts else []
        labels_df = pd.DataFrame({"BidID": unique_ids})

        has_impression = labels_df["BidID"].isin(impressions["BidID"])
        labels_df["is_impression"] = has_impression.astype("int64")
        labels_df["is_click"] = labels_df["BidID"].isin(click_ids).astype("int64")
        labels_df["is_conversion"] = labels_df["BidID"].isin(conv_ids).astype("int64")

        price_by_bid = pd.to_numeric(
            impressions.set_index("BidID")["Payingprice"], errors="coerce"
        )
        labels_df["Payingprice"] = (
            labels_df["BidID"].map(price_by_bid).where(has_impression, 0.0).astype("float64")
        )

        del impressions, click_ids, conv_ids, price_by_bid
        gc.collect()

        print("\nData loading completed:")
        print(f"Total bids: {len(labels_df):,}")
        print(f"Impressions: {labels_df['is_impression'].sum():,} ({labels_df['is_impression'].mean()*100:.2f}%)")
        print(f"Clicks: {labels_df['is_click'].sum():,} ({labels_df['is_click'].mean()*100:.4f}%)")
        print(f"Conversions: {labels_df['is_conversion'].sum():,} ({labels_df['is_conversion'].mean()*100:.4f}%)")

        return bid_df, labels_df

    def extract_features(self, df, chunk_size=50000, fit=True):
        """Turn raw log rows into a numeric feature frame, processing in chunks.

        Derived columns: ``hour``, ``day_of_week``, ``is_weekend`` (from
        ``Timestamp``), ``is_mobile``/``is_chrome``/``is_firefox``/``is_safari``
        (substring checks on ``User-Agent``), ``ad_area`` and ``is_premium_ad``
        (from slot width/height). Categorical columns are label-encoded and the
        numeric ad-slot columns are standardised. Row order and index of ``df``
        are preserved.

        Args:
            df: raw rows (e.g. the ``bid_df`` returned by ``load_data``).
            chunk_size: number of rows transformed per iteration.
            fit: when True (default), fit a label encoder for every categorical
                column that has none yet and (re)fit the scaler on this data.
                When False, reuse the already fitted encoders and scaler, which
                is what inference on new data needs.

        Returns:
            DataFrame of features without the raw identifier/text columns.

        Raises:
            ValueError: if ``fit`` is False and a categorical column has no
                fitted encoder.

        Categories unknown to an existing encoder are encoded as -1.
        """
        print("\nExtracting features in chunks...")

        cat_cols = [col for col in self._CATEGORICAL_COLUMNS if col in df.columns]
        for col in cat_cols:
            if col in self.label_encoders:
                continue
            if not fit:
                raise ValueError(
                    f"No fitted label encoder for column {col!r}; "
                    "run extract_features with fit=True first"
                )
            encoder = LabelEncoder()
            encoder.fit(df[col].fillna('unknown').astype(str))
            self.label_encoders[col] = encoder

        # One lookup Series per column (class label -> integer code), built once
        # so that every chunk is encoded with a plain hash lookup.
        code_lookup = {}
        for col in cat_cols:
            classes = self.label_encoders[col].classes_
            code_lookup[col] = pd.Series(np.arange(len(classes)), index=classes)

        feature_chunks = []
        for start in tqdm(range(0, len(df), chunk_size)):
            chunk = df.iloc[start:start + chunk_size].copy()

            # Time features; unparseable timestamps become -1.
            chunk['Timestamp'] = pd.to_datetime(chunk['Timestamp'], format='%Y%m%d%H%M%S%f', errors='coerce')
            chunk['hour'] = chunk['Timestamp'].dt.hour.fillna(-1).astype(np.int8)
            chunk['day_of_week'] = chunk['Timestamp'].dt.dayofweek.fillna(-1).astype(np.int8)
            chunk['is_weekend'] = chunk['day_of_week'].isin([5, 6]).astype(np.int8)

            # User-agent flags.
            # NOTE(review): these are plain substring checks; Chrome user agents
            # typically also contain "Safari" - confirm the overlap is intended.
            chunk['User-Agent'] = chunk['User-Agent'].fillna('unknown')
            chunk['is_mobile'] = chunk['User-Agent'].str.contains('Mobile|Android|iOS', case=False, na=False).astype(np.int8)
            chunk['is_chrome'] = chunk['User-Agent'].str.contains('Chrome', case=False, na=False).astype(np.int8)
            chunk['is_firefox'] = chunk['User-Agent'].str.contains('Firefox', case=False, na=False).astype(np.int8)
            chunk['is_safari'] = chunk['User-Agent'].str.contains('Safari', case=False, na=False).astype(np.int8)

            # Ad-slot geometry.
            chunk['Adslotwidth'] = pd.to_numeric(chunk['Adslotwidth'], errors='coerce').fillna(0).astype(np.float32)
            chunk['Adslotheight'] = pd.to_numeric(chunk['Adslotheight'], errors='coerce').fillna(0).astype(np.float32)
            chunk['ad_area'] = (chunk['Adslotwidth'] * chunk['Adslotheight']).astype(np.float32)
            chunk['is_premium_ad'] = (chunk['ad_area'] >= 100000).astype(np.int8)

            # Label-encode categoricals; labels unknown to the encoder map to -1.
            for col in cat_cols:
                labels = chunk[col].fillna('unknown').astype(str)
                chunk[col] = labels.map(code_lookup[col]).fillna(-1).astype(np.int32)

            chunk = chunk.drop(columns=[col for col in self._DROPPED_COLUMNS if col in chunk.columns])
            feature_chunks.append(chunk)

        features = pd.concat(feature_chunks, axis=0)
        del feature_chunks, code_lookup
        gc.collect()

        num_cols = list(self._NUMERIC_COLUMNS)
        if fit:
            features[num_cols] = self.scaler.fit_transform(features[num_cols])
        else:
            features[num_cols] = self.scaler.transform(features[num_cols])

        print(f"Final feature count: {len(features.columns)}")
        return features

def optimize_dtypes(df):
    """Downcast int64/float64 columns of ``df`` in place to save memory.

    * int64 columns are converted to the narrowest of uint8/uint16/uint32
      (non-negative data) or int8/int16/int32 (data with negatives) that can
      hold every value. Columns whose values do not fit in 32 bits are left as
      int64 instead of being wrapped around by an unchecked cast.
    * float64 columns are converted to float32.
    * All other dtypes are left untouched.

    Args:
        df: DataFrame to optimise; it is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    unsigned_types = (np.uint8, np.uint16, np.uint32)
    signed_types = (np.int8, np.int16, np.int32)

    for col in df.columns:
        series = df[col]

        # Optimize integers
        if series.dtype == 'int64':
            if series.empty:
                # min()/max() are undefined without data; nothing to gain anyway.
                continue
            lowest, highest = series.min(), series.max()
            candidates = unsigned_types if lowest >= 0 else signed_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                if limits.min <= lowest and highest <= limits.max:
                    df[col] = series.astype(candidate)
                    break
            # No break: the values need 64 bits, so the column stays int64.

        # Optimize floats
        elif series.dtype == 'float64':
            df[col] = series.astype(np.float32)

    return df
def train_and_evaluate(data_dir='/kaggle/input/adobe-devcraft/dataset', day='06'):
    """Train and evaluate the CTR, CVR and market-price models for one day of logs.

    Steps: load the four logs of ``day``, extract features from the bid log,
    attach the labels **by BidID**, split 80/10/10 into train/validation/test,
    fit three LightGBM models with early stopping and print test metrics.

    Args:
        data_dir: directory containing ``bid.<day>.txt``, ``imp.<day>.txt``,
            ``clk.<day>.txt`` and ``conv.<day>.txt``.
        day: two-character day suffix used in the file names.

    Returns:
        Tuple ``(ctr_model, cvr_model, market_price_model, processor)``. The CVR
        model is returned unfitted when the clicked training rows do not
        contain both conversion classes.
    """
    # Initialize processor
    processor = RTBDataProcessor()

    # Load data
    print("\nLoading and processing data...")
    bid_df, labels_df = processor.load_data(
        f'{data_dir}/bid.{day}.txt',
        f'{data_dir}/imp.{day}.txt',
        f'{data_dir}/clk.{day}.txt',
        f'{data_dir}/conv.{day}.txt'
    )

    # Process features
    features_df = processor.extract_features(bid_df, chunk_size=50000)
    features_df = optimize_dtypes(features_df)
    labels_df = optimize_dtypes(labels_df)

    # Attach labels by BidID. features_df keeps the row order of bid_df (but no
    # longer carries BidID), while labels_df is ordered by unique BidID over all
    # logs, so joining the two on their positional index can pair a bid with
    # another bid's labels. labels_df is built from the union of all BidIDs, so
    # every bid row finds its label row.
    aligned_labels = (
        labels_df.drop_duplicates(subset='BidID', keep='first')
        .set_index('BidID')
        .reindex(bid_df['BidID'].to_numpy())
    )
    aligned_labels.index = features_df.index
    full_df = pd.concat([features_df, aligned_labels], axis=1)
    del bid_df, aligned_labels
    gc.collect()

    # Split data
    print("\nSplitting data...")
    train_df, test_df = train_test_split(full_df, test_size=0.2, random_state=42)
    val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

    print(f"Train set: {len(train_df):,} rows")
    print(f"Validation set: {len(val_df):,} rows")
    print(f"Test set: {len(test_df):,} rows")

    # Define feature columns
    feature_cols = features_df.columns.tolist()

    # Hyper-parameters shared by all three models.
    base_params = dict(
        n_estimators=100,
        learning_rate=0.05,
        max_depth=7,
        num_leaves=31,
        min_child_samples=20,
        random_state=42
    )

    def make_callbacks():
        # Fresh callback objects per fit: early stopping keeps internal state.
        return [
            lgb.early_stopping(stopping_rounds=10),
            lgb.log_evaluation(period=100)
        ]

    # Train CTR model
    print("\nTraining CTR model...")
    ctr_model = lgb.LGBMClassifier(class_weight='balanced', **base_params)
    ctr_model.fit(
        train_df[feature_cols],
        train_df['is_click'],
        eval_set=[(val_df[feature_cols], val_df['is_click'])],
        eval_metric='auc',
        callbacks=make_callbacks()
    )

    # Train CVR model
    print("\nTraining CVR model...")
    cvr_model = lgb.LGBMClassifier(class_weight='balanced', **base_params)

    # Train only on clicked impressions
    clicked_train = train_df[train_df['is_click'] == 1]
    clicked_val = val_df[val_df['is_click'] == 1]

    # Conversions are extremely rare, so any slice may miss one of the classes.
    cvr_fitted = False
    if len(clicked_train) > 0 and clicked_train['is_conversion'].nunique() == 2:
        cvr_fit_kwargs = {}
        if len(clicked_val) > 0:
            # Early stopping needs a non-empty validation set.
            cvr_fit_kwargs = dict(
                eval_set=[(clicked_val[feature_cols], clicked_val['is_conversion'])],
                eval_metric='auc',
                callbacks=make_callbacks()
            )
        cvr_model.fit(
            clicked_train[feature_cols],
            clicked_train['is_conversion'],
            **cvr_fit_kwargs
        )
        cvr_fitted = True
    else:
        print("Skipping CVR model: clicked training rows lack both conversion classes.")

    # Train market price model
    print("\nTraining market price model...")
    market_price_model = lgb.LGBMRegressor(**base_params)

    # Train only on won impressions
    won_train = train_df[train_df['is_impression'] == 1]
    won_val = val_df[val_df['is_impression'] == 1]
    market_price_model.fit(
        won_train[feature_cols],
        won_train['Payingprice'],
        eval_set=[(won_val[feature_cols], won_val['Payingprice'])],
        eval_metric='rmse',
        callbacks=make_callbacks()
    )

    # Evaluate models
    print("\nEvaluating models...")

    # CTR predictions
    ctr_pred = ctr_model.predict_proba(test_df[feature_cols])[:, 1]
    ctr_auc = roc_auc_score(test_df['is_click'], ctr_pred)
    ctr_prauc = average_precision_score(test_df['is_click'], ctr_pred)

    # CVR predictions (only for clicked impressions). AUC is undefined unless
    # both classes are present, and an unfitted model cannot predict.
    clicked_test = test_df[test_df['is_click'] == 1]
    cvr_auc = cvr_prauc = None
    if cvr_fitted and len(clicked_test) > 0 and clicked_test['is_conversion'].nunique() == 2:
        cvr_pred = cvr_model.predict_proba(clicked_test[feature_cols])[:, 1]
        cvr_auc = roc_auc_score(clicked_test['is_conversion'], cvr_pred)
        cvr_prauc = average_precision_score(clicked_test['is_conversion'], cvr_pred)

    # Market price predictions (only for won impressions)
    won_test = test_df[test_df['is_impression'] == 1]
    market_pred = market_price_model.predict(won_test[feature_cols])
    market_rmse = np.sqrt(mean_squared_error(won_test['Payingprice'], market_pred))

    # Print results
    print("\nTest Set Results:")
    print(f"CTR Model - AUC: {ctr_auc:.4f}, PR-AUC: {ctr_prauc:.4f}")
    if cvr_auc is not None:
        print(f"CVR Model - AUC: {cvr_auc:.4f}, PR-AUC: {cvr_prauc:.4f}")
    else:
        print("CVR Model - not evaluated (model not fitted or clicked test rows lack both classes)")
    print(f"Market Price Model - RMSE: {market_rmse:.4f}")

    # Feature importance analysis
    print("\nTop 10 Important Features for CTR prediction:")
    feature_imp = pd.DataFrame({
        'feature': feature_cols,
        'importance': ctr_model.feature_importances_
    }).sort_values('importance', ascending=False)
    print(feature_imp.head(10))

    return ctr_model, cvr_model, market_price_model, processor

# Run the full pipeline when this cell/script is executed directly.
# `models` is the tuple (ctr_model, cvr_model, market_price_model, processor)
# returned by train_and_evaluate().
if __name__ == "__main__":
    models = train_and_evaluate()


Loading and processing data...
Loading data files in chunks...
Processing impression data...
Processing click data...
Processing conversion data...
Processing bid data...

Data loading completed:
Total bids: 9,586,949
Impressions: 1,815,075 (18.93%)
Clicks: 1,159 (0.0121%)
Conversions: 38 (0.0004%)

Extracting features in chunks...


100%|██████████| 192/192 [07:10<00:00,  2.24s/it]


Final feature count: 23

Splitting data...
Train set: 7,669,559 rows
Validation set: 958,695 rows
Test set: 958,695 rows

Training CTR model...
[LightGBM] [Info] Number of positive: 912, number of negative: 7668647
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.599197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1229
[LightGBM] [Info] Number of data points in the train set: 7669559, number of used features: 22
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[37]	valid_0's auc: 0.88555	valid_0's binary_logloss: 0.419999

Training CVR model...
[LightGBM] [Info] Number of positive: 28, number of negative: 884
[LightGBM] [Info] Auto-choosing col-wise multi-t

In [5]:
# Unpack the tuple returned by train_and_evaluate(); requires the training cell
# above to have been executed so that `models` exists.
ctr_model, cvr_model, market_price_model, processor = models

In [None]:
import joblib

# Persist the trained models together with the preprocessing objects needed to
# rebuild features later; each entry maps the output file name to the object.
artifacts = {
    'ctr_model.pkl': ctr_model,
    'cvr_model.pkl': cvr_model,
    'market_price_model.pkl': market_price_model,
    'label_encoders.pkl': processor.label_encoders,
    'scaler.pkl': processor.scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

print("Models and preprocessing objects saved.")


Models and preprocessing objects saved.
