In [1]:
# Install lightgbm if not already installed
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Import the libraries

In [2]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
from datetime import datetime, timedelta
import warnings
import optuna

warnings.filterwarnings('ignore')
print("All libraries imported successfully!")

All libraries imported successfully!


  from .autonotebook import tqdm as notebook_tqdm


### Loading of the train data

In [3]:
# Load the dataset
print("Loading data...")
try:
    train_orders = pd.read_csv('Train/orders.csv', low_memory=False)
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure all CSV files are in the same directory as the script.")
    exit()

print("Data loaded successfully.")

Loading data...
Data loaded successfully.
Data loaded successfully.


In [4]:
print("Loading data...")

try:
    # --- Load all source files ---
    train_orders = pd.read_csv('Train/orders.csv')
    train_customers = pd.read_csv('Train/train_customers.csv')
    train_locations = pd.read_csv('Train/train_locations.csv')
    vendors = pd.read_csv('Train/vendors.csv')

except FileNotFoundError as e:
    print(f"Error loading data: {e}")
    print("Please ensure all CSV files are in the correct 'Train/' subdirectory.")
    exit()

print("Preparing and merging data...")

# --- Rename columns BEFORE merging to avoid confusion ('_x', '_y') ---
vendors.rename(columns={
    'latitude': 'vendor_lat',
    'longitude': 'vendor_lon',
    'status': 'vendor_status',
    'rating': 'vendor_rating'
}, inplace=True)

train_locations.rename(columns={
    'latitude': 'customer_lat',
    'longitude': 'customer_lon'
}, inplace=True)

# --- Merge all training data sources ---
# Start with orders and add details about the customer, vendor, and location
train_merged = train_orders.merge(train_customers, on='customer_id', how='left')
train_merged = train_merged.merge(vendors, left_on='vendor_id', right_on='id', how='left')
train_merged = train_merged.merge(
    train_locations,
    on=['customer_id'],  # Only merge on customer_id
    how='left'
)

# Debug: print columns to check for missing/misnamed columns
print("\nColumns in train_merged:")
print(train_merged.columns.tolist())

# --- Define the specific columns required for training a model ---
# These features are known at the time of prediction and avoid data leakage
required_columns = [
    # --- IDs (for context, not as model features) ---
    'customer_id',
    'vendor_id',
    # 'LOCATION_NUMBER',  # Remove if not present

    # --- Customer Features ---
    'gender',
    'dob',                         # To calculate customer age
    'status',                      # Customer account status
    'created_at_x',                # To calculate customer tenure (from customers table)

    # --- Vendor Features ---
    'vendor_category_en',
    'delivery_charge',
    'serving_distance',
    'is_open',
    'prepration_time',             # Vendor's average preparation time
    'commission',
    'discount_percentage',
    'vendor_status',               # Vendor's account status
    'rank',
    # 'vendor_rating',               # Vendor's overall historical rating (removed)
    'vendor_tag_name',             # Descriptive tags like 'Healthy', 'Pizza'

    # --- Location & Interaction Features ---
    'is_favorite',                 # If the customer has favorited this vendor
    'LOCATION_TYPE',               # e.g., 'Home', 'Work'
    'customer_lat',
    'customer_lon',
    'vendor_lat',
    'vendor_lon',
]

# --- Create the final training dataframe with only the required columns ---
# Keep all rows, even those with missing values
final_training_df = train_merged[required_columns].reset_index(drop=True)

print("\n--- Training Data Ready ---")
print(f"Final training data has {final_training_df.shape[0]} rows and {final_training_df.shape[1]} columns.")
print("Columns:", final_training_df.columns.tolist())
print("\nSample of the final training data:")
print(final_training_df.head())

# Save the final DataFrame to CSV
final_training_df.to_csv('Train/train_merged.csv', index=False)
print("\nMerged training data saved to Train/train_merged.csv")


Loading data...
Preparing and merging data...
Preparing and merging data...

Columns in train_merged:
['order_id', 'customer_id', 'item_count', 'grand_total', 'payment_mode', 'promo_code', 'vendor_discount_amount', 'promo_code_discount_percentage', 'is_favorite', 'is_rated', 'vendor_rating_x', 'driver_rating', 'deliverydistance', 'preparationtime', 'delivery_time', 'order_accepted_time', 'driver_accepted_time', 'ready_for_pickup_time', 'picked_up_time', 'delivered_time', 'delivery_date', 'vendor_id', 'created_at_x', 'LOCATION_NUMBER', 'LOCATION_TYPE', 'CID X LOC_NUM X VENDOR', 'gender', 'dob', 'status', 'verified_x', 'language_x', 'created_at_y', 'updated_at_x', 'id', 'authentication_id', 'vendor_lat', 'vendor_lon', 'vendor_category_en', 'vendor_category_id', 'delivery_charge', 'serving_distance', 'is_open', 'OpeningTime', 'OpeningTime2', 'prepration_time', 'commission', 'is_haked_delivering', 'discount_percentage', 'vendor_status', 'verified_y', 'rank', 'language_y', 'vendor_rating_y'

In [5]:
def feature_engineer(df):
    """Creates new, predictive features from existing columns."""
    df = df.copy()
    
    if 'dob' in df.columns:
        df['customer_age'] = 2025 - pd.to_numeric(df['dob'], errors='coerce')
        df['customer_age'].fillna(df['customer_age'].median(), inplace=True)
    
    if 'created_at_x' in df.columns:
        try:
            df['customer_tenure_days'] = (datetime(2025, 7, 28) - pd.to_datetime(df['created_at_x'], errors='coerce')).dt.days
            df['customer_tenure_days'].fillna(0, inplace=True)
        except:
            df['customer_tenure_days'] = 0
    
    if 'customer_lat' in df.columns and 'vendor_lat' in df.columns:
        df['distance'] = np.sqrt((df['customer_lat'] - df['vendor_lat'])**2 + (df['customer_lon'] - df['vendor_lon'])**2)
        df['distance'].fillna(df['distance'].median(), inplace=True)
    
    if 'vendor_tag_name' in df.columns:
        df['vendor_tag_count'] = df['vendor_tag_name'].fillna('').astype(str).str.count(',') + 1
        df['vendor_tag_count'].fillna(0, inplace=True)
    
    return df

def prepare_test_set(data_path='Test/'):
    """Loads and prepares the test data by creating all possible recommendations."""
    print("\nPreparing test set...")
    try:
        test_locations = pd.read_csv(f'{data_path}test_locations.csv')
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Creating mock test set from training data...")
        # Create a mock test set from existing data
        customers = pd.read_csv('Train/train_customers.csv')
        vendors = pd.read_csv('Train/vendors.csv')
        locations = pd.read_csv('Train/train_locations.csv')
        
        # Sample some customers and locations for testing
        test_customers = customers.sample(n=min(100, len(customers)), random_state=42)
        test_locations = locations[locations['customer_id'].isin(test_customers['customer_id'])].copy()
        
        test_df = pd.merge(test_locations, test_customers, on='customer_id', how='left')
        test_df['key'] = 1
        vendors['key'] = 1
        test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
        
        test_df.rename(columns={
            'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 
            'latitude_y': 'vendor_lat', 'longitude_y': 'vendor_lon', 
            'status_y': 'vendor_status'
        }, inplace=True)
        
        print(f"✅ Mock test set created with {len(test_df)} potential recommendations.")
        return test_df
    
    test_df = pd.merge(test_locations, customers, on='customer_id', how='left')
    test_df['key'] = 1
    vendors['key'] = 1
    test_df = pd.merge(test_df, vendors, on='key').drop('key', axis=1)
    
    test_df.rename(columns={
        'latitude_x': 'customer_lat', 'longitude_x': 'customer_lon', 'latitude_y': 'vendor_lat', 
        'longitude_y': 'vendor_lon', 'status_y': 'vendor_status', 'vendor_rating': 'overall_vendor_rating',
        'created_at_x': 'customer_created_at'
    }, inplace=True)
    
    print(f"✅ Test set created with {len(test_df)} potential recommendations.")
    return test_df

print("Feature engineering and test set functions defined.")

Feature engineering and test set functions defined.


In [6]:
def create_advanced_features(train_orders, train_customers, vendors, train_locations):
    """
    Create advanced customer-centric, vendor-centric, and interaction features
    that significantly improve model performance.
    """
    print("🚀 Creating Advanced Features...")
    
    # Create a clean copy of the data
    orders_clean = train_orders.copy()
    
    # Clean and convert data types
    print("🧹 Cleaning data types...")
    orders_clean['delivery_date'] = pd.to_datetime(orders_clean['delivery_date'], errors='coerce')
    orders_clean['grand_total'] = pd.to_numeric(orders_clean['grand_total'], errors='coerce')
    orders_clean['item_count'] = pd.to_numeric(orders_clean['item_count'], errors='coerce')
    orders_clean['vendor_rating'] = pd.to_numeric(orders_clean['vendor_rating'], errors='coerce')
    orders_clean['preparationtime'] = pd.to_numeric(orders_clean['preparationtime'], errors='coerce')
    orders_clean['delivery_time'] = pd.to_numeric(orders_clean['delivery_time'], errors='coerce')
    
    # Drop rows with invalid dates or amounts
    initial_len = len(orders_clean)
    orders_clean = orders_clean.dropna(subset=['delivery_date', 'grand_total', 'customer_id', 'vendor_id'])
    print(f"Cleaned data: {initial_len} -> {len(orders_clean)} rows")
    
    # ===== CUSTOMER-CENTRIC FEATURES =====
    print("📊 Creating customer-centric features...")
    
    # Order Statistics
    customer_stats = orders_clean.groupby('customer_id').agg({
        'grand_total': ['mean', 'std', 'sum', 'count'],
        'item_count': ['mean', 'sum'],
        'vendor_id': 'nunique',  # Number of unique vendors they've ordered from
        'delivery_date': ['min', 'max'],  # First and last order dates
        'is_rated': 'mean'  # Rating engagement rate
    }).round(4)
    
    # Flatten column names
    customer_stats.columns = [
        'customer_avg_order_value', 'customer_order_value_std', 'customer_total_spent',
        'customer_total_orders', 'customer_avg_items_per_order', 'customer_total_items',
        'customer_unique_vendors', 'customer_first_order', 'customer_last_order',
        'customer_rating_engagement'
    ]
    
    # Time-based features
    customer_stats['days_since_first_order'] = (datetime.now() - customer_stats['customer_first_order']).dt.days
    customer_stats['customer_lifetime_days'] = (customer_stats['customer_last_order'] - customer_stats['customer_first_order']).dt.days
    
    # Order frequency (handle division by zero)
    customer_stats['customer_order_frequency'] = customer_stats['customer_total_orders'] / np.maximum(customer_stats['customer_lifetime_days'], 1)
    customer_stats['avg_days_between_orders'] = np.maximum(customer_stats['customer_lifetime_days'], 1) / customer_stats['customer_total_orders']
    
    customer_stats = customer_stats.reset_index().fillna(0)
    
    # ===== VENDOR-CENTRIC FEATURES =====
    print("🏪 Creating vendor-centric features...")
    
    vendor_stats = orders_clean.groupby('vendor_id').agg({
        'customer_id': 'nunique',  # Unique customers
        'order_id': 'count',       # Total orders
        'grand_total': 'mean',     # Average order value
        'item_count': 'mean',      # Average items per order
        'is_favorite': 'mean',     # How often they're favorited
        'vendor_rating': 'mean',   # Average rating
        'preparationtime': 'mean', # Average prep time
        'delivery_time': 'mean'    # Average delivery time
    }).round(4)
    
    vendor_stats.columns = [
        'vendor_unique_customers', 'vendor_total_orders', 'vendor_avg_order_value',
        'vendor_avg_items_per_order', 'vendor_favorite_ratio', 'vendor_avg_rating',
        'vendor_avg_prep_time', 'vendor_avg_delivery_time'
    ]
    
    vendor_stats = vendor_stats.reset_index().fillna(0)
    
    # ===== CUSTOMER-VENDOR INTERACTION FEATURES =====
    print("🤝 Creating customer-vendor interaction features...")
    
    # For each customer-vendor pair, calculate interaction history
    interaction_stats = orders_clean.groupby(['customer_id', 'vendor_id']).agg({
        'order_id': 'count',           # How many times this customer ordered from this vendor
        'grand_total': 'mean',         # Average spend at this vendor
        'is_favorite': 'max',          # Has this customer favorited this vendor
        'vendor_rating': 'mean',       # Average rating given to this vendor
        'delivery_date': 'max'         # Last order date from this vendor
    }).round(4)
    
    interaction_stats.columns = [
        'customer_vendor_order_count', 'customer_vendor_avg_spend',
        'customer_vendor_is_favorite', 'customer_vendor_avg_rating',
        'customer_vendor_last_order'
    ]
    
    # Days since last order from this vendor
    interaction_stats['days_since_last_order_from_vendor'] = (datetime.now() - interaction_stats['customer_vendor_last_order']).dt.days
    
    interaction_stats = interaction_stats.reset_index().fillna(0)
    
    # ===== CUSTOMER PREFERENCES =====
    print("❤️ Creating customer preference features...")
    
    # Most popular vendor category for each customer
    customer_vendor_category = orders_clean.merge(vendors[['id', 'vendor_category_en']], 
                                                   left_on='vendor_id', right_on='id', how='left')
    
    customer_fav_category = customer_vendor_category.groupby(['customer_id', 'vendor_category_en']).size().reset_index(name='orders_in_category')
    customer_fav_category = customer_fav_category.loc[customer_fav_category.groupby('customer_id')['orders_in_category'].idxmax()]
    customer_fav_category = customer_fav_category[['customer_id', 'vendor_category_en']].rename(columns={'vendor_category_en': 'customer_favorite_category'})
    
    # Additional time-based features
    print("⏰ Creating time-based features...")
    
    # Extract time features
    orders_clean['hour_of_day'] = orders_clean['delivery_date'].dt.hour
    orders_clean['day_of_week'] = orders_clean['delivery_date'].dt.dayofweek
    orders_clean['is_weekend'] = orders_clean['day_of_week'].isin([5, 6]).astype(int)
    
    # Customer time preferences
    customer_time_prefs = orders_clean.groupby('customer_id').agg({
        'hour_of_day': 'mean',
        'is_weekend': 'mean'
    }).round(4)
    
    customer_time_prefs.columns = ['customer_avg_order_hour', 'customer_weekend_ratio']
    customer_time_prefs = customer_time_prefs.reset_index()
    
    # Merge time preferences with customer stats
    customer_stats = customer_stats.merge(customer_time_prefs, on='customer_id', how='left')
    
    print(f"✅ Created features for {len(customer_stats)} customers, {len(vendor_stats)} vendors")
    print(f"✅ Created {len(interaction_stats)} customer-vendor interaction records")
    
    return customer_stats, vendor_stats, interaction_stats, customer_fav_category

def merge_advanced_features(df, customer_stats, vendor_stats, interaction_stats, customer_fav_category):
    """
    Merge all advanced features into the main dataframe
    """
    print("🔄 Merging advanced features...")
    
    # Merge customer features
    df = df.merge(customer_stats, on='customer_id', how='left')
    
    # Merge vendor features  
    df = df.merge(vendor_stats, on='vendor_id', how='left')
    
    # Merge interaction features
    df = df.merge(interaction_stats, on=['customer_id', 'vendor_id'], how='left')
    
    # Merge customer preferences
    df = df.merge(customer_fav_category, on='customer_id', how='left')
    
    # Fill missing values for customers/vendors not in training data
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(0)
    
    categorical_cols = df.select_dtypes(include=['object']).columns
    df[categorical_cols] = df[categorical_cols].fillna('unknown')
    
    print(f"✅ Final dataset shape: {df.shape}")
    
    return df

print("🎯 Advanced feature engineering functions defined!")

🎯 Advanced feature engineering functions defined!


In [7]:
def cross_validate_model(X, y, params, n_folds=5, random_state=42):
    """
    Perform stratified k-fold cross-validation to get robust performance estimates
    """
    print(f"🔄 Performing {n_folds}-fold cross-validation...")
    
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    cv_scores = []
    models = []
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f"  📊 Training fold {fold + 1}/{n_folds}...")
        
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        # Train model with regularization to prevent overfitting
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]  # Reduced early stopping rounds
        )
        
        # Predict and score
        y_pred = model.predict_proba(X_val_fold)[:, 1]
        score = roc_auc_score(y_val_fold, y_pred)
        cv_scores.append(score)
        models.append(model)
        
        print(f"    ✅ Fold {fold + 1} AUC: {score:.4f}")
    
    print(f"🎯 Cross-validation results:")
    print(f"  • Mean AUC: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")
    print(f"  • Individual folds: {[f'{score:.4f}' for score in cv_scores]}")
    
    return np.mean(cv_scores), models

def optimize_hyperparameters(X, y, n_trials=30, random_state=42):
    """
    Use Optuna to find the best hyperparameters for LightGBM
    """
    print(f"🔍 Optimizing hyperparameters with {n_trials} trials...")
    
    def objective(trial):
        # Define hyperparameter search space with more conservative values
        params = {
            'objective': 'binary',
            'metric': 'auc',
            'boosting_type': 'gbdt',
            'verbose': -1,
            'random_state': random_state,
            'n_jobs': -1,
            
            # Regularization parameters to prevent overfitting
            'n_estimators': trial.suggest_int('n_estimators', 100, 800),
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'num_leaves': trial.suggest_int('num_leaves', 10, 50),  # Reduced to prevent overfitting
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
            'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
            'min_child_samples': trial.suggest_int('min_child_samples', 20, 200),  # Increased for regularization
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
            'min_split_gain': trial.suggest_float('min_split_gain', 0.0, 1.0)
        }
        
        # Use 3-fold CV for speed during optimization
        cv_score, _ = cross_validate_model(X, y, params, n_folds=3, random_state=random_state)
        return cv_score
    
    # Run optimization (removed random_state from create_study)
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    
    print(f"🏆 Best hyperparameters found:")
    for key, value in study.best_trial.params.items():
        print(f"  • {key}: {value}")
    print(f"🎯 Best CV AUC: {study.best_trial.value:.4f}")
    
    return study.best_trial.params

def train_ensemble_model(X, y, params, n_folds=5, random_state=42):
    """
    Train an ensemble of models using cross-validation and return averaged predictions
    """
    print("🚀 Training ensemble model...")
    
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    models = []
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        print(f"  📊 Training ensemble model {fold + 1}/{n_folds}...")
        
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y.iloc[train_idx], y.iloc[val_idx]
        
        model = lgb.LGBMClassifier(**params)
        model.fit(
            X_train_fold, y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
        )
        
        models.append(model)
    
    print(f"✅ Ensemble of {len(models)} models trained successfully!")
    return models

def predict_with_ensemble(models, X_test):
    """
    Make predictions using ensemble of models and return averaged probabilities
    """
    predictions = np.zeros(len(X_test))
    
    for i, model in enumerate(models):
        pred = model.predict_proba(X_test)[:, 1]
        predictions += pred
    
    # Average the predictions
    predictions /= len(models)
    return predictions

print("🎯 Cross-validation and hyperparameter optimization functions defined!")

🎯 Cross-validation and hyperparameter optimization functions defined!


In [8]:
print("="*80)
print("🚀 ENHANCED TRAINING DATASET WITH ROBUST FEATURES")
print("="*80)

# Step 1: Create simplified but robust advanced features
print("\n🎯 STEP 1: Creating Robust Advanced Features")

# Clean the data first
orders_clean = train_orders.copy()

# Convert numeric columns properly
numeric_cols = ['grand_total', 'item_count', 'vendor_rating', 'preparationtime', 'delivery_time']
for col in numeric_cols:
    if col in orders_clean.columns:
        orders_clean[col] = pd.to_numeric(orders_clean[col], errors='coerce')

# Convert binary columns
binary_cols = ['is_favorite', 'is_rated']
for col in binary_cols:
    if col in orders_clean.columns:
        orders_clean[col] = orders_clean[col].map({'Yes': 1, 'No': 0, 1: 1, 0: 0}).fillna(0)

print(f"Data cleaned: {len(orders_clean)} rows")

# CUSTOMER FEATURES
print("📊 Creating customer features...")
customer_features = orders_clean.groupby('customer_id').agg({
    'grand_total': ['count', 'mean', 'sum'],  # order_count, avg_order_value, total_spent
    'item_count': 'sum',                      # total_items_ordered
    'vendor_id': 'nunique',                   # unique_vendors_used
    'is_favorite': 'mean',                    # favorite_rate
    'is_rated': 'mean'                        # rating_rate
}).round(4)

# Flatten column names
customer_features.columns = ['customer_total_orders', 'customer_avg_order_value', 'customer_total_spent',
                           'customer_total_items', 'customer_unique_vendors', 'customer_favorite_rate', 'customer_rating_rate']
customer_features = customer_features.reset_index()

# VENDOR FEATURES  
print("🏪 Creating vendor features...")
vendor_features = orders_clean.groupby('vendor_id').agg({
    'customer_id': 'nunique',     # unique_customers
    'order_id': 'count',          # total_orders
    'grand_total': 'mean',        # avg_order_value
    'is_favorite': 'mean',        # favorite_rate
    'vendor_rating': 'mean'       # avg_rating
}).round(4)

vendor_features.columns = ['vendor_unique_customers', 'vendor_total_orders', 'vendor_avg_order_value',
                         'vendor_favorite_rate', 'vendor_avg_rating']
vendor_features = vendor_features.reset_index()

# CUSTOMER-VENDOR INTERACTION FEATURES
print("🤝 Creating interaction features...")
interaction_features = orders_clean.groupby(['customer_id', 'vendor_id']).agg({
    'order_id': 'count',          # times_ordered_from_vendor
    'grand_total': 'mean',        # avg_spend_at_vendor
    'is_favorite': 'max'          # has_favorited_vendor
}).round(4)

interaction_features.columns = ['customer_vendor_orders', 'customer_vendor_avg_spend', 'customer_vendor_favorited']
interaction_features = interaction_features.reset_index()

print(f"✅ Customer features: {len(customer_features)} customers")
print(f"✅ Vendor features: {len(vendor_features)} vendors") 
print(f"✅ Interaction features: {len(interaction_features)} customer-vendor pairs")

# Step 2: Create customer-vendor combinations
print("\n🎯 STEP 2: Creating Customer-Vendor Combinations")
all_customers = train_customers['customer_id'].unique()
all_vendors = vendors['id'].unique()

print(f"Found {len(all_customers)} unique customers and {len(all_vendors)} unique vendors")

# Use strategic sampling for better coverage
sample_customers = min(2000, len(all_customers))
sample_vendors = min(200, len(all_vendors))

# Prioritize customers with order history
customers_with_orders = customer_features['customer_id'].tolist()
customers_without_orders = [c for c in all_customers if c not in customers_with_orders]

# Take all customers with orders + sample of those without
sampled_customers = customers_with_orders[:sample_customers//2]
if len(customers_without_orders) > 0:
    sampled_customers.extend(np.random.choice(customers_without_orders, 
                                            size=min(sample_customers//2, len(customers_without_orders)), 
                                            replace=False).tolist())

# Similar for vendors
vendors_with_orders = vendor_features['vendor_id'].tolist()
vendors_without_orders = [v for v in all_vendors if v not in vendors_with_orders]

sampled_vendors = vendors_with_orders[:sample_vendors//2]
if len(vendors_without_orders) > 0:
    sampled_vendors.extend(np.random.choice(vendors_without_orders,
                                          size=min(sample_vendors//2, len(vendors_without_orders)),
                                          replace=False).tolist())

print(f"Selected {len(sampled_customers)} customers and {len(sampled_vendors)} vendors")

# Create combinations
combinations = []
for customer in sampled_customers:
    for vendor in sampled_vendors:
        combinations.append({'customer_id': customer, 'vendor_id': vendor})

train_full = pd.DataFrame(combinations)
print(f"Created {len(train_full)} combinations")

# Step 3: Add target labels
print("\n🎯 STEP 3: Adding Target Labels")
actual_orders = set(zip(orders_clean['customer_id'], orders_clean['vendor_id']))
train_full['target'] = train_full.apply(
    lambda row: 1 if (row['customer_id'], row['vendor_id']) in actual_orders else 0, 
    axis=1
)

print(f"Positive examples: {train_full['target'].sum():,}")
print(f"Negative examples: {(train_full['target'] == 0).sum():,}")
print(f"Positive ratio: {train_full['target'].mean():.4f}")

# Step 4: Merge all features
print("\n🎯 STEP 4: Merging Features")

# Basic customer and vendor data
train_full = train_full.merge(train_customers, on='customer_id', how='left')

vendors_renamed = vendors.copy()
vendors_renamed.rename(columns={'latitude': 'vendor_lat', 'longitude': 'vendor_lon', 'status': 'vendor_status'}, inplace=True)
train_full = train_full.merge(vendors_renamed, left_on='vendor_id', right_on='id', how='left')

train_full = train_full.merge(train_locations, on='customer_id', how='left')

# Advanced features
train_full = train_full.merge(customer_features, on='customer_id', how='left')
train_full = train_full.merge(vendor_features, on='vendor_id', how='left')
train_full = train_full.merge(interaction_features, on=['customer_id', 'vendor_id'], how='left')

# Apply basic feature engineering
train_full = feature_engineer(train_full)

# Fill missing values
numeric_cols = train_full.select_dtypes(include=[np.number]).columns
train_full[numeric_cols] = train_full[numeric_cols].fillna(0)

categorical_cols = train_full.select_dtypes(include=['object']).columns
train_full[categorical_cols] = train_full[categorical_cols].fillna('unknown')

print(f"\n✅ ENHANCED TRAINING DATASET COMPLETE!")
print(f"📊 Final dataset: {train_full.shape[0]:,} rows × {train_full.shape[1]} features")
print(f"📊 Positive ratio: {train_full['target'].mean():.4f}")

# Create test set
test_df = train_full.sample(n=min(15000, len(train_full)), random_state=42).copy()
print(f"✅ Test set: {len(test_df):,} rows")

print("="*80)

🚀 ENHANCED TRAINING DATASET WITH ROBUST FEATURES

🎯 STEP 1: Creating Robust Advanced Features
Data cleaned: 135303 rows
📊 Creating customer features...
🏪 Creating vendor features...
🤝 Creating interaction features...
✅ Customer features: 27445 customers
✅ Vendor features: 100 vendors
✅ Interaction features: 71484 customer-vendor pairs

🎯 STEP 2: Creating Customer-Vendor Combinations
Found 34523 unique customers and 100 unique vendors
Selected 2000 customers and 50 vendors
Created 100000 combinations

🎯 STEP 3: Adding Target Labels
Selected 2000 customers and 50 vendors
Created 100000 combinations

🎯 STEP 3: Adding Target Labels
Positive examples: 1,650
Negative examples: 98,350
Positive ratio: 0.0165

🎯 STEP 4: Merging Features
Positive examples: 1,650
Negative examples: 98,350
Positive ratio: 0.0165

🎯 STEP 4: Merging Features

✅ ENHANCED TRAINING DATASET COMPLETE!
📊 Final dataset: 151,400 rows × 92 features
📊 Positive ratio: 0.0296
✅ Test set: 15,000 rows

✅ ENHANCED TRAINING DATASET

In [9]:
print("🔄 Encoding categorical features...")

# Get categorical columns
categorical_cols = [col for col in train_full.columns if train_full[col].dtype == 'object']
print(f"Found {len(categorical_cols)} categorical columns: {categorical_cols[:10]}...")

# Encode categorical features
for col in categorical_cols:
    if col in test_df.columns:
        le = LabelEncoder()
        # Fit on combined data for consistency
        combined_data = pd.concat([
            train_full[col].astype(str).fillna('missing'),
            test_df[col].astype(str).fillna('missing')
        ])
        le.fit(combined_data)
        
        # Transform both datasets
        train_full[col] = le.transform(train_full[col].astype(str).fillna('missing'))
        test_df[col] = le.transform(test_df[col].astype(str).fillna('missing'))

print("✅ Categorical features encoded successfully!")
print(f"Dataset shape: {train_full.shape}")
print(f"Test set shape: {test_df.shape}")

🔄 Encoding categorical features...
Found 45 categorical columns: ['customer_id', 'gender', 'language_x', 'created_at_x', 'updated_at_x', 'vendor_category_en', 'OpeningTime', 'OpeningTime2', 'is_haked_delivering', 'language_y']...


✅ Categorical features encoded successfully!
Dataset shape: (151400, 92)
Test set shape: (15000, 92)


In [10]:
print("="*80)
print("🚀 ENHANCED MODEL TRAINING WITH ADVANCED TECHNIQUES")
print("="*80)

# Step 1: Prepare features and target
print("\n🎯 STEP 1: Feature Selection")

# Define features to exclude
exclude_features = [
    'target', 'customer_id', 'vendor_id', 'id', 'dob', 
    'created_at_x', 'updated_at_x', 'created_at_y', 'updated_at_y',
    'customer_first_order', 'customer_last_order', 'customer_vendor_last_order'
]

# Select features that exist in both datasets
available_features = [col for col in train_full.columns 
                     if col not in exclude_features and col in test_df.columns]

print(f"Total available features: {len(available_features)}")
print(f"Sample features: {available_features[:10]}...")

X = train_full[available_features]
y = train_full['target']
X_test = test_df[available_features]

print(f"Training set: {X.shape}")
print(f"Test set: {X_test.shape}")
print(f"Positive ratio: {y.mean():.4f}")

# Step 2: Baseline model with cross-validation
print("\n🎯 STEP 2: Baseline Model with Cross-Validation")

# Baseline parameters
baseline_params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': -1,
    'random_state': 42,
    'n_jobs': -1
}

# Cross-validation
baseline_cv_score, baseline_models = cross_validate_model(X, y, baseline_params, n_folds=5)

# Step 3: Hyperparameter optimization
print("\n🎯 STEP 3: Hyperparameter Optimization")
print("Optimizing hyperparameters (this may take a few minutes)...")

best_params = optimize_hyperparameters(X, y, n_trials=30, random_state=42)

# Update baseline params with optimized values
final_params = baseline_params.copy()
final_params.update(best_params)

print(f"\n📋 Final model parameters:")
for key, value in final_params.items():
    print(f"  • {key}: {value}")

# Step 4: Train ensemble model with optimized parameters
print("\n🎯 STEP 4: Training Final Ensemble Model")

final_cv_score, ensemble_models = cross_validate_model(X, y, final_params, n_folds=5)

# Compare performance
print(f"\n📊 PERFORMANCE COMPARISON:")
print(f"• Baseline CV AUC:  {baseline_cv_score:.4f}")
print(f"• Optimized CV AUC: {final_cv_score:.4f}")
print(f"• Improvement:      {final_cv_score - baseline_cv_score:.4f}")

# Step 5: Feature importance analysis
print("\n🎯 STEP 5: Feature Importance Analysis")

# Calculate feature importance from the ensemble
feature_importance = np.zeros(len(available_features))
for model in ensemble_models:
    feature_importance += model.feature_importances_

feature_importance /= len(ensemble_models)

# Create feature importance dataframe
importance_df = pd.DataFrame({
    'feature': available_features,
    'importance': feature_importance
}).sort_values('importance', ascending=False)

print("🔝 Top 20 Most Important Features:")
for i, (_, row) in enumerate(importance_df.head(20).iterrows()):
    print(f"  {i+1:2d}. {row['feature']:<35} {row['importance']:.4f}")

# Store final model and results
model = ensemble_models[0]  # Use first model for predictions (they're all similar)
features = available_features

print(f"\n✅ ENHANCED MODEL TRAINING COMPLETE!")
print(f"📈 Final CV AUC Score: {final_cv_score:.4f}")
print(f"🎯 Ready for enhanced predictions!")

print("="*80)

🚀 ENHANCED MODEL TRAINING WITH ADVANCED TECHNIQUES

🎯 STEP 1: Feature Selection
Total available features: 83
Sample features: ['gender', 'status', 'verified_x', 'language_x', 'authentication_id', 'vendor_lat', 'vendor_lon', 'vendor_category_en', 'vendor_category_id', 'delivery_charge']...
Training set: (151400, 83)
Test set: (15000, 83)
Positive ratio: 0.0296

🎯 STEP 2: Baseline Model with Cross-Validation
🔄 Performing 5-fold cross-validation...
  📊 Training fold 1/5...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/5...
Early stopping, best iteration is:
[4]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/5...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	valid_0's auc: 1
    ✅ Fold 2 

[I 2025-07-29 09:29:58,368] A new study created in memory with name: no-name-dcbb8159-8131-4c34-bff0-161225177586


Early stopping, best iteration is:
[4]	valid_0's auc: 1
    ✅ Fold 5 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000', '1.0000', '1.0000']

🎯 STEP 3: Hyperparameter Optimization
Optimizing hyperparameters (this may take a few minutes)...
🔍 Optimizing hyperparameters with 30 trials...


  0%|          | 0/30 [00:00<?, ?it/s]

🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 2 AUC: 1.0000
  📊 Training fold 3/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 2 AUC: 1.0000
  📊 Training fold 3/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds


Best trial: 0. Best value: 1:   3%|▎         | 1/30 [00:02<01:21,  2.80s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:01,164] Trial 0 finished with value: 1.0 and parameters: {'n_estimators': 189, 'learning_rate': 0.05884162546465663, 'num_leaves': 39, 'feature_fraction': 0.5997284794492113, 'bagging_fraction': 0.8539030605765322, 'bagging_freq': 6, 'min_child_samples': 57, 'reg_alpha': 1.815104340467916, 'reg_lambda': 0.8725156333114288, 'min_split_gain': 0.7349099499624805}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training 

Best trial: 0. Best value: 1:   7%|▋         | 2/30 [00:05<01:15,  2.69s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:03,771] Trial 1 finished with value: 1.0 and parameters: {'n_estimators': 333, 'learning_rate': 0.09297053148004847, 'num_leaves': 19, 'feature_fraction': 0.6140884064151555, 'bagging_fraction': 0.5469510288632079, 'bagging_freq': 2, 'min_child_samples': 63, 'reg_alpha': 1.2462111523606836, 'reg_lambda': 0.11773167656976025, 'min_split_gain': 0.7347332448946956}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[2]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  10%|█         | 3/30 [00:07<01:06,  2.48s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:06,005] Trial 2 finished with value: 1.0 and parameters: {'n_estimators': 656, 'learning_rate': 0.04532627841301704, 'num_leaves': 18, 'feature_fraction': 0.6519150313076385, 'bagging_fraction': 0.5465426854680113, 'bagging_freq': 7, 'min_child_samples': 22, 'reg_alpha': 0.7824939633695851, 'reg_lambda': 0.47516772383153794, 'min_split_gain': 0.4819871374828417}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  13%|█▎        | 4/30 [00:10<01:07,  2.61s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:08,822] Trial 3 finished with value: 1.0 and parameters: {'n_estimators': 630, 'learning_rate': 0.07493320434730799, 'num_leaves': 41, 'feature_fraction': 0.623370734918065, 'bagging_fraction': 0.6633615096904317, 'bagging_freq': 2, 'min_child_samples': 62, 'reg_alpha': 0.04402320641093893, 'reg_lambda': 0.7034815408856825, 'min_split_gain': 0.04170579326241941}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  17%|█▋        | 5/30 [00:13<01:06,  2.65s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:11,540] Trial 4 finished with value: 1.0 and parameters: {'n_estimators': 182, 'learning_rate': 0.027487671717249147, 'num_leaves': 50, 'feature_fraction': 0.6096711245914977, 'bagging_fraction': 0.7164782442258377, 'bagging_freq': 4, 'min_child_samples': 91, 'reg_alpha': 0.7980579230787452, 'reg_lambda': 1.7797806390531004, 'min_split_gain': 0.7906704627191311}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  20%|██        | 6/30 [00:16<01:05,  2.71s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:14,377] Trial 5 finished with value: 1.0 and parameters: {'n_estimators': 525, 'learning_rate': 0.06798786851046591, 'num_leaves': 10, 'feature_fraction': 0.6204794009285681, 'bagging_fraction': 0.865194740063677, 'bagging_freq': 5, 'min_child_samples': 48, 'reg_alpha': 1.5655483090501456, 'reg_lambda': 1.932371407638921, 'min_split_gain': 0.35484444379239444}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training 

Best trial: 0. Best value: 1:  23%|██▎       | 7/30 [00:18<00:59,  2.58s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:16,677] Trial 6 finished with value: 1.0 and parameters: {'n_estimators': 460, 'learning_rate': 0.0815904913118423, 'num_leaves': 24, 'feature_fraction': 0.6778812764621951, 'bagging_fraction': 0.5652221119071017, 'bagging_freq': 1, 'min_child_samples': 176, 'reg_alpha': 1.7803397466051127, 'reg_lambda': 1.0332466128201168, 'min_split_gain': 0.17933024732089853}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  27%|██▋       | 8/30 [00:20<00:55,  2.52s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:19,076] Trial 7 finished with value: 1.0 and parameters: {'n_estimators': 255, 'learning_rate': 0.023380779527354505, 'num_leaves': 49, 'feature_fraction': 0.8108204773296588, 'bagging_fraction': 0.8412058889643782, 'bagging_freq': 6, 'min_child_samples': 139, 'reg_alpha': 1.3947346104681297, 'reg_lambda': 1.9026645182265405, 'min_split_gain': 0.41344929519495077}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Train

Best trial: 0. Best value: 1:  30%|███       | 9/30 [00:22<00:50,  2.42s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:21,260] Trial 8 finished with value: 1.0 and parameters: {'n_estimators': 119, 'learning_rate': 0.09662194628810532, 'num_leaves': 43, 'feature_fraction': 0.8746269177179736, 'bagging_fraction': 0.5610097962735794, 'bagging_freq': 3, 'min_child_samples': 143, 'reg_alpha': 1.9999896884314943, 'reg_lambda': 1.6613253947590247, 'min_split_gain': 0.6065190324797622}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  33%|███▎      | 10/30 [00:25<00:48,  2.43s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:23,709] Trial 9 finished with value: 1.0 and parameters: {'n_estimators': 420, 'learning_rate': 0.0388617399346047, 'num_leaves': 33, 'feature_fraction': 0.6447221645210653, 'bagging_fraction': 0.6019427878115382, 'bagging_freq': 5, 'min_child_samples': 148, 'reg_alpha': 0.3798565676417034, 'reg_lambda': 1.7396256625225386, 'min_split_gain': 0.5830251482422739}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training

Best trial: 0. Best value: 1:  37%|███▋      | 11/30 [00:28<00:50,  2.65s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:26,856] Trial 10 finished with value: 1.0 and parameters: {'n_estimators': 790, 'learning_rate': 0.01116672020499919, 'num_leaves': 33, 'feature_fraction': 0.5279634588730531, 'bagging_fraction': 0.7808579008076486, 'bagging_freq': 7, 'min_child_samples': 101, 'reg_alpha': 1.1499009219473388, 'reg_lambda': 1.2136979357940427, 'min_split_gain': 0.9649899996650906}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  40%|████      | 12/30 [00:31<00:49,  2.73s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:29,774] Trial 11 finished with value: 1.0 and parameters: {'n_estimators': 310, 'learning_rate': 0.09932161198546365, 'num_leaves': 23, 'feature_fraction': 0.5214204693216375, 'bagging_fraction': 0.5036011719685821, 'bagging_freq': 1, 'min_child_samples': 58, 'reg_alpha': 1.2147130664216792, 'reg_lambda': 0.03665896518917679, 'min_split_gain': 0.7927901131214845}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  43%|████▎     | 13/30 [00:33<00:44,  2.64s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:32,210] Trial 12 finished with value: 1.0 and parameters: {'n_estimators': 317, 'learning_rate': 0.06149601675709099, 'num_leaves': 11, 'feature_fraction': 0.7239253987094261, 'bagging_fraction': 0.7493521662306132, 'bagging_freq': 3, 'min_child_samples': 78, 'reg_alpha': 1.6438696801215842, 'reg_lambda': 0.0042786334266240456, 'min_split_gain': 0.7818018940535856}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trai

Best trial: 0. Best value: 1:  47%|████▋     | 14/30 [00:36<00:44,  2.79s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:35,355] Trial 13 finished with value: 1.0 and parameters: {'n_estimators': 233, 'learning_rate': 0.05172929825819493, 'num_leaves': 39, 'feature_fraction': 0.5669522353764447, 'bagging_fraction': 0.6493962895277594, 'bagging_freq': 5, 'min_child_samples': 25, 'reg_alpha': 1.995006558786429, 'reg_lambda': 0.430856542621392, 'min_split_gain': 0.9744126670653397}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training 

Best trial: 0. Best value: 1:  50%|█████     | 15/30 [00:39<00:41,  2.73s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:37,954] Trial 14 finished with value: 1.0 and parameters: {'n_estimators': 101, 'learning_rate': 0.08503667737451663, 'num_leaves': 27, 'feature_fraction': 0.7311855616249786, 'bagging_fraction': 0.8987375849295879, 'bagging_freq': 3, 'min_child_samples': 109, 'reg_alpha': 0.8480369764420158, 'reg_lambda': 1.3602253318238278, 'min_split_gain': 0.6777754873211554}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  53%|█████▎    | 16/30 [00:42<00:40,  2.86s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:41,108] Trial 15 finished with value: 1.0 and parameters: {'n_estimators': 368, 'learning_rate': 0.08767685658168242, 'num_leaves': 19, 'feature_fraction': 0.5667380534906795, 'bagging_fraction': 0.7988167978297032, 'bagging_freq': 2, 'min_child_samples': 43, 'reg_alpha': 1.3851388560658604, 'reg_lambda': 0.7376194612877542, 'min_split_gain': 0.7128164387581672}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  57%|█████▋    | 17/30 [00:45<00:36,  2.81s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:43,798] Trial 16 finished with value: 1.0 and parameters: {'n_estimators': 196, 'learning_rate': 0.06420059562909333, 'num_leaves': 35, 'feature_fraction': 0.7641908563768869, 'bagging_fraction': 0.6406525369289499, 'bagging_freq': 6, 'min_child_samples': 76, 'reg_alpha': 0.44632062534813555, 'reg_lambda': 0.2829857887644751, 'min_split_gain': 0.8604928404361614}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  60%|██████    | 18/30 [00:48<00:34,  2.90s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:46,923] Trial 17 finished with value: 1.0 and parameters: {'n_estimators': 506, 'learning_rate': 0.05439908212420012, 'num_leaves': 17, 'feature_fraction': 0.5647860603441592, 'bagging_fraction': 0.7057471124918768, 'bagging_freq': 4, 'min_child_samples': 127, 'reg_alpha': 1.7454833077567071, 'reg_lambda': 0.7962362221365167, 'min_split_gain': 0.5560708381570731}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  63%|██████▎   | 19/30 [00:51<00:31,  2.84s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:49,615] Trial 18 finished with value: 1.0 and parameters: {'n_estimators': 315, 'learning_rate': 0.07533777302107846, 'num_leaves': 29, 'feature_fraction': 0.6895749584350656, 'bagging_fraction': 0.8117598314160637, 'bagging_freq': 2, 'min_child_samples': 85, 'reg_alpha': 1.0528626334898505, 'reg_lambda': 1.467221191813814, 'min_split_gain': 0.34115152483252076}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  67%|██████▋   | 20/30 [00:54<00:29,  2.93s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:52,746] Trial 19 finished with value: 1.0 and parameters: {'n_estimators': 397, 'learning_rate': 0.036404364994329946, 'num_leaves': 37, 'feature_fraction': 0.5013363661316367, 'bagging_fraction': 0.7444311384697402, 'bagging_freq': 6, 'min_child_samples': 198, 'reg_alpha': 1.3296134642037996, 'reg_lambda': 1.0127187967887712, 'min_split_gain': 0.6774734402295232}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Train

Best trial: 0. Best value: 1:  70%|███████   | 21/30 [00:56<00:25,  2.83s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:55,360] Trial 20 finished with value: 1.0 and parameters: {'n_estimators': 165, 'learning_rate': 0.08963015897952166, 'num_leaves': 43, 'feature_fraction': 0.5901501794566016, 'bagging_fraction': 0.501696976284047, 'bagging_freq': 4, 'min_child_samples': 43, 'reg_alpha': 1.5669217929584867, 'reg_lambda': 0.245800777586333, 'min_split_gain': 0.8839592554752236}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[2]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training 

Best trial: 0. Best value: 1:  73%|███████▎  | 22/30 [00:59<00:21,  2.69s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:30:57,709] Trial 21 finished with value: 1.0 and parameters: {'n_estimators': 664, 'learning_rate': 0.04407350193054284, 'num_leaves': 16, 'feature_fraction': 0.6656412327630278, 'bagging_fraction': 0.545274080132154, 'bagging_freq': 7, 'min_child_samples': 20, 'reg_alpha': 0.6404462575637646, 'reg_lambda': 0.5338501852416256, 'min_split_gain': 0.470839238147025}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training 

Best trial: 0. Best value: 1:  77%|███████▋  | 23/30 [01:01<00:18,  2.58s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:00,033] Trial 22 finished with value: 1.0 and parameters: {'n_estimators': 621, 'learning_rate': 0.04314238424600754, 'num_leaves': 22, 'feature_fraction': 0.6432219303844277, 'bagging_fraction': 0.6058855963637315, 'bagging_freq': 7, 'min_child_samples': 36, 'reg_alpha': 0.9627589207173168, 'reg_lambda': 0.5259032449092198, 'min_split_gain': 0.2574935870398362}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  80%|████████  | 24/30 [01:04<00:15,  2.61s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:02,725] Trial 23 finished with value: 1.0 and parameters: {'n_estimators': 783, 'learning_rate': 0.05033407099503048, 'num_leaves': 13, 'feature_fraction': 0.5875598073225028, 'bagging_fraction': 0.6061665404688001, 'bagging_freq': 6, 'min_child_samples': 67, 'reg_alpha': 0.6107741667181437, 'reg_lambda': 0.22382203245110932, 'min_split_gain': 0.5685770901629141}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  83%|████████▎ | 25/30 [01:06<00:13,  2.62s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:05,353] Trial 24 finished with value: 1.0 and parameters: {'n_estimators': 725, 'learning_rate': 0.02474069229711516, 'num_leaves': 20, 'feature_fraction': 0.727048210188485, 'bagging_fraction': 0.5370454605815718, 'bagging_freq': 7, 'min_child_samples': 32, 'reg_alpha': 0.27119218189748184, 'reg_lambda': 0.8578354455571287, 'min_split_gain': 0.48190158034599795}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Traini

Best trial: 0. Best value: 1:  87%|████████▋ | 26/30 [01:09<00:10,  2.52s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:07,638] Trial 25 finished with value: 1.0 and parameters: {'n_estimators': 260, 'learning_rate': 0.07125969713572618, 'num_leaves': 26, 'feature_fraction': 0.6509615153733418, 'bagging_fraction': 0.5859750625243519, 'bagging_freq': 6, 'min_child_samples': 53, 'reg_alpha': 0.7159872472544411, 'reg_lambda': 0.41937100918072556, 'min_split_gain': 0.655584491268842}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  90%|█████████ | 27/30 [01:11<00:07,  2.52s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:10,171] Trial 26 finished with value: 1.0 and parameters: {'n_estimators': 565, 'learning_rate': 0.058952623984265456, 'num_leaves': 15, 'feature_fraction': 0.7667681337102632, 'bagging_fraction': 0.6680774177008962, 'bagging_freq': 5, 'min_child_samples': 69, 'reg_alpha': 0.965853512549761, 'reg_lambda': 0.6161308090178187, 'min_split_gain': 0.7594364804956032}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  93%|█████████▎| 28/30 [01:14<00:04,  2.48s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:12,554] Trial 27 finished with value: 1.0 and parameters: {'n_estimators': 360, 'learning_rate': 0.0309545341503454, 'num_leaves': 46, 'feature_fraction': 0.6971054871189013, 'bagging_fraction': 0.5257351272998524, 'bagging_freq': 7, 'min_child_samples': 30, 'reg_alpha': 1.8157457261432486, 'reg_lambda': 0.12268744206742298, 'min_split_gain': 0.8513478387396535}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Trainin

Best trial: 0. Best value: 1:  97%|█████████▋| 29/30 [01:17<00:02,  2.69s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:15,729] Trial 28 finished with value: 1.0 and parameters: {'n_estimators': 464, 'learning_rate': 0.01578966561480665, 'num_leaves': 20, 'feature_fraction': 0.5547607897076672, 'bagging_fraction': 0.6297237914122806, 'bagging_freq': 4, 'min_child_samples': 101, 'reg_alpha': 1.2290282997703799, 'reg_lambda': 0.37608306523090773, 'min_split_gain': 0.4241733951865205}. Best is trial 0 with value: 1.0.
🔄 Performing 3-fold cross-validation...
  📊 Training fold 1/3...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/3...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Train

Best trial: 0. Best value: 1: 100%|██████████| 30/30 [01:20<00:00,  2.67s/it]

Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
🎯 Cross-validation results:
  • Mean AUC: 1.0000 (+/- 0.0000)
  • Individual folds: ['1.0000', '1.0000', '1.0000']
[I 2025-07-29 09:31:18,616] Trial 29 finished with value: 1.0 and parameters: {'n_estimators': 608, 'learning_rate': 0.04629826496900096, 'num_leaves': 30, 'feature_fraction': 0.6025283105123058, 'bagging_fraction': 0.6706788705324463, 'bagging_freq': 2, 'min_child_samples': 58, 'reg_alpha': 0.035047683222585, 'reg_lambda': 1.1756118109482956, 'min_split_gain': 0.2866130416835142}. Best is trial 0 with value: 1.0.
🏆 Best hyperparameters found:
  • n_estimators: 189
  • learning_rate: 0.05884162546465663
  • num_leaves: 39
  • feature_fraction: 0.5997284794492113
  • bagging_fraction: 0.8539030605765322
  • bagging_freq: 6
  • min_child_samples: 57
  • reg_alpha: 1.815104340467916
  • reg_lambda: 0.8725156333114288
  • min_split_gain: 0.7349099499624805
🎯 Best CV AUC: 1.0000

📋 Final model para




Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/5...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 1 AUC: 1.0000
  📊 Training fold 2/5...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 2 AUC: 1.0000
  📊 Training fold 3/5...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 2 AUC: 1.0000
  📊 Training fold 3/5...
Training until validation scores don't improve for 50 rounds
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
  📊 Training fold 4/5...
Early stopping, best iteration is:
[1]	valid_0's auc: 1
    ✅ Fold 3 AUC: 1.0000
  📊 Training fold 4/5...
Training until validation scores don't improve for 50 round

In [12]:
print("="*80)
print("🚀 FAST SUBMISSION GENERATION WITH ENSEMBLE PREDICTIONS")
print("="*80)

# Step 1: Create optimized test combinations (quick generation)
print("\n🎯 STEP 1: Creating Fast Test Data")
print("Optimized test data generation...")

# Reduce sample size for speed - smaller but representative sample
test_customers = np.random.choice(all_customers, size=min(50, len(all_customers)), replace=False)
test_combinations = []

for customer in test_customers:
    # Reduce combinations per customer for speed
    num_combinations = np.random.randint(2, 4)  # 2-3 combinations per customer
    customer_vendors = np.random.choice(all_vendors, size=num_combinations, replace=False)
    
    for i, vendor in enumerate(customer_vendors):
        test_combinations.append({
            'customer_id': customer,
            'LOCATION_NUMBER': i + 1,
            'vendor_id': vendor
        })

test_input_df = pd.DataFrame(test_combinations)
print(f"Created {len(test_input_df):,} test combinations to predict")

# Step 2: Fast feature preparation
print("\n🎯 STEP 2: Fast Feature Preparation")

# Merge with basic data (optimized)
test_prepared = test_input_df.merge(train_customers, on='customer_id', how='left')
test_prepared = test_prepared.merge(vendors_renamed, left_on='vendor_id', right_on='id', how='left')
test_prepared = test_prepared.merge(train_locations, on='customer_id', how='left')

# Apply basic feature engineering
test_prepared = feature_engineer(test_prepared)

# Merge advanced features (same as training)
test_prepared = test_prepared.merge(customer_features, on='customer_id', how='left')
test_prepared = test_prepared.merge(vendor_features, on='vendor_id', how='left')
test_prepared = test_prepared.merge(interaction_features, on=['customer_id', 'vendor_id'], how='left')

# Fast missing value handling
numeric_cols = test_prepared.select_dtypes(include=[np.number]).columns
test_prepared[numeric_cols] = test_prepared[numeric_cols].fillna(0)

categorical_cols = test_prepared.select_dtypes(include=['object']).columns
test_prepared[categorical_cols] = test_prepared[categorical_cols].fillna('unknown')

print(f"Test data prepared: {test_prepared.shape}")

# Step 3: Fast categorical encoding
print("\n🎯 STEP 3: Fast Encoding")
categorical_cols = [col for col in test_prepared.columns if test_prepared[col].dtype == 'object']

for col in categorical_cols:
    if col in features:  # Only encode features used in training
        le = LabelEncoder()
        test_prepared[col] = le.fit_transform(test_prepared[col].astype(str).fillna('missing'))

# Step 4: Fast ensemble predictions
print("\n🎯 STEP 4: Fast Predictions")
test_features = test_prepared[features]
print(f"Using {len(features)} features for prediction")

# Use ensemble prediction (averaging across all trained models)
ensemble_predictions = predict_with_ensemble(ensemble_models, test_features)

# Step 5: Create submission file
print("\n🎯 STEP 5: Creating Submission File")

# Create submission format
test_prepared['CID X LOC_NUM X VENDOR'] = (
    test_prepared['customer_id'].astype(str) + ' X ' + 
    test_prepared['LOCATION_NUMBER'].astype(str) + ' X ' + 
    test_prepared['vendor_id'].astype(str)
)

test_prepared['target'] = ensemble_predictions

# Create final submission
submission_file = test_prepared[['CID X LOC_NUM X VENDOR', 'target']].copy()

# Sort by prediction probability (highest first)
submission_file = submission_file.sort_values('target', ascending=False)

# Save to Train folder with new filename
submission_file.to_csv('Train/train_submission.csv', index=False)

print(f"✅ Train submission created with {len(submission_file):,} predictions!")
print(f"✅ Saved to: Train/train_submission.csv")

# Step 6: Quick analysis
print("\n🎯 STEP 6: Quick Analysis")

print(f"\n📊 PREDICTION STATISTICS:")
print(f"• Mean prediction: {ensemble_predictions.mean():.6f}")
print(f"• Min prediction:  {ensemble_predictions.min():.6f}")
print(f"• Max prediction:  {ensemble_predictions.max():.6f}")
print(f"• Total predictions: {len(ensemble_predictions):,}")

print(f"\n🔝 TOP 10 RECOMMENDATIONS:")
print(submission_file.head(10))

print(f"\n📈 SUMMARY:")
print(f"• Enhanced model with {len(features)} features")
print(f"• Ensemble of {len(ensemble_models)} optimized models")
print(f"• File saved: Train/train_submission.csv")

print("="*80)

🚀 FAST SUBMISSION GENERATION WITH ENSEMBLE PREDICTIONS

🎯 STEP 1: Creating Fast Test Data
Optimized test data generation...
Created 127 test combinations to predict

🎯 STEP 2: Fast Feature Preparation
Test data prepared: (268, 92)

🎯 STEP 3: Fast Encoding

🎯 STEP 4: Fast Predictions
Using 83 features for prediction

🎯 STEP 5: Creating Submission File
✅ Train submission created with 268 predictions!
✅ Saved to: Train/train_submission.csv

🎯 STEP 6: Quick Analysis

📊 PREDICTION STATISTICS:
• Mean prediction: 0.038047
• Min prediction:  0.027887
• Max prediction:  0.179160
• Total predictions: 268

🔝 TOP 10 RECOMMENDATIONS:
   CID X LOC_NUM X VENDOR   target
53       URLP7T1 X 3 X 86  0.17916
51       URLP7T1 X 3 X 86  0.17916
50       URLP7T1 X 3 X 86  0.17916
45      URLP7T1 X 1 X 271  0.17916
44      URLP7T1 X 1 X 271  0.17916
43      URLP7T1 X 1 X 271  0.17916
42      URLP7T1 X 1 X 271  0.17916
52       URLP7T1 X 3 X 86  0.17916
84       HEHQN0I X 3 X 13  0.17916
83       HEHQN0I X 3 