# ✈️ FlightRank 2025: Aeroclub RecSys Cup¶

**Personalized Flight Recommendations for Business Travelers**




![FlightRank Header](https://raw.githubusercontent.com/Ishita95-harvad/flightrank-2025-aeroclub-recsys-cup/main/header.png)

**A data-driven journey into flight recommendation systems. Presented by Aeroclub for the RecSys Cup 2025, this visual blends aviation with algorithmic precision—featuring a stylized aircraft built from numerical data points, set against a sleek grid backdrop. Innovation takes flight**

--------------------------------------------------------------------------

**Step 1: Initial Setup and Data Loading**

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os

# Set up notebook display options
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')

# Load the data
train = pd.read_parquet('/kaggle/input/flight-selection-prediction/train.parquet')
test = pd.read_parquet('/kaggle/input/flight-selection-prediction/test.parquet')
sample_sub = pd.read_parquet('/kaggle/input/flight-selection-prediction/sample_submission.parquet')

# Quick checks
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")

# Display first few rows
train.head()

**Step 2: Exploratory Data Analysis (EDA)**

In [None]:
# Basic statistics
train.describe(include='all')

# Target variable distribution
plt.figure(figsize=(8,5))
sns.countplot(x='selected', data=train)
plt.title('Target Variable Distribution')
plt.show()

# Check number of options per search session
options_per_search = train.groupby('ranker_id').size().value_counts()
plt.figure(figsize=(10,5))
options_per_search.plot(kind='bar')
plt.title('Number of Flight Options per Search Session')
plt.xlabel('Number of Options')
plt.ylabel('Count')
plt.show()

# Analyze numerical features
num_cols = train.select_dtypes(include=np.number).columns.tolist()
num_cols.remove('selected')  # Remove target

plt.figure(figsize=(15, 20))
for i, col in enumerate(num_cols[:15]):  # First 15 numerical columns
    plt.subplot(5, 3, i+1)
    sns.histplot(train[col], kde=True, bins=30)
    plt.title(col)
plt.tight_layout()
plt.show()

# Analyze categorical features
cat_cols = ['sex', 'nationality', 'frequentFlyer', 'isVip', 'bySelf', 'isAccess3D', 'corporateTariffCode']

plt.figure(figsize=(15, 15))
for i, col in enumerate(cat_cols):
    plt.subplot(3, 3, i+1)
    sns.countplot(x=col, data=train)
    plt.title(col)
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**Step 3: Feature Engineering**

In [4]:
# Feature engineering function
def create_features(df):
    # Copy dataframe to avoid SettingWithCopyWarning
    df = df.copy()
    
    # Price features
    df['price_per_km'] = df['totalPrice'] / (df['legs0_duration'] + df.get('legs1_duration', 0))
    
    # Time features
    df['departure_hour'] = pd.to_datetime(df['legs0_departureAt']).dt.hour
    df['is_weekend'] = pd.to_datetime(df['legs0_departureAt']).dt.weekday >= 5
    
    # Duration features
    df['total_duration'] = df['legs0_duration'] + df.get('legs1_duration', 0)
    df['duration_per_km'] = df['total_duration'] / df['totalPrice']
    
    # Cabin class features (assuming highest class determines overall class)
    df['max_cabin_class'] = df[[f'legs{i}_segments0_cabinClass' for i in [0,1] if f'legs{i}_segments0_cabinClass' in df.columns]].max(axis=1)
    
    # Baggage features
    for leg in [0,1]:
        seg_col = f'legs{leg}_segments0_baggageAllowance_quantity'
        if seg_col in df.columns:
            df[f'legs{leg}_baggage'] = df[seg_col]
    
    # Cancellation penalty features
    if 'miniRules0_monetaryAmount' in df.columns:
        df['total_cancellation_penalty'] = df['miniRules0_monetaryAmount'] + (df['totalPrice'] * df['miniRules0_percentage'] / 100)
    
    return df

# Apply feature engineering
train_fe = create_features(train)
test_fe = create_features(test)

# Display new features
train_fe[['price_per_km', 'departure_hour', 'is_weekend', 'total_duration', 'max_cabin_class']].head()

NameError: name 'train' is not defined

**Step 4: Data Preprocessing**

In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupKFold

# Identify columns to use
useful_cols = [
    # User features
    'sex', 'nationality', 'frequentFlyer', 'isVip', 'bySelf',
    
    # Flight features
    'totalPrice', 'taxes', 'legs0_duration', 'legs1_duration',
    'legs0_segments0_cabinClass', 'legs1_segments0_cabinClass',
    'legs0_segments0_baggageAllowance_quantity', 'legs1_segments0_baggageAllowance_quantity',
    
    # Engineered features
    'price_per_km', 'departure_hour', 'is_weekend', 'total_duration', 'max_cabin_class'
]

# Filter columns (keep only those present in both train and test)
available_cols = [col for col in useful_cols if col in train_fe.columns and col in test_fe.columns]
target = 'selected'

# Prepare data
X = train_fe[available_cols]
y = train_fe[target]
groups = train_fe['ranker_id']
X_test = test_fe[available_cols]

# Handle categorical variables
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    le = LabelEncoder()
    # Combine train and test to handle all categories
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Scale numerical features
num_cols = [col for col in available_cols if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Display processed data
X.head()

NameError: name 'train_fe' is not defined

**Step 5: Model Training with Cross-Validation**

In [None]:
import lightgbm as lgb
from sklearn.metrics import ndcg_score

# Custom evaluation metric for ranking
def ndcg_scorer(estimator, X, y):
    # Get probabilities
    y_pred = estimator.predict_proba(X)[:, 1]
    
    # Create groups for NDCG calculation
    unique_groups = groups.loc[X.index].unique()
    ndcg_scores = []
    
    for group in unique_groups:
        group_mask = (groups.loc[X.index] == group)
        y_true_group = y[group_mask]
        y_pred_group = y_pred[group_mask]
        
        # Reshape for ndcg_score
        y_true_group = y_true_group.values.reshape(1, -1)
        y_pred_group = y_pred_group.reshape(1, -1)
        
        ndcg_scores.append(ndcg_score(y_true_group, y_pred_group))
    
    return np.mean(ndcg_scores)

# Set up LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 31,
    'min_child_samples': 20,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'verbose': -1,
    'seed': 42
}

# GroupKFold cross-validation
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)
oof_preds = np.zeros(len(X))
models = []

for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\nFold {fold + 1}")
    
    # Split data
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # LightGBM dataset
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid, reference=train_set)
    
    # Train model
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    
    # Store model and predictions
    oof_preds[valid_idx] = model.predict(X_valid)
    models.append(model)
    
    # Calculate NDCG for this fold
    fold_ndcg = ndcg_scorer(model, X_valid, y_valid)
    print(f"Fold {fold + 1} NDCG: {fold_ndcg:.4f}")

# Overall OOF NDCG
unique_groups = groups.unique()
ndcg_scores = []

for group in unique_groups:
    group_mask = (groups == group)
    y_true_group = y[group_mask]
    y_pred_group = oof_preds[group_mask]
    
    y_true_group = y_true_group.values.reshape(1, -1)
    y_pred_group = y_pred_group.reshape(1, -1)
    
    ndcg_scores.append(ndcg_score(y_true_group, y_pred_group))

print(f"\nOverall OOF NDCG: {np.mean(ndcg_scores):.4f}")

**Step 6: Feature Importance Analysis**

In [None]:
# Plot feature importance
plt.figure(figsize=(12, 8))
lgb.plot_importance(models[0], max_num_features=20, importance_type='gain')
plt.title('Feature Importance (Gain)')
plt.show()

# Get feature importance dataframe
importance_df = pd.DataFrame({
    'feature': models[0].feature_name(),
    'importance': models[0].feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

importance_df.head(10)

**Step 7: Generate Predictions and Submission File**

In [None]:
# Generate test predictions (average across folds)
test_preds = np.zeros(len(X_test))
for model in models:
    test_preds += model.predict(X_test) / len(models)

# Create submission dataframe
submission = test[['Id', 'ranker_id']].copy()

# Assign ranks within each search session (ranker_id)
submission['selected'] = 0  # Initialize

for ranker_id in tqdm(submission['ranker_id'].unique()):
    mask = submission['ranker_id'] == ranker_id
    # Rank predictions (1 is best, higher numbers worse)
    submission.loc[mask, 'selected'] = (-test_preds[mask]).argsort().argsort() + 1

# Verify ranks are valid
for ranker_id in submission['ranker_id'].unique()[:5]:  # Check first 5 groups
    group_ranks = submission[submission['ranker_id'] == ranker_id]['selected']
    print(f"Group {ranker_id} ranks: {sorted(group_ranks.values)}")

# Save submission
submission.to_csv('submission.csv', index=False)

# Display sample submission
submission.head()

**Step 8: Advanced Techniques (Optional)**

In [None]:
# Optional: Hyperparameter Optimization
import optuna

def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'verbose': -1
    }
    
    # Use first fold for optimization
    train_idx, valid_idx = next(gkf.split(X, y, groups=groups))
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid)
    
    model = lgb.train(
        params,
        train_set,
        num_boost_round=500,
        valid_sets=[valid_set],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    return ndcg_scorer(model, X_valid, y_valid)

# Run optimization (commented out as it takes time)
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=20)
# print("Best trial:")
# trial = study.best_trial
# print(f"NDCG: {trial.value:.4f}")
# print(f"Best params: {trial.params}")

**JSON files**

In [None]:
# Example of processing JSONs in chunks (if needed)
import json
import tarfile

def process_json_chunks(tar_path, chunk_size=1000):
    features = []
    with tarfile.open(tar_path, 'r:gz') as tar:
        members = tar.getmembers()
        for i in range(0, len(members), chunk_size):
            chunk = members[i:i+chunk_size]
            for member in chunk:
                f = tar.extractfile(member)
                data = json.load(f)
                # Extract features from JSON
                # features.append(...)
    return pd.DataFrame(features)

## Verified Notebook Setup

In [None]:
# Import all required libraries (verified)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import os
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score
import json
import tarfile

# Set display options
pd.set_option('display.max_columns', 200)
plt.style.use('ggplot')

# Verify input directory structure
input_dir = '/kaggle/input/aeroclub-recsys-2025'
print("Input directory contents:")
print(os.listdir(input_dir))

# Load data with verified paths
train = pd.read_parquet(f'{input_dir}/train.parquet')
test = pd.read_parquet(f'{input_dir}/test.parquet')
sample_sub = pd.read_parquet(f'{input_dir}/sample_submission.parquet')

# Data verification
print(f"\nTrain shape: {train.shape}, columns: {train.columns.tolist()}")
print(f"Test shape: {test.shape}")
print(f"Sample submission shape: {sample_sub.shape}")
print("\nTrain target distribution:")
print(train['selected'].value_counts(normalize=True))

**Data Exploration and Visualization**

In [None]:
# Visualize target distribution
plt.figure(figsize=(10, 5))
train['selected'].value_counts().plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Target Variable Distribution (0=Not Selected, 1=Selected)')
plt.xticks(rotation=0)
plt.show()

# Analyze options per search session
options_per_search = train.groupby('ranker_id').size()
plt.figure(figsize=(12, 6))
options_per_search.value_counts().sort_index().plot(kind='bar', color='teal')
plt.title('Number of Flight Options per Search Session')
plt.xlabel('Number of Options')
plt.ylabel('Count of Search Sessions')
plt.show()

# Price analysis
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.histplot(train['totalPrice'], bins=50, color='royalblue')
plt.title('Distribution of Total Price')
plt.subplot(1, 2, 2)
sns.boxplot(x='selected', y='totalPrice', data=train)
plt.title('Price Distribution by Selection Status')
plt.tight_layout()
plt.show()

# Flight duration analysis
plt.figure(figsize=(15, 5))
plt.subplot(1, 2, 1)
sns.histplot(train['legs0_duration'], bins=50, color='purple')
plt.title('Outbound Flight Duration')
plt.subplot(1, 2, 2)
sns.boxplot(x='selected', y='legs0_duration', data=train)
plt.title('Duration Distribution by Selection Status')
plt.tight_layout()
plt.show()

**Feature Engineering (Updated)**

In [6]:
def create_features(df):
    df = df.copy()
    
    # Price features
    df['price_to_taxes_ratio'] = df['totalPrice'] / (df['taxes'] + 1)
    
    # Time features
    df['legs0_departure_hour'] = pd.to_datetime(df['legs0_departureAt']).dt.hour
    df['legs0_departure_day'] = pd.to_datetime(df['legs0_departureAt']).dt.dayofweek
    df['is_weekend'] = df['legs0_departure_day'] >= 5
    
    # Duration features
    df['total_duration'] = df['legs0_duration'] + df.get('legs1_duration', 0)
    df['duration_per_price'] = df['total_duration'] / (df['totalPrice'] + 1)
    
    # Cabin class features
    cabin_cols = [f'legs{i}_segments{j}_cabinClass' 
                 for i in [0,1] for j in [0] 
                 if f'legs{i}_segments{j}_cabinClass' in df.columns]
    if cabin_cols:
        df['max_cabin_class'] = df[cabin_cols].max(axis=1)
    
    # Baggage features
    baggage_cols = [f'legs{i}_segments0_baggageAllowance_quantity' 
                   for i in [0,1] 
                   if f'legs{i}_segments0_baggageAllowance_quantity' in df.columns]
    if baggage_cols:
        df['total_baggage'] = df[baggage_cols].sum(axis=1)
    
    # Cancellation features
    if 'miniRules0_monetaryAmount' in df.columns:
        df['total_cancellation_penalty'] = (
            df['miniRules0_monetaryAmount'] + 
            (df['totalPrice'] * df['miniRules0_percentage'] / 100)
    
    return df

# Apply feature engineering
train_fe = create_features(train)
test_fe = create_features(test)

# Display new features
print("Engineered features sample:")
display(train_fe[[
    'price_to_taxes_ratio', 
    'legs0_departure_hour',
    'total_duration',
    'max_cabin_class'
]].head())

SyntaxError: '(' was never closed (3859334657.py, line 32)

**Model Training with HitRate@3 Optimization**

In [7]:
# Prepare data for modeling
useful_cols = [
    # Basic features
    'totalPrice', 'taxes', 'legs0_duration', 'legs1_duration',
    
    # User features
    'sex', 'nationality', 'frequentFlyer', 'isVip',
    
    # Cabin class
    'legs0_segments0_cabinClass', 'legs1_segments0_cabinClass',
    
    # Engineered features
    'price_to_taxes_ratio', 'legs0_departure_hour', 
    'total_duration', 'max_cabin_class', 'total_baggage'
]

# Filter available columns
available_cols = [col for col in useful_cols if col in train_fe.columns and col in test_fe.columns]
target = 'selected'

X = train_fe[available_cols]
y = train_fe[target]
groups = train_fe['ranker_id']
X_test = test_fe[available_cols]

# Preprocessing
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

num_cols = [col for col in available_cols if col not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Custom HitRate@3 metric
def hit_rate_at_3(y_true, y_pred, groups):
    results = []
    for group in groups.unique():
        group_mask = (groups == group)
        y_true_group = y_true[group_mask]
        y_pred_group = y_pred[group_mask]
        
        # Get top 3 predictions
        top3 = y_pred_group.argsort()[-3:][::-1]
        hit = y_true_group.iloc[top3].sum() > 0
        results.append(hit)
    
    return np.mean(results)

# LightGBM parameters optimized for ranking
params = {
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting_type': 'gbdt',
    'learning_rate': 0.05,
    'num_leaves': 63,
    'min_child_samples': 30,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'bagging_freq': 1,
    'lambda_l1': 0.1,
    'lambda_l2': 0.1,
    'verbose': -1,
    'seed': 42
}

# GroupKFold cross-validation
n_folds = 5
gkf = GroupKFold(n_splits=n_folds)
oof_preds = np.zeros(len(X))
models = []
hit_rates = []

for fold, (train_idx, valid_idx) in enumerate(gkf.split(X, y, groups=groups)):
    print(f"\nFold {fold + 1}")
    
    X_train, X_valid = X.iloc[train_idx], X.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    groups_valid = groups.iloc[valid_idx]
    
    train_set = lgb.Dataset(X_train, y_train)
    valid_set = lgb.Dataset(X_valid, y_valid)
    
    model = lgb.train(
        params,
        train_set,
        num_boost_round=1000,
        valid_sets=[valid_set],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    
    oof_preds[valid_idx] = model.predict(X_valid)
    models.append(model)
    
    # Calculate HitRate@3 for this fold
    fold_hit_rate = hit_rate_at_3(y_valid, oof_preds[valid_idx], groups_valid)
    hit_rates.append(fold_hit_rate)
    print(f"Fold {fold + 1} HitRate@3: {fold_hit_rate:.4f}")

# Overall OOF HitRate@3
overall_hit_rate = hit_rate_at_3(y, oof_preds, groups)
print(f"\nOverall OOF HitRate@3: {overall_hit_rate:.4f}")
print(f"Average fold HitRate@3: {np.mean(hit_rates):.4f}")

NameError: name 'train_fe' is not defined

**Generate Submission File**

In [None]:
# Generate test predictions
test_preds = np.zeros(len(X_test))
for model in models:
    test_preds += model.predict(X_test) / len(models)

# Create submission with ranks
submission = test[['Id', 'ranker_id']].copy()

# Assign ranks within each search session (1 = best)
for ranker_id in tqdm(submission['ranker_id'].unique()):
    mask = submission['ranker_id'] == ranker_id
    submission.loc[mask, 'selected'] = (-test_preds[mask]).argsort().argsort() + 1

# Verify submission format
print("\nSubmission verification:")
print(f"Unique ranks in first group: {submission[submission['ranker_id'] == submission['ranker_id'].iloc[0]]['selected'].unique()}")

# Save submission
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: submission.csv")

# Display sample
print("\nSample submission:")
display(submission.head())

**Feature Importance Analysis**

In [None]:
# Plot feature importance
plt.figure(figsize=(12, 8))
lgb.plot_importance(models[0], max_num_features=20, importance_type='gain')
plt.title('Feature Importance (Gain)')
plt.show()

# Create importance dataframe
importance_df = pd.DataFrame({
    'feature': models[0].feature_name(),
    'importance': models[0].feature_importance(importance_type='gain')
}).sort_values('importance', ascending=False)

print("Top 10 important features:")
display(importance_df.head(10))

# Plot top features
plt.figure(figsize=(12, 6))
sns.barplot(x='importance', y='feature', data=importance_df.head(10), palette='viridis')
plt.title('Top 10 Features by Importance')
plt.tight_layout()
plt.show()

**JSON Data Processing (Optional)**

In [None]:
# Optional: Process JSON raw data
def extract_json_features(json_path):
    """Example function to process JSON files"""
    features = []
    with tarfile.open(json_path, 'r:gz') as tar:
        for member in tqdm(tar.getmembers()[:1000]):  # Process first 1000 files as example
            f = tar.extractfile(member)
            data = json.load(f)
            
            # Extract relevant features from JSON
            feature = {
                'ranker_id': member.name.split('.')[0],
                # Add other features from JSON structure
            }
            features.append(feature)
    return pd.DataFrame(features)

# Uncomment to process JSONs (requires significant memory)
# json_path = f'{input_dir}/jsons_raw.tar.kaggle'
# json_features = extract_json_features(json_path)
# print(f"Extracted {len(json_features)} JSON features")

**Complete Workflow**

--------------------

**1. Understanding HitRate@3 Implementation**

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def calculate_hitrate_at_3(submission_df, test_df):
    """
    Calculates HitRate@3 metric for the competition
    
    Args:
        submission_df: Your ranked predictions (must contain 'ranker_id' and 'selected' columns)
        test_df: Original test data with ground truth ('selected' column)
    
    Returns:
        hitrate_at_3: The competition metric score
    """
    # Merge submission with ground truth
    merged = submission_df.merge(test_df[['Id', 'ranker_id', 'selected']], 
                               on=['Id', 'ranker_id'], 
                               suffixes=('_pred', '_true'))
    
    # Filter groups with >10 options (as per competition rules)
    group_sizes = merged.groupby('ranker_id').size()
    valid_groups = group_sizes[group_sizes > 10].index
    filtered = merged[merged['ranker_id'].isin(valid_groups)]
    
    # Calculate HitRate@3 for each group
    hitrates = []
    for ranker_id, group in filtered.groupby('ranker_id'):
        # Get the true selected flight
        true_selected = group[group['selected_true'] == 1].iloc[0]
        
        # Check if it's in top 3 predictions
        group['rank'] = group['selected_pred']  # Your predicted ranks
        top_3_flights = group.nsmallest(3, 'rank')['Id'].values
        hit = true_selected['Id'] in top_3_flights
        hitrates.append(hit)
    
    # Return overall hitrate
    return np.mean(hitrates) if hitrates else 0.0

**2. Model Training with HitRate@3 Optimization**

In [None]:
from lightgbm import LGBMRanker
from sklearn.model_selection import GroupKFold

def train_ranker(X, y, groups, n_folds=5):
    """
    Trains a ranking model with cross-validation
    
    Args:
        X: Feature matrix
        y: Target (1 for selected flight, 0 otherwise)
        groups: ranker_id for group-wise splitting
        n_folds: Number of cross-validation folds
    
    Returns:
        model: Trained ranking model
        oof_preds: Out-of-fold predictions
    """
    models = []
    oof_preds = np.zeros(len(X))
    group_kfold = GroupKFold(n_splits=n_folds)
    
    for fold, (train_idx, val_idx) in enumerate(group_kfold.split(X, y, groups)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        groups_train = groups.iloc[train_idx]
        
        # Train LightGBM Ranker
        model = LGBMRanker(
            objective="lambdarank",
            metric="ndcg",
            boosting_type="gbdt",
            n_estimators=500,
            learning_rate=0.05,
            num_leaves=31,
            max_depth=6,
            min_child_samples=20,
            random_state=42
        )
        
        model.fit(
            X_train, y_train,
            group=groups_train.value_counts().sort_index().values,
            eval_set=[(X_val, y_val)],
            eval_group=[groups.iloc[val_idx].value_counts().sort_index().values],
            early_stopping_rounds=50,
            verbose=50
        )
        
        # Store predictions and model
        oof_preds[val_idx] = model.predict(X_val)
        models.append(model)
    
    return models, oof_preds

**3. Converting Scores to Competition Ranks**

In [None]:
def scores_to_ranks(pred_scores, ranker_ids):
    """
    Converts model scores to competition-required ranks
    
    Args:
        pred_scores: Array of model predictions (higher = better)
        ranker_ids: Corresponding ranker_id for each prediction
    
    Returns:
        ranks: Array of ranks (1 = best) per flight option
    """
    ranks = np.zeros_like(pred_scores)
    for ranker_id in np.unique(ranker_ids):
        mask = ranker_ids == ranker_id
        group_scores = pred_scores[mask]
        # argsort twice converts scores to ranks (1=best)
        group_ranks = np.argsort(np.argsort(-group_scores)) + 1
        ranks[mask] = group_ranks
    return ranks.astype(int)

**4. Complete Workflow Example**

In [None]:
# Load data
train = pd.read_csv("/kaggle/input/flightrank-2025/train.csv")
test = pd.read_csv("/kaggle/input/flightrank-2025/test.csv")

# Feature engineering (example features)
def create_features(df):
    # Add your feature engineering here
    df['price_per_mile'] = df['price'] / (df['distance'] + 1e-6)
    return df

train = create_features(train)
test = create_features(test)

# Prepare for ranking model
X_train = train.drop(['selected', 'ranker_id', 'Id'], axis=1)
y_train = train['selected']
groups_train = train['ranker_id']

# Train model
models, oof_preds = train_ranker(X_train, y_train, groups_train)

# Validate on training data (pseudo-test)
train['pred_rank'] = scores_to_ranks(oof_preds, groups_train)
train_hitrate = calculate_hitrate_at_3(train, train)
print(f"OOF HitRate@3: {train_hitrate:.4f}")

# Prepare test predictions
X_test = test.drop(['ranker_id', 'Id'], axis=1)
test_preds = np.mean([model.predict(X_test) for model in models], axis=0)
test['selected'] = scores_to_ranks(test_preds, test['ranker_id'])

# Create submission
submission = test[['Id', 'ranker_id', 'selected']]
submission.to_csv("submission.csv", index=False)


**5. Key Optimization Strategies**

**1.Group-Aware Validation:**

In [None]:
from sklearn.model_selection import GroupShuffleSplit

gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X_train, y_train, groups_train))

**2. Feature Importance Analysis:**

In [None]:
pd.DataFrame({
    'feature': X_train.columns,
    'importance': models[0].feature_importances_
}).sort_values('importance', ascending=False)

**3. Hyperparameter Tuning:**

In [None]:
param_grid = {
    'num_leaves': [31, 63],
    'learning_rate': [0.01, 0.05],
    'n_estimators': [300, 500]
}

**6. Final Submission**

In [None]:
def validate_submission(sub_df, test_df):
    # Check row count matches
    assert len(sub_df) == len(test_df)
    
    # Check rank validity per group
    for ranker_id, group in sub_df.groupby('ranker_id'):
        ranks = group['selected'].values
        assert sorted(ranks) == list(range(1, len(ranks)+1))
    
    print("✅ Submission is valid!")

validate_submission(submission, test)

----------------------------------------------------

**Advanced Machine Learning Pipeline**

**1. Feature Engineering Enhancements**

In [None]:
def enhanced_feature_engineering(interactions, users, flights):
    """Create advanced features for flight recommendations"""
    # Merge datasets
    data = interactions.merge(users, on='user_id', how='left')
    data = data.merge(flights, on='flight_id', how='left')
    
    # Convert timestamps
    data['interaction_time'] = pd.to_datetime(data['interaction_time'])
    data['departure_time'] = pd.to_datetime(data['departure_time'])
    
    # Time-based features
    data['days_to_departure'] = (data['departure_time'] - data['interaction_time']).dt.days
    data['booking_lead_time'] = data['days_to_departure']
    data['is_last_minute'] = (data['days_to_departure'] <= 3).astype(int)
    data['is_advanced_booking'] = (data['days_to_departure'] > 21).astype(int)
    
    # User history features
    user_history = data.groupby('user_id').agg({
        'flight_id': 'count',
        'price': ['mean', 'std'],
        'airline': lambda x: x.mode()[0]
    })
    user_history.columns = ['user_flight_count', 'avg_price', 'price_std', 'preferred_airline']
    data = data.merge(user_history, on='user_id', how='left')
    
    # Flight popularity features
    flight_popularity = data['flight_id'].value_counts().reset_index()
    flight_popularity.columns = ['flight_id', 'flight_popularity']
    data = data.merge(flight_popularity, on='flight_id', how='left')
    
    # User-Flight affinity features
    user_airline_counts = data.groupby(['user_id', 'airline']).size().unstack(fill_value=0)
    data = data.merge(user_airline_counts, on='user_id', how='left')
    
    return data

**2. Advanced Model Training with Hyperparameter Tuning**

In [None]:
import optuna
from lightgbm import LGBMRanker

def objective(trial, X_train, y_train, groups):
    """Optuna objective function for hyperparameter tuning"""
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True)
    }
    
    model = LGBMRanker(**params)
    model.fit(
        X_train, 
        y_train,
        group=groups,
        eval_set=[(X_val, y_val)],
        eval_group=[val_groups],
        early_stopping_rounds=50,
        verbose=False
    )
    
    return model.best_score_['valid_0']['ndcg']

def train_best_model(X_train, y_train, groups):
    """Train model with optimal hyperparameters"""
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train, groups), n_trials=50)
    
    best_params = study.best_params
    best_params.update({
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'verbosity': 1
    })
    
    best_model = LGBMRanker(**best_params)
    best_model.fit(X_train, y_train, group=groups)
    
    return best_model

**3. Ensemble Approach**

In [None]:
from sklearn.base import BaseEstimator
from sklearn.ensemble import VotingClassifier

class RankEnsemble(BaseEstimator):
    """Ensemble of ranking models"""
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y, groups):
        for model in self.models:
            model.fit(X, y, group=groups)
        return self
    
    def predict(self, X):
        predictions = np.zeros((X.shape[0], len(self.models)))
        for i, model in enumerate(self.models):
            predictions[:, i] = model.predict(X)
        return np.mean(predictions, axis=1)

def create_ensemble():
    """Create ensemble of diverse ranking models"""
    model1 = LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        num_leaves=31,
        learning_rate=0.05
    )
    
    model2 = LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        boosting_type='dart',
        num_leaves=63,
        learning_rate=0.01
    )
    
    return RankEnsemble([model1, model2])

**Complete Training Pipeline**

In [None]:
def full_pipeline():
    # Load and preprocess data
    data = enhanced_feature_engineering(interactions, users, flights)
    
    # Prepare features and target
    X = data.drop(['user_id', 'flight_id', 'interaction_time', 'departure_time', 'target'], axis=1)
    y = data['target']
    groups = data.groupby('user_id').size().values
    
    # Time-based cross validation
    best_ndcg = 0
    best_model = None
    
    for train_idx, val_idx in time_based_cv(data):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        train_groups = groups[train_idx]
        val_groups = groups[val_idx]
        
        # Train model with hyperparameter tuning
        model = train_best_model(X_train, y_train, train_groups)
        
        # Evaluate
        current_ndcg = evaluate_model(model, X_val, y_val)
        if current_ndcg > best_ndcg:
            best_ndcg = current_ndcg
            best_model = model
    
    # Train final model on all data
    final_model = train_best_model(X, y, groups)
    
    return final_model

**Submission Generation**

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRanker
from sklearn.model_selection import GroupKFold

def load_data():
    """Load and validate competition data"""
    try:
        train = pd.read_csv('/kaggle/input/flightrank-2025-aeroclub-recsys-cup/train.csv')
        test = pd.read_csv('/kaggle/input/flightrank-2025-aeroclub-recsys-cup/test.csv')
        
        # Basic validation
        assert not train.empty, "Train data is empty"
        assert not test.empty, "Test data is empty"
        assert all(col in train.columns for col in ['Id', 'ranker_id', 'selected'])
        assert all(col in test.columns for col in ['Id', 'ranker_id'])
        
        print("✅ Data loaded successfully")
        print(f"Train samples: {len(train):,}, Test samples: {len(test):,}")
        print(f"Unique search sessions - Train: {train['ranker_id'].nunique():,}, Test: {test['ranker_id'].nunique():,}")
        return train, test
    
    except Exception as e:
        print(f"❌ Error loading data: {str(e)}")
        return None, None

def create_features(df):
    """Create features for ranking model"""
    # Basic features (customize based on available columns)
    num_features = []
    
    if 'price' in df.columns:
        df['price_norm'] = df.groupby('ranker_id')['price'].transform(lambda x: (x - x.mean()) / x.std())
        num_features.append('price_norm')
    
    if 'duration' in df.columns:
        df['duration_norm'] = df.groupby('ranker_id')['duration'].transform(lambda x: (x - x.mean()) / x.std())
        num_features.append('duration_norm')
    
    if 'departure_time' in df.columns:
        df['departure_hour'] = pd.to_datetime(df['departure_time']).dt.hour
        df['departure_day'] = pd.to_datetime(df['departure_time']).dt.dayofweek
        num_features.extend(['departure_hour', 'departure_day'])
    
    # Add more features as needed
    print(f"Generated {len(num_features)} numerical features")
    return df, num_features

def train_model(train_df, features):
    """Train LightGBM ranking model with cross-validation"""
    X = train_df[features]
    y = train_df['selected']
    groups = train_df.groupby('ranker_id').size().values
    
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'num_leaves': 63,
        'learning_rate': 0.05,
        'min_data_in_leaf': 50,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'random_state': 42
    }
    
    model = LGBMRanker(**params)
    
    # GroupKFold validation
    cv = GroupKFold(n_splits=3)
    best_ndcg = 0
    
    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y, groups)):
        print(f"\n🏁 Fold {fold + 1}")
        model.fit(
            X.iloc[train_idx], y.iloc[train_idx],
            group=groups[train_idx],
            eval_set=[(X.iloc[val_idx], y.iloc[val_idx])],
            eval_group=[groups[val_idx]],
            eval_metric='ndcg',
            early_stopping_rounds=50,
            verbose=50
        )
        
        # Track best fold
        current_ndcg = model.best_score_['valid_0']['ndcg']
        if current_ndcg > best_ndcg:
            best_ndcg = current_ndcg
            print(f"🎯 New best NDCG: {best_ndcg:.4f}")
    
    print(f"\n🔥 Best Validation NDCG: {best_ndcg:.4f}")
    return model

def generate_submission(model, test_df, features):
    """Generate properly formatted submission file"""
    # Prepare test features
    test_df = test_df.copy()
    test_df, _ = create_features(test_df)
    
    # Predict scores (higher = better)
    test_df['score'] = model.predict(test_df[features])
    
    # Convert scores to ranks (1 = best)
    test_df['selected'] = test_df.groupby('ranker_id')['score'].rank(ascending=False, method='first')
    test_df['selected'] = test_df['selected'].astype(int)
    
    # Create submission in correct format
    submission = test_df[['Id', 'ranker_id', 'selected']].copy()
    
    # Validate before saving
    if validate_submission(submission, test_df):
        submission.to_csv('submission.csv', index=False)
        print("\n🎉 submission.csv created successfully!")
        print("Final submission preview:")
        display(submission.head())
        
        # Show some example rankings
        example_session = submission['ranker_id'].iloc[0]
        print(f"\nExample rankings for session '{example_session}':")
        display(submission[submission['ranker_id'] == example_session].sort_values('selected'))
        
        return submission
    return None

def validate_submission(sub, test_df):
    """Validate submission meets all requirements"""
    try:
        print("\n🔍 Validating submission...")
        
        # Check 1: Maintains original row order
        assert all(sub['Id'] == test_df['Id']), "Row order changed from original test.csv"
        
        # Check 2: Complete rankings for each search
        ranker_groups = sub.groupby('ranker_id')
        for name, group in ranker_groups:
            expected_ranks = set(range(1, len(group) + 1))
            actual_ranks = set(group['selected'])
            assert actual_ranks == expected_ranks, f"Invalid ranks for session {name}"
        
        # Check 3: No duplicate ranks per search
        assert not sub.duplicated(['ranker_id', 'selected']).any(), "Duplicate ranks found"
        
        # Check 4: All ranks ≥ 1
        assert sub['selected'].min() >= 1, "Ranks below 1 found"
        
        print("✅ All validation checks passed!")
        return True
    
    except AssertionError as e:
        print(f"❌ Validation failed: {str(e)}")
        return False

# Main Execution
if __name__ == "__main__":
    # Load data
    print("📂 Loading data...")
    train_df, test_df = load_data()
    
    if train_df is not None and test_df is not None:
        # Feature engineering
        print("\n🛠️ Creating features...")
        train_df, features = create_features(train_df)
        print("Available features:", features)
        
        # Train model
        print("\n🤖 Training ranking model...")
        model = train_model(train_df, features)
        
        # Generate submission
        print("\n💾 Creating submission file...")
        submission = generate_submission(model, test_df, features)
        
        if submission is not None:
            print(f"\n✔ Successfully created submission file with {len(submission):,} rows")
            print("File saved as: submission.csv")
        else:
            print("\n❌ Failed to create valid submission file")

In [None]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRanker
from sklearn.model_selection import GroupKFold

# 1. Data Loading Function
def load_data():
    """Load and validate competition data"""
    try:
        train = pd.read_csv('/kaggle/input/flightrank-2025-aeroclub-recsys-cup/train.csv')
        test = pd.read_csv('/kaggle/input/flightrank-2025-aeroclub-recsys-cup/test.csv')
        
        # Basic validation
        assert not train.empty, "Train data is empty"
        assert not test.empty, "Test data is empty"
        assert all(col in train.columns for col in ['Id', 'ranker_id', 'selected'])
        assert all(col in test.columns for col in ['Id', 'ranker_id'])
        
        print("✅ Data loaded successfully")
        print(f"Train shape: {train.shape}, Test shape: {test.shape}")
        return train, test
    
    except Exception as e:
        print(f"❌ Error loading data: {str(e)}")
        return None, None

# 2. Feature Engineering Function
def create_features(df):
    """Create features for ranking model"""
    data = df.copy()
    
    # Basic ranking features
    if 'price' in data.columns:
        data['price_rank'] = data.groupby('ranker_id')['price'].rank()
    if 'duration' in data.columns:
        data['duration_rank'] = data.groupby('ranker_id')['duration'].rank()
    
    # Normalized features
    if 'price' in data.columns:
        data['price_norm'] = data.groupby('ranker_id')['price'].transform(
            lambda x: (x - x.mean()) / x.std())
    
    return data

# 3. Model Training Function
def train_model(train_df):
    """Train ranking model with cross-validation"""
    train_df = create_features(train_df)
    features = [col for col in train_df.columns 
               if col not in ['Id', 'ranker_id', 'selected']]
    
    model = LGBMRanker(
        objective="lambdarank",
        metric="ndcg",
        num_leaves=31,
        learning_rate=0.05,
        n_estimators=100
    )
    
    model.fit(
        train_df[features],
        train_df['selected'],
        group=train_df.groupby('ranker_id').size().values,
        verbose=10
    )
    
    return model, features

# 4. Submission Generation Function
def create_submission(model, test_df, features):
    """Generate properly formatted submission file"""
    test_df = create_features(test_df)
    test_df['score'] = model.predict(test_df[features])
    
    # Convert scores to ranks (1 = best)
    test_df['selected'] = test_df.groupby('ranker_id')['score'].rank(
        ascending=False, method='first').astype(int)
    
    # Create final submission
    submission = test_df[['Id', 'ranker_id', 'selected']].copy()
    submission.to_csv('submission.csv', index=False)
    
    print("\nFirst 5 rows of submission:")
    print(submission.head())
    return submission

# 5. Main Execution
if __name__ == "__main__":
    # Load data
    train_df, test_df = load_data()
    if train_df is None or test_df is None:
        raise SystemExit("Failed to load data")
    
    # Train model
    model, features = train_model(train_df)
    
    # Create submission
    submission = create_submission(model, test_df, features)
    print(f"\n✅ submission.csv created successfully with {len(submission)} rows")

---------------------------------------------------------------------------------

**1. Enhanced Categorical Encoding**

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

def enhanced_categorical_encoding(train_df, test_df, cat_cols):
    """
    Enhanced categorical encoding with multiple strategies:
    - Label encoding for high-cardinality features
    - One-hot encoding for low-cardinality features
    - Frequency encoding as fallback
    """
    # Make copies to avoid SettingWithCopyWarning
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    encoders = {}
    
    for col in cat_cols:
        if col not in train_df.columns:
            print(f"Warning: Column {col} not found in training data")
            continue
            
        # Determine encoding strategy based on cardinality
        n_unique = train_df[col].nunique()
        
        if n_unique > 20:  # High cardinality
            # Label encoding with unseen values handling
            le = LabelEncoder()
            combined = pd.concat([train_df[col].astype(str), test_df[col].astype(str)])
            le.fit(combined)
            train_df[f"{col}_le"] = le.transform(train_df[col].astype(str))
            test_df[f"{col}_le"] = le.transform(test_df[col].astype(str))
            encoders[col] = {'type': 'label', 'encoder': le}
            
        elif 2 <= n_unique <= 20:  # Good for one-hot
            # One-hot encoding with test set alignment
            ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
            ohe.fit(train_df[[col]])
            
            # Get feature names
            feature_names = [f"{col}_{val}" for val in ohe.categories_[0]]
            
            # Transform both sets
            train_ohe = ohe.transform(train_df[[col]])
            test_ohe = ohe.transform(test_df[[col]])
            
            # Create DataFrames for the encoded features
            train_ohe_df = pd.DataFrame(train_ohe, columns=feature_names, index=train_df.index)
            test_ohe_df = pd.DataFrame(test_ohe, columns=feature_names, index=test_df.index)
            
            # Concatenate with original DataFrames
            train_df = pd.concat([train_df, train_ohe_df], axis=1)
            test_df = pd.concat([test_df, test_ohe_df], axis=1)
            encoders[col] = {'type': 'onehot', 'encoder': ohe}
            
        else:  # Very low cardinality or binary
            # Simple label encoding
            le = LabelEncoder()
            le.fit(train_df[col].astype(str))
            train_df[f"{col}_le"] = le.transform(train_df[col].astype(str))
            test_df[f"{col}_le"] = le.transform(test_df[col].astype(str))
            encoders[col] = {'type': 'label', 'encoder': le}
    
    return train_df, test_df, encoders

# Define categorical columns (expand as needed)
cat_cols = [
    "profileId", 
    "nationality", 
    "companyID", 
    "searchRoute",
    "legs0_segments0_marketingCarrier_code",
    "legs0_segments0_cabinClass",
    "frequentFlyer",
    "isVip",
    "pricingInfo_isAccessTP"
]

# Apply enhanced encoding
train, test, categorical_encoders = enhanced_categorical_encoding(train, test, cat_cols)

**2. Numerical Feature Scaling**

In [None]:
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.pipeline import Pipeline

def scale_numerical_features(train_df, test_df, num_cols):
    """
    Scale numerical features with robust scaling for outliers
    Returns:
    - Scaled DataFrames
    - Fitted scaler object
    - List of all numerical features (including generated ones)
    """
    # Make copies to avoid SettingWithCopyWarning
    train_df = train_df.copy()
    test_df = test_df.copy()
    
    # Identify all numerical columns (including generated features)
    all_num_cols = [
        col for col in train_df.columns 
        if (train_df[col].dtype in ['int64', 'float64']) and 
        (col not in ['selected', 'Id', 'ranker_id'])
    ]
    
    # If specific columns are provided, use those
    final_num_cols = num_cols if num_cols else all_num_cols
    
    # Create scaling pipeline
    scaler = Pipeline([
        ('robust', RobustScaler()),  # Handles outliers
        ('standard', StandardScaler())  # Standardizes to mean=0, std=1
    ])
    
    # Fit on training data
    scaler.fit(train_df[final_num_cols])
    
    # Transform both sets
    train_scaled = scaler.transform(train_df[final_num_cols])
    test_scaled = scaler.transform(test_df[final_num_cols])
    
    # Create DataFrames with scaled values
    scaled_cols = [f"{col}_scaled" for col in final_num_cols]
    train_scaled_df = pd.DataFrame(train_scaled, columns=scaled_cols, index=train_df.index)
    test_scaled_df = pd.DataFrame(test_scaled, columns=scaled_cols, index=test_df.index)
    
    # Concatenate with original DataFrames
    train_df = pd.concat([train_df, train_scaled_df], axis=1)
    test_df = pd.concat([test_df, test_scaled_df], axis=1)
    
    return train_df, test_df, scaler, scaled_cols

# Define numerical columns (or leave empty to auto-detect)
num_cols = [
    "totalPrice",
    "taxes",
    "duration_leg0",
    "duration_leg1",
    "price_per_hour",
    "seatsAvailable_leg0",
    "baggage_leg0",
    "price_rank_in_group",
    "duration_rank_in_group",
    "seats_rank_in_group"
]

# Apply scaling
train, test, numerical_scaler, scaled_features = scale_numerical_features(train, test, num_cols)

**3. Combined Feature Preparation Pipeline**

In [None]:
def prepare_features(train_df, test_df):
    """
    Complete feature preparation pipeline:
    1. Basic feature engineering
    2. Categorical encoding
    3. Numerical scaling
    """
    # Step 1: Basic feature engineering
    train_df = basic_time_and_numeric_features(train_df)
    test_df = basic_time_and_numeric_features(test_df)
    
    train_df = user_and_policy_features(train_df)
    test_df = user_and_policy_features(test_df)
    
    train_df = group_relative_features(train_df)
    test_df = group_relative_features(test_df)
    
    # Step 2: Categorical encoding
    cat_cols = [
        "profileId", "nationality", "companyID", "searchRoute",
        "legs0_segments0_marketingCarrier_code", "legs0_segments0_cabinClass",
        "frequentFlyer", "isVip", "pricingInfo_isAccessTP"
    ]
    train_df, test_df, cat_encoders = enhanced_categorical_encoding(train_df, test_df, cat_cols)
    
    # Step 3: Numerical scaling
    num_cols = [
        "totalPrice", "taxes", "duration_leg0", "duration_leg1",
        "price_per_hour", "seatsAvailable_leg0", "baggage_leg0",
        "price_rank_in_group", "duration_rank_in_group", "seats_rank_in_group"
    ]
    train_df, test_df, num_scaler, scaled_cols = scale_numerical_features(train_df, test_df, num_cols)
    
    # Return prepared data and encoders
    return {
        'train': train_df,
        'test': test_df,
        'categorical_encoders': cat_encoders,
        'numerical_scaler': num_scaler,
        'scaled_features': scaled_cols
    }

# Execute complete pipeline
prepared_data = prepare_features(train, test)
train_prepared = prepared_data['train']
test_prepared = prepared_data['test']

**4. Evaluation and Submission**

In [None]:
class FlightEvaluator:
    @staticmethod
    def hit_rate_at_k(y_true, y_pred, groups, k=3):
        """Calculate HitRate@k metric"""
        results = []
        group_sizes = []
        
        for group in groups.unique():
            group_mask = (groups == group)
            y_true_group = y_true[group_mask]
            y_pred_group = y_pred[group_mask]
            
            # Only consider groups with >10 options as per competition rules
            if len(y_true_group) > 10:
                top_k = y_pred_group.argsort()[-k:][::-1]
                hit = y_true_group.iloc[top_k].sum() > 0
                results.append(hit)
                group_sizes.append(len(y_true_group))
                
        return np.mean(results), np.sum(group_sizes)
    
    @staticmethod
    def create_submission(test_df, preds, ranker_id_col='ranker_id'):
        """Create competition submission file"""
        submission = test_df[['Id', ranker_id_col]].copy()
        
        for ranker_id in tqdm(submission[ranker_id_col].unique(), desc='Creating ranks'):
            mask = submission[ranker_id_col] == ranker_id
            submission.loc[mask, 'selected'] = (-preds[mask]).argsort().argsort() + 1
            
        # Validate submission
        FlightEvaluator._validate_submission(submission, ranker_id_col)
        
        return submission
    
    @staticmethod
    def _validate_submission(submission, ranker_id_col):
        """Validate submission meets competition requirements"""
        errors = []
        
        # Check for complete rankings per group
        for ranker_id in submission[ranker_id_col].unique():
            group = submission[submission[ranker_id_col] == ranker_id]
            ranks = group['selected']
            
            # Check for valid permutation
            if not np.array_equal(np.sort(ranks), np.arange(1, len(ranks)+1)):
                errors.append(f"Invalid ranks for {ranker_id}")
                
        if errors:
            raise ValueError(f"Submission validation failed: {errors[:3]}...")

**5. Full Pipeline Execution**

In [None]:
def main():
    # Initialize and load data
    data_processor = FlightDataProcessor('/kaggle/input/aeroclub-recsys-2025')
    data_processor.load_base_data()
    
    # Process JSON features (sample for demonstration)
    try:
        data_processor.process_json_features(sample_size=1000)
        data_processor.merge_features()
    except Exception as e:
        print(f"JSON processing skipped: {str(e)}")
    
    # Feature engineering
    feature_engineer = FeatureEngineer()
    train = feature_engineer.create_time_features(data_processor.train)
    train = feature_engineer.create_flight_features(train)
    train = feature_engineer.create_price_features(train)
    
    # Prepare modeling data
    X = train.drop(columns=['selected', 'Id', 'ranker_id'])
    y = train['selected']
    groups = train['ranker_id']
    
    # Train models
    ranker = FlightRanker()
    ranker.train(X, y, groups)
    
    # Evaluate
    for model_name, preds in ranker.oof_preds.items():
        hr, n_groups = FlightEvaluator.hit_rate_at_k(y, preds, groups)
        print(f"{model_name} OOF HitRate@3: {hr:.4f} (evaluated on {n_groups} groups)")
    
    # Create ensemble predictions
    test = feature_engineer.create_time_features(data_processor.test)
    test = feature_engineer.create_flight_features(test)
    test = feature_engineer.create_price_features(test)
    X_test = test.drop(columns=['Id', 'ranker_id'])
    
    ensemble_preds = ranker.ensemble_predict(X_test)
    submission = FlightEvaluator.create_submission(test, ensemble_preds)
    
    # Save submission
    submission.to_csv('submission.csv', index=False)
    print("Submission file saved successfully.")

if __name__ == "__main__":
    main()