In [None]:
import numpy as np
import polars as pl
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import ndcg_score

# NOTE(review): the cleaned CSVs read below are presumably produced by an
# earlier preprocessing step that is not part of this file - confirm.

# Load the cleaned training and test data
train_data = pl.read_csv('data/cleaned_training_data.csv')
test_data = pl.read_csv('data/cleaned_test_data.csv')

# Columns removed from both splits before modelling
shared_drop_cols = [
    'prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month',
    'day', 'search_hour', 'day_of_week', 'year_month', 'date_time',
    'prop_historical_price', 'price_usd_per_night_test', 'price_ratio',
]
# Columns that are only dropped from the training split
train_only_drop_cols = ['position', 'gross_bookings_usd', 'click_bool']

# After the original 'price_usd' is dropped, the promo-free price takes its name
train_data = (
    train_data
    .drop(shared_drop_cols + train_only_drop_cols)
    .rename({'price_usd_without_promo': 'price_usd'})
)
test_data = (
    test_data
    .drop(shared_drop_cols)
    .rename({'price_usd_without_promo': 'price_usd'})
)

# Convert to pandas for easier manipulation with scikit-learn.
# Rows are stably sorted by srch_id so that every query's rows are contiguous
# and in ascending id order: the group-size arrays built further down with
# groupby(...).count() come back ordered by srch_id, and LightGBM interprets
# group sizes in row order, so the two orders have to agree.
train_df = (
    train_data.to_pandas()
    .sort_values('srch_id', kind='mergesort')
    .reset_index(drop=True)
)
test_df = (
    test_data.to_pandas()
    .sort_values('srch_id', kind='mergesort')
    .reset_index(drop=True)
)

# Extract features, groups, and labels
non_feature_cols = ['srch_id', 'prop_id', 'booking_bool', 'click_bool', 'gross_bookings_usd']
all_feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

X_train = train_df[all_feature_cols].values
y_train = train_df['booking_bool'].values
groups_train = train_df['srch_id'].values

X_test = test_df[all_feature_cols].values
groups_test = test_df['srch_id'].values

# Scale features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define evaluation function for NDCG
def calculate_ndcg(y_true, y_pred, group_indices, k=5):
    """Return the mean NDCG@k over all queries that can be scored.

    ``y_true`` and ``y_pred`` are consumed query by query: each entry of
    ``group_indices`` is the number of consecutive rows that belong to the
    next query (despite the name, these are group *sizes*, not indices).

    A query is left out of the average when it has at most one row, when
    its relevance labels do not sum to a positive value, or when
    ``ndcg_score`` rejects its inputs with a ``ValueError`` (a warning is
    emitted in that last case).

    Args:
        y_true: 1-D sliceable sequence of true relevance labels.
        y_pred: 1-D sliceable sequence of predicted scores, aligned with
            ``y_true``.
        group_indices: Iterable of per-query row counts, in row order.
        k: Rank cutoff; capped at the query's length.

    Returns:
        The mean NDCG@k across the scored queries, or ``0.0`` if no query
        could be scored.
    """
    import warnings

    ndcg_scores = []
    start_idx = 0

    for group_size in group_indices:
        end_idx = start_idx + group_size
        query_y_true = y_true[start_idx:end_idx]
        query_y_pred = y_pred[start_idx:end_idx]
        start_idx = end_idx

        # A single-row query has nothing to rank.
        if group_size <= 1:
            continue
        # NDCG is undefined when the query has no relevant item.
        if not np.sum(query_y_true) > 0:
            continue

        # Limit k to the size of the group
        actual_k = min(k, len(query_y_true))

        try:
            score = ndcg_score(
                np.asarray([query_y_true]),
                np.asarray([query_y_pred]),
                k=actual_k,
            )
        except ValueError as exc:
            # Keep the best-effort behaviour (skip the query) but make the
            # problem visible; any other error type is a real bug and is
            # allowed to propagate.
            warnings.warn(f"Skipping query in NDCG computation: {exc}")
            continue
        ndcg_scores.append(score)

    # Return average NDCG across all scored queries
    if ndcg_scores:
        return np.mean(ndcg_scores)
    return 0.0

# Define objective function for Optuna
def objective(trial):
    """Optuna objective: mean NDCG@5 of a LambdaMART model under 3-fold CV.

    Reads the module-level ``X_train``, ``y_train`` and ``groups_train``
    arrays. Folds are built with ``GroupKFold`` so that all rows of a
    search (``srch_id``) land in the same fold.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean of the per-fold NDCG@5 values computed by
        ``calculate_ndcg`` on each validation fold.
    """
    # Define parameters to tune
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'boosting_type': 'gbdt',
        # Key parameters to tune
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 15, 255),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 10),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        # LambdaMART specific parameters
        # NOTE(review): recent LightGBM releases appear to name this
        # parameter 'lambdarank_truncation_level'; confirm that
        # 'max_position' is still honoured by the installed version.
        'max_position': trial.suggest_int('max_position', 5, 20),
        # Other parameters
        'verbosity': -1,
        'ndcg_eval_at': [5]
    }

    # Set up GroupKFold for cross-validation
    n_folds = 3
    group_kfold = GroupKFold(n_splits=n_folds)

    # Store CV scores
    cv_scores = []

    # Perform cross-validation
    for train_idx, valid_idx in group_kfold.split(X_train, y_train, groups=groups_train):
        X_train_fold, X_valid_fold = X_train[train_idx], X_train[valid_idx]
        y_train_fold, y_valid_fold = y_train[train_idx], y_train[valid_idx]

        # Per-query row counts for each fold. np.unique returns the counts
        # ordered by srch_id; LightGBM reads them in row order, so this is
        # only aligned when rows are stored grouped by ascending srch_id
        # - TODO confirm for the input data.
        _, train_groups = np.unique(groups_train[train_idx], return_counts=True)
        _, valid_groups = np.unique(groups_train[valid_idx], return_counts=True)

        # Create LightGBM datasets
        train_dataset = lgb.Dataset(X_train_fold, y_train_fold, group=train_groups)
        valid_dataset = lgb.Dataset(X_valid_fold, y_valid_fold, group=valid_groups, reference=train_dataset)

        # Train model. Early stopping is requested through a callback:
        # the `early_stopping_rounds` / `verbose_eval` keyword arguments of
        # lgb.train() were removed in LightGBM 4.0.
        model = lgb.train(
            params,
            train_dataset,
            num_boost_round=1000,
            valid_sets=[valid_dataset],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
        )

        # Predict on validation set with the best iteration found
        valid_preds = model.predict(X_valid_fold, num_iteration=model.best_iteration)

        # Calculate NDCG score (named fold_ndcg so the imported
        # sklearn `ndcg_score` is not shadowed inside this function)
        fold_ndcg = calculate_ndcg(y_valid_fold, valid_preds, valid_groups, k=5)
        cv_scores.append(fold_ndcg)

    # Return mean NDCG score across folds
    return np.mean(cv_scores)

# Create Optuna study for hyperparameter optimization
print("Starting hyperparameter optimization...")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)  # Adjust number of trials based on your computational resources

# Print best parameters
print("Best parameters found:")
print(study.best_params)
print(f"Best NDCG score: {study.best_value}")

# Train final model with best parameters: the tuned values plus the fixed
# settings that were also used inside the objective
best_params = study.best_params
best_params.update({
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'verbosity': -1,
    'ndcg_eval_at': [5]
})

# Get group information (rows per srch_id) for the full training and test sets.
# groupby(...).count() returns the sizes ordered by srch_id; LightGBM reads
# them in row order, so this is only aligned when rows are stored grouped by
# ascending srch_id - TODO confirm for the input data.
train_groups = np.array(pd.Series(groups_train).groupby(groups_train).count())
test_groups = np.array(pd.Series(groups_test).groupby(groups_test).count())

# Create final training dataset
final_train_dataset = lgb.Dataset(X_train, y_train, group=train_groups)

# Train final model
# NOTE(review): no validation set / early stopping is used here, so all 2000
# rounds are always trained regardless of the best iteration seen during CV.
print("Training final model with best parameters...")
final_model = lgb.train(
    best_params,
    final_train_dataset,
    num_boost_round=2000  # Increase num_boost_round for final model
)

# Make predictions on test set
print("Making predictions on test set...")
test_predictions = final_model.predict(X_test)

# Add predictions to test dataframe
test_df['predictions'] = test_predictions

# Create submission file: best-scored properties first within each search
result_df = test_df[['srch_id', 'prop_id', 'predictions']]
result_df = result_df.sort_values(['srch_id', 'predictions'], ascending=[True, False])
result_df.to_csv('lambdamart_optimized_predictions.csv', index=False)

# Feature importance analysis (total gain contributed by each feature)
feature_importance = final_model.feature_importance(importance_type='gain')
feature_names = all_feature_cols
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)
print("Top 20 important features:")
print(importance_df.head(20))

# Additional analysis: NDCG score on test set
# Note: Since we don't have true relevance for test data, this is only applicable
# if your test data actually contains relevance judgments
if 'booking_bool' in test_df.columns:
    y_test = test_df['booking_bool'].values
    test_ndcg = calculate_ndcg(y_test, test_predictions, test_groups, k=5)
    # The label now matches the cutoff actually used (k=5)
    print(f"NDCG@5 on test set: {test_ndcg}")

# Save the model for future use
final_model.save_model('lambdamart_optimized_model.txt')
print("Hyperparameter optimization and model training completed!")

# Create detailed parameter importance visualization
import matplotlib.pyplot as plt
import optuna.visualization as optvis

# Parameter importance plot
try:
    param_importance = optvis.plot_param_importances(study)
    param_importance.write_image('parameter_importance.png')

    # Optimization history
    optimization_history = optvis.plot_optimization_history(study)
    optimization_history.write_image('optimization_history.png')
except Exception as exc:
    # Plotting is optional, so a failure here must not abort the run, but the
    # real cause is reported instead of being hidden. `except Exception` (not
    # a bare except) lets KeyboardInterrupt / SystemExit through.
    print(f"Could not create visualization - may need plotly installed: {exc}")

# Create a visualization for feature importance
plt.figure(figsize=(12, 10))
plt.barh(importance_df['Feature'].head(30), importance_df['Importance'].head(30))
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.title('Top 30 Features by Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')

In [None]:
# Save the srch_id and prop_id pairs for the full predictions.
# Within each search the properties are ordered by descending predicted score
# (the same ordering used for the predictions file); sorting on srch_id alone
# left the within-search order unrelated to the model's ranking.
submission_df = test_df.sort_values(['srch_id', 'predictions'], ascending=[True, False])
submission_df = submission_df[['srch_id', 'prop_id']]
submission_df.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv
