In [8]:
import numpy as np
import polars as pl
import lightgbm as lgb
from sklearn.metrics import ndcg_score
from sklearn.preprocessing import StandardScaler
import pandas as pd
# loading in the cleaned training data
train_data = pl.read_csv('data/cleaned_training_data.csv')
test_data = pl.read_csv('data/cleaned_test_data.csv')

# Print all features except ['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio', 'position', 'gross_bookings_usd', 'click_bool']
# excluded_features = ['prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour', 'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test', 'price_ratio', 'position', 'gross_bookings_usd', 'click_bool', 'random_bool', 'site_id', 'promotion_flag', 'is_weekend_search', 'month_sin', 'search_hour_sin', 'day_of_week_sin', 'has_usable_review', 'query_affinity_missing']
# features = [col for col in train_data.columns if col not in excluded_features]
# print("Features used for training:", features)

In [9]:
# I will drop unnecessary columns, but only those that exist in the DataFrame
drop_cols = [
    'prop_log_historical_price', 'price_usd', 'parsed_date', 'year', 'month', 'day', 'search_hour',
    'day_of_week', 'year_month', 'date_time', 'prop_historical_price', 'price_usd_per_night_test',
    'price_ratio', 'position', 'click_bool', 'random_bool', 'site_id', 'promotion_flag',
    'is_weekend_search', 'month_sin', 'search_hour_sin', 'day_of_week_sin', 'has_usable_review',
    'query_affinity_missing'
]

train_data = train_data.drop([col for col in drop_cols if col in train_data.columns]).rename({'price_usd_without_promo': 'price_usd'} if 'price_usd_without_promo' in train_data.columns else {})

test_data = test_data.drop([col for col in drop_cols if col in test_data.columns]).rename({'price_usd_without_promo': 'price_usd'} if 'price_usd_without_promo' in test_data.columns else {})

# Convert to pandas for easier manipulation with scikit-learn
train_df = train_data.to_pandas()
test_df = test_data.to_pandas()
# 1. Identify numeric features
# We'll exclude categorical features and target variables
# Let's first check the data types and columns
print("Training data columns:", train_data.columns)
print("Training data schema:", train_data.schema)

# Let's assume we'll identify numeric features based on their data types
numeric_cols = [col for col in train_data.columns if 
                train_data[col].dtype in [pl.Float32, pl.Float64, pl.Int32, pl.Int64] and 
                col not in ['prop_id', 'srch_id', 'booking_bool', 'click_bool', 'gross_bookings_usd']]

print(f"Identified {len(numeric_cols)} numeric features: {numeric_cols}")

# 2. Calculate aggregate features per prop_id
# For each numeric feature, we'll calculate average, stddev, and median per prop_id

# First, let's create functions to calculate these aggregates
def add_aggregated_features(data, numeric_cols):
    # Create copies to avoid modifying original
    result_df = data.clone()
    
    # Calculate aggregates for each numeric column
    for col in numeric_cols:
        # Calculate mean per prop_id
        mean_per_prop = data.group_by('prop_id').agg(pl.mean(col).alias(f"{col}_mean_per_prop"))
        result_df = result_df.join(mean_per_prop, on='prop_id')
        
        # Calculate stddev per prop_id
        stddev_per_prop = data.group_by('prop_id').agg(pl.std(col).alias(f"{col}_stddev_per_prop"))
        result_df = result_df.join(stddev_per_prop, on='prop_id')
        
        # Calculate median per prop_id
        median_per_prop = data.group_by('prop_id').agg(pl.median(col).alias(f"{col}_median_per_prop"))
        result_df = result_df.join(median_per_prop, on='prop_id')
    
    return result_df

# Add aggregated features to both train and test sets
train_data_with_agg = add_aggregated_features(train_data, numeric_cols)
test_data_with_agg = add_aggregated_features(test_data, numeric_cols)

# 3. Prepare data for LambdaMART (LightGBM)
# For LightGBM ranking task, we need to prepare data in the correct format
# We need features, groups (queries/srch_id), and labels (relevance)

# Get all feature columns (original numeric + aggregated)
all_feature_cols = numeric_cols.copy()
for col in numeric_cols:
    all_feature_cols.extend([f"{col}_mean_per_prop", f"{col}_stddev_per_prop", f"{col}_median_per_prop"])

# Replace NaN values with 0
train_data_with_agg = train_data_with_agg.fill_null(0)
test_data_with_agg = test_data_with_agg.fill_null(0)

# Convert to pandas for easier use with LightGBM
train_df = train_data_with_agg.to_pandas()
test_df = test_data_with_agg.to_pandas()

# Extract features, groups, and labels
X_train = train_df[all_feature_cols].values
# For ranking, we'll use booking_bool as the relevance label
y_train = train_df['booking_bool'].values
# Group by srch_id for ranking
groups_train = train_df.groupby('srch_id').size().values

X_test = test_df[all_feature_cols].values
groups_test = test_df.groupby('srch_id').size().values

# Optional: Scale features for better performance
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# 4. Train LambdaMART model using LightGBM
# Create LightGBM dataset
train_dataset = lgb.Dataset(X_train, y_train, group=groups_train)

# Set parameters for LambdaMART
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 57,
    'num_leaves': 76,
    'learning_rate': 0.024720389075475732,
    'feature_fraction': 0.6572262837001418,
    'bagging_fraction': 0.8834318020487413,
    'bagging_freq': 10,
    'lambda_l1': 3.9616918691847163,
    'lambda_l2': 0.0016270516531082348,
    'max_position': 12,  # Unlimited position
    'max_depth': 12,  # Unlimited depth
    'verbosity': -1,
    'ndcg_eval_at': [5],  # Evaluate NDCG at position 5
}

# Train the model
print("Training LambdaMART model...")
model = lgb.train(
    params,
    train_dataset,
    num_boost_round=1000,
    valid_sets=[train_dataset],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50, verbose=True),
        lgb.log_evaluation(period=100)
    ],
)

# 5. Make predictions and evaluate
print("Making predictions...")
test_predictions = model.predict(X_test)

# Save the predictions to file for further analysis or submission
test_df['predictions'] = test_predictions
result_df = test_df[['srch_id', 'prop_id', 'predictions']]

# Sort by srch_id and prediction score (descending) to get the final ranking
result_df = result_df.sort_values(['srch_id', 'predictions'], ascending=[True, False])
result_df.to_csv('lambdamart_predictions.csv', index=False)

# 6. Feature importance analysis
feature_importance = model.feature_importance(importance_type='gain')
feature_names = all_feature_cols
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
importance_df = importance_df.sort_values('Importance', ascending=False)
print("Top 20 important features:")
print(importance_df.head(20))

print("LambdaMART model training completed!")

Training data columns: ['srch_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'orig_destination_distance', 'gross_bookings_usd', 'booking_bool', 'month_cos', 'search_hour_cos', 'day_of_week_cos', 'prop_review_score_filled', 'num_comps_lower', 'num_comps_higher', 'num_comps_with_inventory', 'avg_comp_rate_percent_diff', 'query_affinity_score_cleaned', 'price_usd']
Training data schema: Schema({'srch_id': Int64, 'visitor_location_country_id': Int64, 'visitor_hist_starrating': String, 'visitor_hist_adr_usd': String, 'prop_country_id': Int64, 'prop_id': Int64, 'prop_starrating': Int64, 'prop_review_score': Float64, 'prop_brand_bool



[100]	training's ndcg@5: 0.592308
[200]	training's ndcg@5: 0.601624
[300]	training's ndcg@5: 0.609171
[400]	training's ndcg@5: 0.615976
[500]	training's ndcg@5: 0.621843
[600]	training's ndcg@5: 0.627757
[700]	training's ndcg@5: 0.632732
[800]	training's ndcg@5: 0.637915
[900]	training's ndcg@5: 0.642621
[1000]	training's ndcg@5: 0.6473
Making predictions...
Top 20 important features:
                                       Feature     Importance
6                         prop_location_score2  877222.434440
24                                   price_usd  401124.861244
99                   price_usd_median_per_prop  142655.361441
97                     price_usd_mean_per_prop  131263.596019
40          prop_location_score1_mean_per_prop  114447.633153
43          prop_location_score2_mean_per_prop   88993.662494
2                              prop_starrating   86101.765804
5                         prop_location_score1   84214.518615
45        prop_location_score2_median_per_prop   72006

In [10]:
# Save the srch_id and prop_id for the full predictions
submission_df = test_df[['srch_id', 'prop_id']]
submission_df = submission_df.sort_values(['srch_id'], ascending=[True])
submission_df.to_csv('submission.csv', index=False)
print("Submission file created: submission.csv")

Submission file created: submission.csv
