In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt


**LOAD DATA**

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("/content/test_8gqdJqH.csv")
transactions = pd.read_csv("transactions.csv")

**PRE PROCESSING THE DATA**

In [3]:
train['doj'] = pd.to_datetime(train['doj'])
test['doj'] = pd.to_datetime(test['doj'])
transactions['doj'] = pd.to_datetime(transactions['doj'])
transactions['doi'] = pd.to_datetime(transactions['doi'])

**FEATURE ENGINEERING**

In [4]:
# Filter transactions as of dbd >= 15
tx_15 = transactions[transactions['dbd'] >= 15]
agg_15 = tx_15.groupby(['doj', 'srcid', 'destid']).agg({
    'cumsum_seatcount': 'max',
    'cumsum_searchcount': 'max'
}).reset_index().rename(columns={
    'cumsum_seatcount': 'seatcount_dbd15',
    'cumsum_searchcount': 'searchcount_dbd15'
})

In [5]:
# Merge features into train/test
train = train.merge(agg_15, on=['doj', 'srcid', 'destid'], how='left')
test = test.merge(agg_15, on=['doj', 'srcid', 'destid'], how='left')

In [6]:
# Merge region/tier info
cat_cols = ['srcid', 'destid', 'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
cat_info = transactions[cat_cols].drop_duplicates(['srcid', 'destid'])
train = train.merge(cat_info, on=['srcid', 'destid'], how='left')
test = test.merge(cat_info, on=['srcid', 'destid'], how='left')

In [7]:
# Fill NA
for df in [train, test]:
    df['seatcount_dbd15'] = df['seatcount_dbd15'].fillna(0)
    df['searchcount_dbd15'] = df['searchcount_dbd15'].fillna(0)
    df['dow'] = df['doj'].dt.weekday
    df['month'] = df['doj'].dt.month
    df['is_weekend'] = df['dow'].isin([5,6]).astype(int)

In [8]:
# Encode categoricals
for col in ['srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

**MODEL TRAINING**

In [9]:
features = ['srcid', 'destid', 'dow', 'month', 'is_weekend',
            'seatcount_dbd15', 'searchcount_dbd15',
            'srcid_region', 'destid_region', 'srcid_tier', 'destid_tier']
target = 'final_seatcount'

In [10]:
pip install lightgbm --upgrade




In [17]:
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    num_leaves=31,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42,
    n_estimators=1000
)

# NO early_stopping_rounds, NO verbose
model.fit(
    train_data[features], train_data[target]
)

# Predict on validation
val_pred = model.predict(val_data[features])
val_rmse = np.sqrt(mean_squared_error(val_data[target], val_pred))
print("Validation RMSE:", val_rmse)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002079 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 652
[LightGBM] [Info] Number of data points in the train set: 53800, number of used features: 11
[LightGBM] [Info] Start training from score 1940.546487
Validation RMSE: 610.6026123251335


In [18]:
# Validation RMSE
val_pred = model.predict(val_data[features])
val_rmse = np.sqrt(mean_squared_error(val_data[target], val_pred))
print("Validation RMSE:", val_rmse)

Validation RMSE: 610.6026123251335


In [19]:
# -------------------- Predict on Test --------------------
test['final_seatcount'] = model.predict(test[features])
test[['route_key', 'final_seatcount']].to_csv("submission.csv", index=False)

