# 🚌 Bus Demand Forecasting Hackathon - Complete Pipeline
This notebook covers everything from data loading to model prediction based on the provided problem statement.


In [24]:
! pip install pandas
! pip install numpy
! pip install lightgbm
! pip install scikit-learn
! pip install matplotlib
! pip install seaborn



In [38]:
# 📦 Imports
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation / dtype warnings from pandas and LightGBM;
# consider narrowing the filter to specific categories while debugging.
warnings.filterwarnings('ignore')

## 🔽 Load the Data
Ensure all CSV files are in your current working directory or specify the full path.

In [39]:
# Read the four competition files from the current working directory.
# Order of the file names matches the order of the names being bound.
train, test, transactions, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in (
        'train.csv',
        'test_8gqdJqH.csv',
        'transactions.csv',
        'sample_submission.csv',
    )
)

## 📊 Basic EDA

In [40]:
# Quick look at the raw inputs: first rows of each table, then summary
# statistics of the training set (printed in that order).
eda_previews = (
    train.head(),
    test.head(),
    transactions.head(),
    train.describe(),
)
for preview in eda_previews:
    print(preview)

          doj  srcid  destid  final_seatcount
0  2023-03-01     45      46           2838.0
1  2023-03-01     46      45           2298.0
2  2023-03-01     45      47           2720.0
3  2023-03-01     47      45           2580.0
4  2023-03-01     46       9           4185.0
          route_key         doj  srcid  destid
0  2025-02-11_46_45  2025-02-11     46      45
1  2025-01-20_17_23  2025-01-20     17      23
2  2025-01-08_02_14  2025-01-08      2      14
3  2025-01-08_08_47  2025-01-08      8      47
4  2025-01-08_09_46  2025-01-08      9      46
          doj         doi  srcid  destid    srcid_region   destid_region  \
0  2023-03-01  2023-01-30     45      46       Karnataka      Tamil Nadu   
1  2023-03-01  2023-01-30     46      45      Tamil Nadu       Karnataka   
2  2023-03-01  2023-01-30     45      47       Karnataka  Andhra Pradesh   
3  2023-03-01  2023-01-30     47      45  Andhra Pradesh       Karnataka   
4  2023-03-01  2023-01-30     46       9      Tamil Nadu      

## 🧹 Data Preprocessing

In [41]:
# Parse the date columns in place so that `doj` has the same datetime dtype in
# every table (it is used as a merge key further down).
date_columns_by_frame = (
    (transactions, ('doj', 'doi')),
    (train, ('doj',)),
    (test, ('doj',)),
)
for frame, date_columns in date_columns_by_frame:
    for date_column in date_columns:
        frame[date_column] = pd.to_datetime(frame[date_column])

## 🧠 Feature Engineering

In [42]:
# Keep only the rows whose `dbd` value is exactly 15 (not "15 or more").
is_day_15 = transactions['dbd'] == 15
txn_15 = transactions.loc[is_day_15].copy()

# Route-level features, one row per (srcid, destid, doj).
# Mapping is: output column -> (source column, aggregation).
# NOTE: because only dbd == 15 rows survive the filter above, `mean_dbd` and
# `median_dbd` are 15 for every group and therefore carry no information.
route_feature_spec = {
    'early_seats': ('cumsum_seatcount', 'max'),
    'early_searches': ('cumsum_searchcount', 'max'),
    'mean_dbd': ('dbd', 'mean'),
    'median_dbd': ('dbd', 'median'),
    'src_tier': ('srcid_tier', 'first'),
    'dest_tier': ('destid_tier', 'first'),
    'src_region': ('srcid_region', 'first'),
    'dest_region': ('destid_region', 'first'),
}
agg = (
    txn_15
    .groupby(['srcid', 'destid', 'doj'])
    .agg(**route_feature_spec)
    .reset_index()
)

## 🔧 Train-Test Preparation

In [43]:
# Attach the route-level features to the labelled (train) and unlabelled (test)
# rows. A left join keeps every original row; rows without a matching entry in
# `agg` get NaN in the feature columns.
merge_keys = ['srcid', 'destid', 'doj']
train_merged = train.merge(agg, on=merge_keys, how='left')
test_merged = test.merge(agg, on=merge_keys, how='left')

# Cast the tier / region columns to pandas `category` dtype in both frames.
categorical_cols = ['src_tier', 'dest_tier', 'src_region', 'dest_region']
category_dtypes = {name: 'category' for name in categorical_cols}
train_merged = train_merged.astype(category_dtypes)
test_merged = test_merged.astype(category_dtypes)

# Model inputs (numeric columns first, then categoricals) and the target.
numeric_cols = ['early_seats', 'early_searches', 'mean_dbd', 'median_dbd']
features = numeric_cols + categorical_cols
X = train_merged[features]
y = train_merged['final_seatcount']

## 📈 Model Training

In [44]:
from lightgbm import LGBMRegressor, early_stopping, log_evaluation

# Hold out part of the labelled data for validation / early stopping.
# The fit call below needs X_train, y_train, X_val and y_val, which no other cell
# defines; without this split the cell fails with NameError on a fresh kernel.
# NOTE(review): test_size=0.2 is an assumption. It is consistent with the logged
# training size (53,760 rows) only if `train` has 67,200 rows; confirm.
# NOTE(review): this is a random split; a split by `doj` may be more realistic
# for a forecasting task.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Define the model. n_estimators is an upper bound; early stopping below picks
# the actual number of boosting rounds.
model = LGBMRegressor(
    objective='regression',
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42
)

# Fit using callbacks: stop once validation RMSE has not improved for 50 rounds,
# and log the validation metrics every 100 rounds.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='rmse',
    callbacks=[early_stopping(50), log_evaluation(100)]
)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009751 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 540
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 6
[LightGBM] [Info] Start training from score 2003.632533
Training until validation scores don't improve for 50 rounds
[100]	valid_0's rmse: 653.534	valid_0's l2: 427107
[200]	valid_0's rmse: 633.929	valid_0's l2: 401866
[300]	valid_0's rmse: 625.999	valid_0's l2: 391875
[400]	valid_0's rmse: 623.229	valid_0's l2: 388415
[500]	valid_0's rmse: 622.364	valid_0's l2: 387337
Early stopping, best iteration is:
[490]	valid_0's rmse: 622.205	valid_0's l2: 387139


## 📤 Generate Predictions

In [None]:
# Score the test rows with the fitted model, using the same feature columns
# (and column order) as in training.
X_test = test_merged[features]
test_preds = model.predict(X_test)

# A seat count cannot be negative, but a squared-error regressor is free to
# predict below zero; clamp such predictions to 0 before submitting.
test_preds = np.clip(test_preds, 0, None)

# NOTE(review): predictions are written by row position, which assumes
# `sample_submission` lists the routes in the same order as `test`; confirm.
sample_submission['final_seatcount'] = test_preds
sample_submission.to_csv('submission.csv', index=False)
print('✅ submission.csv generated!')

✅ submission.csv generated!
