In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv("Live_Trains.csv")
trains = pd.read_csv("Trains.csv")
stations = pd.read_csv("Stations.csv")
block_rel = pd.read_csv("Block_Relation.csv")

In [4]:
features = ['TrainNo', 'TrainType', 'HaltStation', 'PFNo', 'BlockNo', 
            'BlockLen', 'ApproachingBlockNo', 'CurrentSpeed', 'CurrentDelay', 
            'DFNS', 'RunningStatus']
target = 'DwellTime'

In [5]:
X = df[features]
y = df[target]

In [6]:
numeric_cols = ['BlockLen', 'CurrentSpeed', 'CurrentDelay', 'DFNS']
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].median())


In [7]:
categorical_cols = ['TrainNo', 'TrainType', 'HaltStation', 'PFNo', 'BlockNo', 'ApproachingBlockNo', 'RunningStatus']
for cols in categorical_cols:
    X[cols] = X[cols].astype(str).fillna('Unknown')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols].astype(str).fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols].astype(str).fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cols] = X[cols].astype(str).fillna('Unknown')
A value is trying to be set on a copy of a slice from a DataF

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])


In [11]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
import numpy as np

rf_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=42))
])

rf_params = {
    "model__n_estimators": [200, 300, 500, 700],
    "model__max_depth": [None, 10, 20, 30],
    "model__max_features": ["sqrt", "log2", 0.8],
    "model__min_samples_split": [2, 5, 10, 20],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__bootstrap": [True, False]
}

rf_random = RandomizedSearchCV(
    estimator=rf_pipeline,
    param_distributions=rf_params,
    n_iter=50,  # number of random combinations
    cv=5,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    random_state=42
)

rf_random.fit(X_train, y_train)
print(rf_random.best_params_)


{'model__n_estimators': 500, 'model__min_samples_split': 20, 'model__min_samples_leaf': 2, 'model__max_features': 'sqrt', 'model__max_depth': None, 'model__bootstrap': True}


In [None]:
from xgboost import XGBRegressor

xgb_pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", XGBRegressor(objective='reg:squarederror', random_state=42))
])

xgb_params = {
    "model__n_estimators": [200, 300, 500],
    "model__learning_rate": [0.01, 0.05, 0.1],
    "model__max_depth": [3, 5, 7, 10],
    "model__subsample": [0.6, 0.7, 0.8, 1.0],
    "model__colsample_bytree": [0.6, 0.7, 0.8, 1.0],
    "model__reg_alpha": [0, 0.1, 0.5],
    "model__reg_lambda": [1, 1.5, 2]
}

xgb_random = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=xgb_params,
    n_iter=50,
    cv=5,
    n_jobs=-1,
    scoring="neg_mean_squared_error",
    random_state=42
)

xgb_random.fit(X_train, y_train)
print(xgb_random.best_params_)


{'model__subsample': 1.0, 'model__reg_lambda': 1.5, 'model__reg_alpha': 0.5, 'model__n_estimators': 300, 'model__max_depth': 3, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.7}


In [13]:
best_rf_model = rf_random.best_estimator_
# best_xgb_model = xgb_random.best_estimator_

In [None]:
# Comparing 
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score

# Random Forest
y_pred_rf = best_rf_model.predict(X_test)
print("RF RMSE:", root_mean_squared_error(y_test, y_pred_rf ))
print("RF MAE:", mean_absolute_error(y_test, y_pred_rf))
print("RF R2:", r2_score(y_test, y_pred_rf))

# XGBoost
y_pred_xgb = best_xgb_model.predict(X_test)
print("XGB RMSE:", root_mean_squared_error(y_test, y_pred_xgb))
print("XGB MAE:", mean_absolute_error(y_test, y_pred_xgb))
print("XGB R2:", r2_score(y_test, y_pred_xgb))


RF RMSE: 27.931286648112067
RF MAE: 10.136002029864143
RF R2: 0.8550573386036646
XGB RMSE: 29.362194061279297
XGB MAE: 11.017355918884277
XGB R2: 0.839826226234436


In [None]:

y_pred_rf = best_rf_model.predict(X_test)
y_pred_xgb = best_xgb_model.predict(X_test)


In [None]:
y_pred_ensemble = 0.8*y_pred_rf + 0.2*y_pred_xgb




In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

rmse = lambda y_true, y_pred: root_mean_squared_error(y_true, y_pred)

print("Ensemble RMSE:", rmse(y_test, y_pred_ensemble))
print("Ensemble MAE:", mean_absolute_error(y_test, y_pred_ensemble))
print("Ensemble R2:", r2_score(y_test, y_pred_ensemble))


Ensemble RMSE: 28.13319785157287
Ensemble MAE: 10.269836679439267
Ensemble R2: 0.8529542256497432


In [None]:
import joblib

class EnsembleModel:
    def __init__(self, rf_model, xgb_model, rf_weight=0.6, xgb_weight=0.4):
        self.rf_model = rf_model
        self.xgb_model = xgb_model
        self.rf_weight = rf_weight
        self.xgb_weight = xgb_weight

    def predict(self, X):
        y_rf = self.rf_model.predict(X)
        y_xgb = self.xgb_model.predict(X)
        return self.rf_weight * y_rf + self.xgb_weight * y_xgb


In [None]:

best_rf_model = rf_random.best_estimator_
best_xgb_model = xgb_random.best_estimator_

ensemble_model = EnsembleModel(best_rf_model, best_xgb_model)


In [None]:
joblib.dump(ensemble_model, "ensemble_dwell_time_model.pkl")


['ensemble_dwell_time_model.pkl']

In [15]:
import joblib
joblib.dump(best_rf_model,'rf_model.pkl')


['rf_model.pkl']

['TrainNo',
 'TrainType',
 'HaltStation',
 'PFNo',
 'BlockNo',
 'ApproachingBlockNo',
 'RunningStatus']