<a href="https://colab.research.google.com/github/IT21238994/24-25J-169-iTranz/blob/RouteModel_IT21298394/calssificationandregressioncomparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error
import numpy as np

# Load Dataset
file_path = "bus_schedule_dataset.csv"  # Update the path as necessary
df = pd.read_csv(file_path)
print("Dataset Loaded Successfully!")

# **Step 1: Data Preprocessing**
print("Starting Data Preprocessing...")

# Convert Time to Hourly Format
df['Hour'] = pd.to_datetime(df['Time'], format='%I:%M %p').dt.hour

# Group by hour, starting point, and ending point to determine the best route
hourly_best_routes = df.groupby(['Day', 'Hour', 'Starting_Point', 'Ending_Point']) \
    .apply(lambda x: x.loc[x['Delay (min)'].idxmin(), 'Route']) \
    .reset_index(name='Best Route')

# Merge best route into the main dataset
df = df.merge(hourly_best_routes, on=['Day', 'Hour', 'Starting_Point', 'Ending_Point'], how='left')

# Encode categorical variables
label_encoders = {}
for col in ['Day', 'Day_Type', 'Starting_Point', 'Ending_Point', 'Weather', 'Holiday']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Convert Traffic_Level to numerical values
df['Traffic_Level'] = LabelEncoder().fit_transform(df['Traffic_Level'])

# Define features and target for classification
features_classification = ['Day', 'Hour', 'Day_Type', 'Starting_Point', 'Ending_Point', 'Traffic_Level',
                           'Distance (km)', 'Full Trip Time(min)', 'Seat_Availability (%)', 'Historical_Peak_Delay (min)']
target_classification = 'Best Route'

# Encode the target variable (Best Route) for classification
df[target_classification] = LabelEncoder().fit_transform(df[target_classification])

print("Data Preprocessing Completed!")

# **Step 2: Train Classification Model**
print("Training Classification Model...")
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(df[features_classification], df[target_classification], test_size=0.2, random_state=42)

classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_class, y_train_class)

# Predict best routes
y_pred_class = classifier.predict(X_test_class)
classification_accuracy = accuracy_score(y_test_class, y_pred_class)
print(f"Classification Model Accuracy: {classification_accuracy * 100:.2f}%")

# **Step 3: Train Regression Model**
print("Training Regression Model...")
features_regression = ['Day', 'Hour', 'Day_Type', 'Starting_Point', 'Ending_Point', 'Traffic_Level',
                       'Distance (km)', 'Full Trip Time(min)', 'Seat_Availability (%)', 'Historical_Peak_Delay (min)']
target_regression = 'Route Score'

# Generate a score for routes
df[target_regression] = (-df['Delay (min)']) + (-df['Full Trip Time(min)']) + df['Seat_Availability (%)']

# Handle missing values
df[target_regression].fillna(df[target_regression].median(), inplace=True)

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(df[features_regression], df[target_regression], test_size=0.2, random_state=42)

regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_reg, y_train_reg)

# Predict scores
y_pred_reg = regressor.predict(X_test_reg)
regression_error = mean_absolute_error(y_test_reg, y_pred_reg)
print(f"Regression Model Mean Absolute Error: {regression_error:.2f}")

# **Step 4: Display Results**
print("Model Training Completed!")
print("--- Final Model Performance ---")
print(f"Classification Accuracy: {classification_accuracy * 100:.2f}%")
print(f"Regression MAE: {regression_error:.2f}")


Dataset Loaded Successfully!
Starting Data Preprocessing...
Data Preprocessing Completed!
Training Classification Model...


  .apply(lambda x: x.loc[x['Delay (min)'].idxmin(), 'Route']) \


Classification Model Accuracy: 97.80%
Training Regression Model...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[target_regression].fillna(df[target_regression].median(), inplace=True)


Regression Model Mean Absolute Error: 2.53
Model Training Completed!
--- Final Model Performance ---
Classification Accuracy: 97.80%
Regression MAE: 2.53
