In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb


# Data viz

In [3]:
file = "../raw_data/mta_1706.csv"
data = pd.read_csv(file, on_bad_lines="skip")
data.head(10)

Unnamed: 0,RecordedAtTime,DirectionRef,PublishedLineName,OriginName,OriginLat,OriginLong,DestinationName,DestinationLat,DestinationLong,VehicleRef,VehicleLocation.Latitude,VehicleLocation.Longitude,NextStopPointName,ArrivalProximityText,DistanceFromStop,ExpectedArrivalTime,ScheduledArrivalTime
0,2017-06-01 00:03:34,0,B8,4 AV/95 ST,40.616104,-74.031143,BROWNSVILLE ROCKAWAY AV,40.656048,-73.907379,NYCT_430,40.63517,-73.960803,FOSTER AV/E 18 ST,approaching,76.0,2017-06-01 00:03:59,24:06:14
1,2017-06-01 00:03:43,1,S61,ST GEORGE FERRY/S61 & S91,40.643169,-74.073494,S I MALL YUKON AV,40.575935,-74.167686,NYCT_8263,40.590802,-74.15834,MERRYMOUNT ST/TRAVIS AV,approaching,62.0,2017-06-01 00:03:56,23:58:02
2,2017-06-01 00:03:49,0,Bx10,E 206 ST/BAINBRIDGE AV,40.875008,-73.880142,RIVERDALE 263 ST,40.912376,-73.902534,NYCT_4223,40.88601,-73.912647,HENRY HUDSON PKY E/W 235 ST,at stop,5.0,2017-06-01 00:03:56,24:00:53
3,2017-06-01 00:03:31,0,Q5,TEARDROP/LAYOVER,40.701748,-73.802399,ROSEDALE LIRR STA via MERRICK,40.666012,-73.735939,NYCT_8422,40.668002,-73.729348,HOOK CREEK BL/SUNRISE HY,< 1 stop away,267.0,2017-06-01 00:04:03,24:03:00
4,2017-06-01 00:03:22,1,Bx1,RIVERDALE AV/W 231 ST,40.881187,-73.90934,MOTT HAVEN 136 ST via CONCOURSE,40.809654,-73.92836,NYCT_4710,40.868134,-73.893032,GRAND CONCOURSE/E 196 ST,at stop,11.0,2017-06-01 00:03:56,23:59:38
5,2017-06-01 00:03:40,0,M1,4 AV/E 10 ST,40.731342,-73.990288,HARLEM 147 ST via MADISON,40.82111,-73.935898,NYCT_3831,40.792897,-73.950023,MADISON AV/E 106 ST,approaching,73.0,2017-06-01 00:03:56,24:02:35
6,2017-06-01 00:03:24,0,B31,GERRITSEN AV/GERRITSEN BEACH,40.587101,-73.918503,MIDWOOD KINGS HWY STA,40.608433,-73.9571,NYCT_4611,40.587024,-73.918623,GERRITSEN AV/GERRITSEN BEACH,at stop,0.0,,24:08:00
7,2017-06-01 00:03:29,0,B83,GATEWAY CTR TERM/GATEWAY DR,40.652649,-73.877029,BWAY JCT VN SNDRN AV,40.678139,-73.903572,NYCT_4841,40.648801,-73.882682,PENNSYLVANIA AV/DELMAR LOOP N,< 1 stop away,196.0,2017-06-01 00:04:13,23:58:47
8,2017-06-01 00:03:27,0,B82,STILLWELL TERMINAL BUS LOOP,40.57708,-73.981293,SPRING CRK TWRS SEAVIEW AV via KINGS HWY,40.64299,-73.878326,NYCT_6592,40.632258,-73.918318,FLATLANDS AV/RALPH AV,approaching,35.0,2017-06-01 00:03:56,24:00:00
9,2017-06-01 00:03:51,1,S59,RICHMOND TER/PARK AV #3,40.640167,-74.130966,HYLAN BL,40.53426,-74.154213,NYCT_8279,40.590689,-74.165811,RICHMOND AV/NOME AV,approaching,31.0,2017-06-01 00:03:56,24:01:14


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6730436 entries, 0 to 6730435
Data columns (total 17 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   RecordedAtTime             object 
 1   DirectionRef               int64  
 2   PublishedLineName          object 
 3   OriginName                 object 
 4   OriginLat                  float64
 5   OriginLong                 float64
 6   DestinationName            object 
 7   DestinationLat             float64
 8   DestinationLong            float64
 9   VehicleRef                 object 
 10  VehicleLocation.Latitude   float64
 11  VehicleLocation.Longitude  float64
 12  NextStopPointName          object 
 13  ArrivalProximityText       object 
 14  DistanceFromStop           float64
 15  ExpectedArrivalTime        object 
 16  ScheduledArrivalTime       object 
dtypes: float64(7), int64(1), object(9)
memory usage: 872.9+ MB


In [5]:
data.dtypes

RecordedAtTime                object
DirectionRef                   int64
PublishedLineName             object
OriginName                    object
OriginLat                    float64
OriginLong                   float64
DestinationName               object
DestinationLat               float64
DestinationLong              float64
VehicleRef                    object
VehicleLocation.Latitude     float64
VehicleLocation.Longitude    float64
NextStopPointName             object
ArrivalProximityText          object
DistanceFromStop             float64
ExpectedArrivalTime           object
ScheduledArrivalTime          object
dtype: object

## Convert Time Columns to Datetime Format

In [6]:
data['RecordedAtTime'] = pd.to_datetime(data['RecordedAtTime'])
data['ExpectedArrivalTime'] = pd.to_datetime(data['ExpectedArrivalTime'])

In [7]:
# Using Vectorized Operations
# since the db is very very large and classical python iterations is too slow.

times = data['ScheduledArrivalTime'].str.split(':', expand=True)
times = times.fillna('0').astype(float)
hours, minutes, seconds = times[0], times[1], times[2]

new_hours = np.where(hours >= 24, hours - 24, hours)
days_to_add = np.where(hours >= 24, 1, 0)

valid_rows = ~hours.isna() & ~minutes.isna() & ~seconds.isna()

data.loc[valid_rows, 'ScheduledArrivalTime'] = (
    pd.to_datetime(
        new_hours.astype(int).astype(str) + ':' +
        minutes.astype(int).astype(str) + ':' +
        seconds.astype(int).astype(str),
        format='%H:%M:%S'
    ) + pd.to_timedelta(days_to_add[valid_rows], unit='days')
)

data.loc[~valid_rows, 'ScheduledArrivalTime'] = pd.NaT

In [8]:
data['ScheduledArrivalTime'] = pd.to_datetime(data['ScheduledArrivalTime'])

In [9]:
data['ScheduledArrivalTime'] = (
    pd.to_datetime(data['RecordedAtTime'].dt.date.astype(str) + ' ' + data['ScheduledArrivalTime'].dt.time.astype(str))
)
data['ScheduledArrivalTime']

0         2017-06-01 00:06:14
1         2017-06-01 23:58:02
2         2017-06-01 00:00:53
3         2017-06-01 00:03:00
4         2017-06-01 23:59:38
                  ...        
6730431   2017-06-30 23:44:12
6730432   2017-06-30 00:02:00
6730433   2017-06-30 23:44:16
6730434   2017-06-30 23:50:00
6730435   2017-06-30 23:45:15
Name: ScheduledArrivalTime, Length: 6730436, dtype: datetime64[ns]

In [10]:
print(data['RecordedAtTime'].isna().sum())

0


In [11]:
print(data['ExpectedArrivalTime'].isna().sum())

872302


In [12]:
print(data['ScheduledArrivalTime'].isna().sum())

0


## Data Cleaning

In [13]:
# List of columns to drop
columns_to_drop = ['PublishedLineName', 'OriginName', 'DestinationName', 'VehicleRef', 'NextStopPointName', 'ArrivalProximityText']

# Drop the columns
data_cleaned = data.drop(columns=columns_to_drop)

In [14]:
# Drop rows where 'ExpectedArrivalTime' is missing (since it's needed for target)
data_cleaned = data_cleaned.dropna()

In [15]:
# Inspect the data after handling missing values
data_cleaned.isnull().sum()

RecordedAtTime               0
DirectionRef                 0
OriginLat                    0
OriginLong                   0
DestinationLat               0
DestinationLong              0
VehicleLocation.Latitude     0
VehicleLocation.Longitude    0
DistanceFromStop             0
ExpectedArrivalTime          0
ScheduledArrivalTime         0
dtype: int64

In [16]:
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5804368 entries, 0 to 6730435
Data columns (total 11 columns):
 #   Column                     Dtype         
---  ------                     -----         
 0   RecordedAtTime             datetime64[ns]
 1   DirectionRef               int64         
 2   OriginLat                  float64       
 3   OriginLong                 float64       
 4   DestinationLat             float64       
 5   DestinationLong            float64       
 6   VehicleLocation.Latitude   float64       
 7   VehicleLocation.Longitude  float64       
 8   DistanceFromStop           float64       
 9   ExpectedArrivalTime        datetime64[ns]
 10  ScheduledArrivalTime       datetime64[ns]
dtypes: datetime64[ns](3), float64(7), int64(1)
memory usage: 531.4 MB


In [17]:
# Remove duplicate rows if any
data_cleaned = data_cleaned.drop_duplicates()

In [18]:
# Calculate the time difference between ExpectedArrivalTime and ScheduledArrivalTime
data_cleaned['TimeDifference'] = (data_cleaned['ExpectedArrivalTime'] - data_cleaned['ScheduledArrivalTime']).dt.total_seconds() / 60

# Inspect the data with the new target column
data_cleaned[['ExpectedArrivalTime', 'ScheduledArrivalTime', 'TimeDifference']].head(10)

Unnamed: 0,ExpectedArrivalTime,ScheduledArrivalTime,TimeDifference
0,2017-06-01 00:03:59,2017-06-01 00:06:14,-2.25
1,2017-06-01 00:03:56,2017-06-01 23:58:02,-1434.1
2,2017-06-01 00:03:56,2017-06-01 00:00:53,3.05
3,2017-06-01 00:04:03,2017-06-01 00:03:00,1.05
4,2017-06-01 00:03:56,2017-06-01 23:59:38,-1435.7
5,2017-06-01 00:03:56,2017-06-01 00:02:35,1.35
7,2017-06-01 00:04:13,2017-06-01 23:58:47,-1434.566667
8,2017-06-01 00:03:56,2017-06-01 00:00:00,3.933333
9,2017-06-01 00:03:56,2017-06-01 00:01:14,2.7
10,2017-06-01 00:04:29,2017-06-01 23:48:35,-1424.1


In [19]:
# Removing outliers by keeping values that are between -100 and 100 minutes
Q1 = data_cleaned["TimeDifference"].quantile(0.25)
Q3 = data_cleaned["TimeDifference"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
data_cleaned = data_cleaned[data_cleaned["TimeDifference"].between(lower_bound, upper_bound, inclusive="both")]

In [20]:
# Reset index after cleaning
data_cleaned = data_cleaned.reset_index(drop=True)

In [21]:
print(data_cleaned.columns)


Index(['RecordedAtTime', 'DirectionRef', 'OriginLat', 'OriginLong',
       'DestinationLat', 'DestinationLong', 'VehicleLocation.Latitude',
       'VehicleLocation.Longitude', 'DistanceFromStop', 'ExpectedArrivalTime',
       'ScheduledArrivalTime', 'TimeDifference'],
      dtype='object')


## Feature engineering

## Set target and features

In [25]:
#Run the cyclical cell to run this one first (sorry for the inconvenience)

y = data_cleaned["TimeDifference"]

X = data_cleaned[[
    "DirectionRef",
    "OriginLat",
    "OriginLong",
    "DestinationLat",
    "DestinationLong",
    "VehicleLocation.Latitude",
    "VehicleLocation.Longitude",
    "DistanceFromStop",
    "Day_sin",
    "Day_cos",
    "Hour_sin",
    "Hour_cos",
    "Minute_sin",
    "Minute_cos",
]]

In [26]:
X.info(), y.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5368352 entries, 0 to 5368351
Data columns (total 14 columns):
 #   Column                     Dtype  
---  ------                     -----  
 0   DirectionRef               int64  
 1   OriginLat                  float64
 2   OriginLong                 float64
 3   DestinationLat             float64
 4   DestinationLong            float64
 5   VehicleLocation.Latitude   float64
 6   VehicleLocation.Longitude  float64
 7   DistanceFromStop           float64
 8   Day_sin                    float64
 9   Day_cos                    float64
 10  Hour_sin                   float64
 11  Hour_cos                   float64
 12  Minute_sin                 float64
 13  Minute_cos                 float64
dtypes: float64(13), int64(1)
memory usage: 573.4 MB
<class 'pandas.core.series.Series'>
RangeIndex: 5368352 entries, 0 to 5368351
Series name: TimeDifference
Non-Null Count    Dtype  
--------------    -----  
5368352 non-null  float64
dtypes: fl

(None, None)

## Feature scaling

In [27]:
# Inspect the shapes of X and y
print(X.shape, y.shape)

(5368352, 14) (5368352,)


In [28]:
X.head(5)

Unnamed: 0,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,Day_sin,Day_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos
0,0,40.616104,-74.031143,40.656048,-73.907379,40.63517,-73.960803,76.0,0.204552,0.978856,0.0,1.0,0.309017,0.951057
1,0,40.875008,-73.880142,40.912376,-73.902534,40.88601,-73.912647,5.0,0.204552,0.978856,0.0,1.0,0.309017,0.951057
2,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,0.204552,0.978856,0.0,1.0,0.406737,0.913545
3,0,40.731342,-73.990288,40.82111,-73.935898,40.792897,-73.950023,73.0,0.204552,0.978856,0.0,1.0,0.309017,0.951057
4,0,40.57708,-73.981293,40.64299,-73.878326,40.632258,-73.918318,35.0,0.204552,0.978856,0.0,1.0,0.309017,0.951057


In [29]:
X.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
DirectionRef,5368352.0,0.501647,0.499997,0.0,0.0,1.0,1.0,1.0
OriginLat,5368352.0,40.729948,0.09101,40.506882,40.657738,40.71523,40.80962,40.912365
OriginLong,5368352.0,-73.93037,0.094628,-74.248062,-73.987022,-73.93216,-73.87833,-73.701866
DestinationLat,5368352.0,40.729452,0.090349,40.508106,40.660854,40.71461,40.80908,40.912376
DestinationLong,5368352.0,-73.930431,0.094677,-74.248192,-73.988724,-73.93206,-73.87712,-73.701385
VehicleLocation.Latitude,5368352.0,40.72904,0.087242,40.502879,40.659091,40.72396,40.80479,40.912385
VehicleLocation.Longitude,5368352.0,-73.929923,0.088972,-74.252339,-73.978577,-73.93538,-73.88135,-73.701417
DistanceFromStop,5368352.0,234.427751,959.812906,0.0,32.0,104.0,211.0,33608.0
Day_sin,5368352.0,0.014976,0.710211,-0.999668,-0.697944,0.05147875,0.7338854,0.997018
Day_cos,5368352.0,-0.009583,0.703764,-0.998674,-0.716152,0.02574791,0.6792733,0.9947


## Standardising X

In [30]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)
X_scaled = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [31]:
df_combined = pd.concat([X_scaled, y], axis=1)
correlation_matrix = df_combined.corr()
correlation_matrix

Unnamed: 0,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,Day_sin,Day_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos,TimeDifference
DirectionRef,1.0,0.247657,0.263767,-0.24928,-0.272147,-0.001568,-0.007119,-0.003714,0.001489,0.001233,-0.01952,0.000422,0.000633,0.000416,0.017662
OriginLat,0.247657,1.0,0.421157,0.699022,0.304755,0.883836,0.387683,-0.051597,0.005406,0.002561,-0.049012,-0.005504,0.000711,0.000733,0.029065
OriginLong,0.263767,0.421157,1.0,0.305226,0.639664,0.393045,0.856672,-0.10727,0.002861,-0.001717,-0.02881,0.008466,-5.6e-05,0.00136,-0.004472
DestinationLat,-0.24928,0.699022,0.305226,1.0,0.424111,0.881841,0.38084,-0.048638,0.00262,0.001237,0.022388,-0.022613,0.000459,0.000562,-0.012116
DestinationLong,-0.272147,0.304755,0.639664,0.424111,1.0,0.384393,0.857776,-0.099506,0.00024,-0.002851,0.004538,0.008335,-0.000734,0.000502,-0.020241
VehicleLocation.Latitude,-0.001568,0.883836,0.393045,0.881841,0.384393,1.0,0.442149,-0.031181,0.004942,0.00235,-0.018564,-0.017196,0.000555,0.000839,0.008233
VehicleLocation.Longitude,-0.007119,0.387683,0.856672,0.38084,0.857776,0.442149,1.0,-0.107138,0.001738,-0.001928,-0.016253,0.008468,-0.000667,0.001158,-0.016737
DistanceFromStop,-0.003714,-0.051597,-0.10727,-0.048638,-0.099506,-0.031181,-0.107138,1.0,0.001091,-0.0023,0.005032,0.003776,-0.000115,1e-05,-0.011392
Day_sin,0.001489,0.005406,0.002861,0.00262,0.00024,0.004942,0.001738,0.001091,1.0,-0.007993,-0.004699,0.00363,-0.000801,-0.002138,-0.00012
Day_cos,0.001233,0.002561,-0.001717,0.001237,-0.002851,0.00235,-0.001928,-0.0023,-0.007993,1.0,-0.004554,0.004193,0.000995,0.001574,-0.015301


In [32]:
correlation = data_cleaned['DistanceFromStop'].corr(data_cleaned['ScheduledArrivalTime'])
print("Correlation between DistanceFromStop and TimeDifference:", correlation)

Correlation between DistanceFromStop and TimeDifference: 0.00030602803085938


# Cyclical Encoding for timetables

In [23]:
def cyclical_encoding(data, column, max_val, prefix):
    data[f"{prefix}_sin"] = np.sin(2 * np.pi * data[column] / max_val)
    data[f"{prefix}_cos"] = np.cos(2 * np.pi * data[column] / max_val)
    return data

data_cleaned['ArrivalDay'] = data_cleaned['ExpectedArrivalTime'].dt.day
data_cleaned['ArrivalHour'] = data_cleaned['ExpectedArrivalTime'].dt.hour
data_cleaned['ArrivalMinute'] = data_cleaned['ExpectedArrivalTime'].dt.minute
data_cleaned['ArrivalSecond'] = data_cleaned['ExpectedArrivalTime'].dt.second

data_cleaned = cyclical_encoding(data=data_cleaned, column="ArrivalDay", max_val=30.5, prefix="Day")
data_cleaned = cyclical_encoding(data=data_cleaned, column="ArrivalHour", max_val=24, prefix="Hour")
data_cleaned = cyclical_encoding(data=data_cleaned, column="ArrivalMinute", max_val=60, prefix="Minute")
data_cleaned = cyclical_encoding(data=data_cleaned, column="ArrivalSecond", max_val=60, prefix="Second")

In [24]:
data_cleaned.head(5)

Unnamed: 0,RecordedAtTime,DirectionRef,OriginLat,OriginLong,DestinationLat,DestinationLong,VehicleLocation.Latitude,VehicleLocation.Longitude,DistanceFromStop,ExpectedArrivalTime,...,ArrivalMinute,ArrivalSecond,Day_sin,Day_cos,Hour_sin,Hour_cos,Minute_sin,Minute_cos,Second_sin,Second_cos
0,2017-06-01 00:03:34,0,40.616104,-74.031143,40.656048,-73.907379,40.63517,-73.960803,76.0,2017-06-01 00:03:59,...,3,59,0.204552,0.978856,0.0,1.0,0.309017,0.951057,-0.104528,0.994522
1,2017-06-01 00:03:49,0,40.875008,-73.880142,40.912376,-73.902534,40.88601,-73.912647,5.0,2017-06-01 00:03:56,...,3,56,0.204552,0.978856,0.0,1.0,0.309017,0.951057,-0.406737,0.913545
2,2017-06-01 00:03:31,0,40.701748,-73.802399,40.666012,-73.735939,40.668002,-73.729348,267.0,2017-06-01 00:04:03,...,4,3,0.204552,0.978856,0.0,1.0,0.406737,0.913545,0.309017,0.951057
3,2017-06-01 00:03:40,0,40.731342,-73.990288,40.82111,-73.935898,40.792897,-73.950023,73.0,2017-06-01 00:03:56,...,3,56,0.204552,0.978856,0.0,1.0,0.309017,0.951057,-0.406737,0.913545
4,2017-06-01 00:03:27,0,40.57708,-73.981293,40.64299,-73.878326,40.632258,-73.918318,35.0,2017-06-01 00:03:56,...,3,56,0.204552,0.978856,0.0,1.0,0.309017,0.951057,-0.406737,0.913545


# Deploying baseline model

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

KeyboardInterrupt: 

In [None]:
X_train_sample = X_train[:10000]
y_train_sample = y_train[:10000]

In [None]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

: 

In [None]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}, RMSE: {rmse}, R2: {r2}")

NameError: name 'mean_absolute_error' is not defined

Model- Gradient boosting 

In [33]:
# Setting SEED for reproducibility
SEED = 23

In [34]:
# Splitting dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=SEED)

In [35]:
# Instantiate Gradient Boosting Regressor
gbr = GradientBoostingRegressor(loss='absolute_error', n_estimators=300, learning_rate=0.1, random_state=SEED, max_features=5, max_depth=1)

In [36]:
# Fit to training set
gbr.fit(X_train, y_train)

In [39]:
# Predict on test set
y_pred = gbr.predict(X_test)

In [55]:
# test set RMSE
test_rmse = mean_squared_error(y_test, y_pred) ** (1 / 2)

In [56]:
# Print rmse
print('Root mean Square error: {:.2f}'.format(test_rmse))

Root mean Square error: 5.89


In [67]:
r2 = r2_score(y_test, y_pred)
print({r2})

{-0.0622733978998915}


XG Boost

In [None]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 MB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hDownloading nvidia_nccl_cu12-2.23.4-py3-none-manylinux2014_x86_64.whl (199.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:02[0m
[?25hInstalling collected packages: nvidia-nccl-cu12, xgboost
Successfully installed nvidia-nccl-cu12-2.23.4 xgboost-2.1.3
Note: you may need to restart the kernel to use updated packages.


In [45]:
#Splitting the dataset into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=0)

In [None]:
# Initialize the XGBoost regressor model
XG_model = xgb.XGBRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42)

In [60]:
# Train the model on the training data
XG_model.fit(X_train, y_train)

In [61]:
# Make predictions on the test data
y_pred_XG = XG_model.predict(X_test)

In [62]:
# Calculate evaluation metrics
mae = mean_absolute_error(y_test, y_pred_XG)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_XG))
r2 = r2_score(y_test, y_pred_XG)

In [66]:
print("XGBoost - MAE:", mae)
print("XGBoost - RMSE:", rmse)
print("XGBoost - R²:", r2)

XGBoost - MAE: 4.079893407653279
XGBoost - RMSE: 5.275983012530777
XGBoost - R²: 0.1472125204580207


DNN Model

In [69]:
pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.18.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow)
  Using cached absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Using cached flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Using cached libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting opt-einsum>=2.3.2 (from tensorflow)
  Using cached opt_einsum-3.4.0-py3-none-any.whl.metadata (6.3 kB)
Collecting protobuf!=4.

[autoreload of numpy._core.multiarray failed: Traceback (most recent call last):
  File "/home/alice123/.pyenv/versions/3.10.6/envs/Smart_Public_Bus_Optimization_in_Mauritius/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/home/alice123/.pyenv/versions/3.10.6/envs/Smart_Public_Bus_Optimization_in_Mauritius/lib/python3.10/site-packages/IPython/extensions/autoreload.py", line 475, in superreload
    module = reload(module)
  File "/home/alice123/.pyenv/versions/3.10.6/lib/python3.10/importlib/__init__.py", line 169, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 619, in _exec
  File "<frozen importlib._bootstrap_external>", line 883, in exec_module
  File "<frozen importlib._bootstrap>", line 241, in _call_with_frames_removed
  File "/home/alice123/.pyenv/versions/3.10.6/envs/Smart_Public_Bus_Optimization_in_Mauritius/lib/python3.10/site-packages/numpy/_core

RecursionError: maximum recursion depth exceeded in comparison