In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Input tables: the segment running-time records, and pivot_df, which provides
# the `deviceid` values used to filter df and is merged into it further down.
running_times_csv = '../DataOut/processed_data/bus_running_times_feature_added_all.csv'
pivot_csv = '../DataOut/pivot_df.csv'

df = pd.read_csv(running_times_csv)
pivot_df = pd.read_csv(pivot_csv)

In [3]:
# Distinct device ids listed in pivot_df; df is restricted to these below.
device_column = pivot_df['deviceid']
selected_deviceid = device_column.unique()
selected_deviceid

array([ 116,  117,  123,  128,  262,  264,  274,  279,  294,  505,  513,
       1143, 1358, 1377], dtype=int64)

In [4]:
# Discard every row that has a missing value in any column, then remove the
# 'run_time' and 'end_time' columns. Written as one chained expression that
# rebinds `df` instead of mutating it with inplace=True.
df = df.dropna().drop(columns=['run_time', 'end_time'])

In [5]:
# Keep only the rows whose device id also appears in pivot_df.
in_selected_devices = df['deviceid'].isin(selected_deviceid)
df = df[in_selected_devices]
df['deviceid'].unique()

array([ 262.,  274.,  123.,  279.,  264.,  294.,  128.,  505.,  116.,
        117., 1358., 1143.,  513., 1377.])

In [6]:
# List the distinct labels found in the 'conditions' column after filtering.
condition_labels = df['conditions'].unique()
print("Unique values in conditions: ", condition_labels)

Unique values in conditions:  ['Partially cloudy' 'Rain, Overcast' 'Overcast' 'Rain, Partially cloudy'
 'Clear' 'Rain']


In [7]:
# Parse 'date' into datetime values, and parse the 'HH:MM:SS' strings in
# 'start_time', keeping only their time-of-day part (datetime.time objects).
df['date'] = pd.to_datetime(df['date'])
parsed_start = pd.to_datetime(df['start_time'], format='%H:%M:%S')
df['start_time'] = parsed_start.dt.time


In [8]:
# Break the parsed date into separate numeric columns, appended in the
# order year, month, day.
date_parts = df['date'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(date_parts, part)

# The datetime column itself is not needed once it has been decomposed.
df = df.drop(columns='date')

In [9]:
def _clock_time_to_hours(clock_time):
    """Express a time-of-day value as fractional hours (hour + minute/60 + second/3600)."""
    return clock_time.hour + clock_time.minute/60.0 + clock_time.second/3600.0


# Numeric start time in hours; the original time objects are then dropped.
df['start_float'] = df['start_time'].apply(_clock_time_to_hours)
df = df.drop(columns=['start_time'])

In [10]:
# Normalise the condition labels by trimming surrounding whitespace, so that
# variants such as ' Partially cloudy' collapse to 'Partially cloudy'.
# An exact-match replace of that single spelling would miss trailing spaces or
# padding on any other label; such values would then be rejected as unknown
# categories by the fixed-category OrdinalEncoder applied in the next cell.
df['conditions'] = df['conditions'].str.strip()

In [11]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order handed to the encoder: a label's position in this
# list becomes its numeric code (0 for 'Rain' up to 5 for 'Clear').
conditions = [
    'Rain',
    'Rain, Overcast',
    'Rain, Partially cloudy',
    'Overcast',
    'Partially cloudy',
    'Clear',
]
encoder = OrdinalEncoder(categories=[conditions])
encoded_conditions = encoder.fit_transform(df[['conditions']])
df['conditions_encoded'] = encoded_conditions

# The text column is redundant once its numeric encoding exists.
df = df.drop(columns=['conditions'])

# XGBoost without clusters

In [12]:
# Restrict the modelling data to rows with direction == 1.
direction_is_one = df['direction'] == 1
df = df[direction_is_one]

In [13]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Baseline model: predict 'run_time_in_seconds' from every other column of df.
target = df['run_time_in_seconds']
features = df.drop(columns=['run_time_in_seconds'])

# Week-based hold-out: rows with week_no <= 36 train the model,
# rows with week_no > 36 are used for evaluation.
train_data = df[df['week_no'] <= 36]
test_data = df[df['week_no'] > 36]

y_train = train_data['run_time_in_seconds']
y_test = test_data['run_time_in_seconds']
X_train = train_data.drop(columns=['run_time_in_seconds'])
X_test = test_data.drop(columns=['run_time_in_seconds'])

# XGBoost regressor with a squared-error objective.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model.fit(X_train, y_train)

# Score the fitted model on the held-out weeks.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Square Error: {rmse}')

# To score unseen data later, build a DataFrame with the same columns as
# X_train and pass it to model.predict(...).

Mean Absolute Error: 33.3954266831142
Root Mean Square Error: 54.3347575017282


In [14]:
# Importance score of each input column, as reported by the fitted baseline model.
importance = model.feature_importances_

# Map every training column name to its score.
feature_importance = {name: value for name, value in zip(X_train.columns, importance)}

# Order the (feature, score) pairs from the highest score to the lowest.
sorted_features = sorted(
    feature_importance.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# One "name: score" line per feature.
for feature, score in sorted_features:
    print(f"{feature}: {score}")

length: 0.3192259669303894
rt(t-1): 0.16831713914871216
rt(w-2): 0.16576388478279114
rt(w-1): 0.14369727671146393
rt(w-3): 0.08556853234767914
rt(t-2): 0.032657839357852936
rt(n-3): 0.013848804868757725
Sunday/holiday: 0.007760612294077873
dt(n-1): 0.007424660958349705
month: 0.006134753115475178
rt(n-1): 0.005513362120836973
segment: 0.005395333748310804
trip_id: 0.004348497837781906
saturday: 0.004033662844449282
temp: 0.003769027069211006
day_of_week: 0.003767871530726552
conditions_encoded: 0.0034538893960416317
start_float: 0.003439640859141946
day: 0.003426200244575739
rt(n-2): 0.0034071679692715406
precip: 0.0033848609309643507
time_of_day: 0.00196266476996243
deviceid: 0.0019411962712183595
windspeed: 0.0017572023207321763
direction: 0.0
weekday/end: 0.0
week_no: 0.0
hour_of_day: 0.0
year: 0.0


# XGBoost with clusters

In [15]:
# Add pivot_df's columns to every row of df as extra features.
# Inner join on 'deviceid': only devices present in both frames are kept.
new_df = df.merge(pivot_df, how='inner', on='deviceid')

In [16]:
# Keep only the rows of the merged frame with direction == 1.
merged_direction_is_one = new_df['direction'] == 1
new_df = new_df[merged_direction_is_one]

In [17]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Cluster-augmented model: same target, split rule and hyper-parameters as the
# baseline cell, but trained on new_df (df plus the columns merged from pivot_df).
features1 = new_df.drop(['run_time_in_seconds'], axis=1)  # every column except the target
target1 = new_df['run_time_in_seconds']

# Week-based hold-out: week_no <= 36 for training, week_no > 36 for evaluation.
train_data1 = new_df[new_df['week_no'] <= 36]
test_data1 = new_df[new_df['week_no'] > 36]

X_train1, X_test1 = train_data1.drop(['run_time_in_seconds'], axis=1), test_data1.drop(['run_time_in_seconds'], axis=1)
y_train1, y_test1 = train_data1['run_time_in_seconds'], test_data1['run_time_in_seconds']

# XGBoost regressor with a squared-error objective.
model1 = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100, max_depth=5, learning_rate=0.1)

# Fit on the training weeks, then predict the held-out weeks.
model1.fit(X_train1, y_train1)
predictions1 = model1.predict(X_test1)

# Keep the held-out rows together with their predictions. Working on a copy
# leaves test_data1 itself unmodified.
predicted_df = test_data1.copy()
predicted_df['predicted_run_time'] = predictions1

# The metrics carry the same `1` suffix as every other name in this cell, so the
# baseline model's `mae` / `rmse` are not overwritten and both runs stay comparable.
mae1 = mean_absolute_error(y_test1, predictions1)
rmse1 = np.sqrt(mean_squared_error(y_test1, predictions1))

print(f'Mean Absolute Error: {mae1}')
print(f'Root Mean Square Error: {rmse1}')

# To score unseen data with this model, build a DataFrame with the same
# columns as X_train1 and pass it to model1.predict(...).

Mean Absolute Error: 33.458892976879
Root Mean Square Error: 54.16974363414823


In [18]:
# Importance score of each input column, as reported by the cluster-augmented model.
importance1 = model1.feature_importances_

# Map every training column name to its score.
feature_importance1 = {name: value for name, value in zip(X_train1.columns, importance1)}

# Order the (feature, score) pairs from the highest score to the lowest.
sorted_features1 = sorted(
    feature_importance1.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# One "name: score" line per feature.
for feature, score in sorted_features1:
    print(f"{feature}: {score}")

length: 0.313946396112442
rt(t-1): 0.18139252066612244
rt(w-2): 0.14870406687259674
rt(w-1): 0.1425199657678604
rt(w-3): 0.08965936303138733
rt(t-2): 0.03325464203953743
rt(n-3): 0.01241157203912735
dt(n-1): 0.00818333774805069
Sunday/holiday: 0.006375066004693508
month: 0.005853497423231602
segment: 0.005489332135766745
rt(n-1): 0.0054112207144498825
trip_id: 0.004726135637611151
temp: 0.004436873830854893
precip: 0.0037928642705082893
conditions_encoded: 0.003764417488127947
time_of_day: 0.003736112266778946
day: 0.0037075288128107786
rt(n-2): 0.0035849944688379765
day_of_week: 0.003548823297023773
start_float: 0.00334230181761086
windspeed: 0.0025074791628867388
TSCluster 0: 0.0021216007880866528
TSCluster 3: 0.002069223904982209
deviceid: 0.002029384719207883
TSCluster 2: 0.001957542495802045
TSCluster 1: 0.001473717507906258
direction: 0.0
saturday: 0.0
weekday/end: 0.0
week_no: 0.0
hour_of_day: 0.0
year: 0.0


In [19]:
# Display the held-out rows together with their 'predicted_run_time' column.
predicted_df

Unnamed: 0,trip_id,deviceid,direction,segment,run_time_in_seconds,length,day_of_week,time_of_day,Sunday/holiday,saturday,...,windspeed,dt(n-1),year,start_float,conditions_encoded,TSCluster 0,TSCluster 1,TSCluster 2,TSCluster 3,predicted_run_time
6040,23068.0,262.0,1.0,1.0,100.0,0.63,0.0,6.50,0.0,0,...,6.1,0.0,2022,6.527500,3.0,0.000000,36.190837,11.179357,52.629806,105.955620
6041,23068.0,262.0,1.0,2.0,207.0,1.28,0.0,6.50,0.0,0,...,6.1,105.0,2022,6.584444,3.0,0.000000,36.190837,11.179357,52.629806,252.812256
6042,23068.0,262.0,1.0,3.0,480.0,2.11,0.0,6.50,0.0,0,...,6.1,15.0,2022,6.646111,3.0,0.000000,36.190837,11.179357,52.629806,481.192261
6043,23068.0,262.0,1.0,4.0,197.0,1.55,0.0,6.75,0.0,0,...,6.1,56.0,2022,6.795000,3.0,0.000000,36.190837,11.179357,52.629806,217.610718
6044,23068.0,262.0,1.0,5.0,100.0,0.84,0.0,6.75,0.0,0,...,6.1,0.0,2022,6.849722,3.0,0.000000,36.190837,11.179357,52.629806,126.057121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
78521,25350.0,1377.0,1.0,11.0,151.0,1.10,1.0,15.00,0.0,0,...,9.0,0.0,2022,15.227500,3.0,1.100556,50.574607,25.139242,23.185595,182.463150
78522,25350.0,1377.0,1.0,12.0,165.0,1.31,1.0,15.25,0.0,0,...,9.0,0.0,2022,15.269444,3.0,1.100556,50.574607,25.139242,23.185595,175.038116
78523,25350.0,1377.0,1.0,13.0,104.0,1.15,1.0,15.25,0.0,0,...,9.0,0.0,2022,15.315278,3.0,1.100556,50.574607,25.139242,23.185595,160.168594
78524,25350.0,1377.0,1.0,14.0,80.0,0.89,1.0,15.25,0.0,0,...,9.0,0.0,2022,15.344167,3.0,1.100556,50.574607,25.139242,23.185595,116.873428


In [20]:
# Persist the held-out rows with their predictions; the row index is not written.
predictions_csv = '../DataOut/runtime_with_predictions.csv'
predicted_df.to_csv(predictions_csv, index=False)