In [50]:
import numpy as np 
import pandas as pd 

In [51]:
# Read the processed bus running-times CSV into `df`; every later cell
# in this notebook works on this frame.
df = pd.read_csv(
    '../DataOut/processed_data/bus_running_times_feature_added_all.csv',
)


In [52]:
# Discard every row that contains a missing value first, then remove the
# 'run_time' and 'end_time' columns. The order matters: rows whose only
# missing value sits in one of those two columns are still dropped.
df = df.dropna().drop(columns=['run_time', 'end_time'])

In [53]:
#unique values in conditions
# Show the distinct labels present in the 'conditions' column.
unique_conditions = df['conditions'].unique()
print("Unique values in conditions: ", unique_conditions)

Unique values in conditions:  ['Partially cloudy' 'Rain, Overcast' 'Overcast' 'Rain, Partially cloudy'
 'Clear' 'Rain']


In [54]:
# Parse 'date' into pandas datetimes so the .dt accessor can be used on it.
df['date'] = pd.to_datetime(df['date'])

# Parse 'start_time' as HH:MM:SS, then keep only the time-of-day part
# (datetime.time objects) and discard the placeholder date.
parsed_start = pd.to_datetime(df['start_time'], format='%H:%M:%S')
df['start_time'] = parsed_start.dt.time


In [55]:
# Break the parsed 'date' into numeric year / month / day columns (appended
# in that order), then remove the original datetime column.
date_parts = df['date'].dt
df = df.assign(
    year=date_parts.year,
    month=date_parts.month,
    day=date_parts.day,
).drop(columns='date')

In [56]:
def _hours_since_midnight(t):
    """Return a time-of-day as fractional hours, e.g. 06:30:00 -> 6.5."""
    return t.hour + t.minute/60.0 + t.second/3600.0


# Numeric version of the start time, in hours.
df['start_float'] = df['start_time'].map(_hours_since_midnight)

# The datetime.time column is no longer needed once the float version exists.
df = df.drop(columns=['start_time'])

In [57]:
# Map the label ' Partially cloudy' (leading space) to 'Partially cloudy'.
# The match is on the whole cell value, not on substrings, so combined labels
# such as 'Rain, Partially cloudy' are left untouched.
df['conditions'] = df['conditions'].replace({' Partially cloudy': 'Partially cloudy'})

In [58]:
from sklearn.preprocessing import OrdinalEncoder

# The position in this list is the integer code the encoder assigns:
# 'Rain' -> 0, ..., 'Clear' -> 5.
conditions = [
    'Rain',
    'Rain, Overcast',
    'Rain, Partially cloudy',
    'Overcast',
    'Partially cloudy',
    'Clear',
]
encoder = OrdinalEncoder(categories=[conditions])
encoded_conditions = encoder.fit_transform(df[['conditions']])

# Keep only the numeric encoding; the text column is dropped.
df = df.assign(conditions_encoded=encoded_conditions).drop(columns=['conditions'])

# XGBoost without clusters

In [59]:
# Keep only the rows recorded for direction 1; everything below models this subset.
df = df.loc[df['direction'].eq(1)]

In [60]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictors are every column except the target, run_time_in_seconds.
target = df['run_time_in_seconds']
features = df.drop(columns=['run_time_in_seconds'])

# Week-based hold-out: weeks up to and including 36 are used for training,
# later weeks for testing.
in_train = df['week_no'] <= 36
in_test = df['week_no'] > 36
train_data, test_data = df[in_train], df[in_test]

X_train, y_train = features[in_train], target[in_train]
X_test, y_test = features[in_test], target[in_test]

# Gradient-boosted regression trees with a squared-error objective.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model.fit(X_train, y_train)

# Score the held-out weeks.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Square Error: {rmse}')

# The fitted `model` can score new rows with model.predict(new_data), as long
# as new_data has the same columns as X_train.

Mean Absolute Error: 34.03501728604507
Root Mean Square Error: 53.775282117909185


In [61]:
# Importance score of each training column, as reported by the fitted model.
importance = model.feature_importances_

# Pair every column name with its score.
feature_importance = {
    name: weight for name, weight in zip(X_train.columns, importance)
}

# Highest score first; sorting on the negated score is stable, so features
# with equal scores keep their column order.
sorted_features = sorted(feature_importance.items(), key=lambda pair: -pair[1])

# One "name: score" line per feature.
for feature, score in sorted_features:
    print(f"{feature}: {score}")

length: 0.32432883977890015
rt(w-3): 0.18996544182300568
rt(w-1): 0.13857701420783997
rt(t-1): 0.13345922529697418
rt(w-2): 0.07970735430717468
rt(t-2): 0.05785393342375755
rt(n-3): 0.013545498251914978
dt(n-1): 0.007554370444267988
month: 0.006062169559299946
segment: 0.005370727740228176
Sunday/holiday: 0.005176009610295296
rt(n-1): 0.005118425469845533
trip_id: 0.004405011422932148
day: 0.004114991519600153
start_float: 0.003477453952655196
rt(n-2): 0.0033529289066791534
temp: 0.0031491739209741354
day_of_week: 0.002973935566842556
conditions_encoded: 0.0026598237454891205
precip: 0.0025684149004518986
windspeed: 0.0017608441412448883
time_of_day: 0.0017533233622089028
saturday: 0.0016550726722925901
deviceid: 0.0014099752297624946
direction: 0.0
weekday/end: 0.0
week_no: 0.0
hour_of_day: 0.0
year: 0.0


# XGBoost with clusters

In [None]:
# Keep only the rows recorded for direction 1.
# NOTE(review): the same filter is already applied in an earlier cell, so this
# selects every remaining row when the notebook is run top to bottom.
df = df.loc[df['direction'].eq(1)]

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

# NOTE(review): this cell has the same code as the training cell in the
# "without clusters" section, and no cluster column is added to `df` anywhere
# in the visible notebook, so it fits the same model on the same data.
# TODO: add the cluster feature(s) before this cell.

# Predictors are every column except the target, run_time_in_seconds.
target = df['run_time_in_seconds']
features = df.drop(columns=['run_time_in_seconds'])

# Week-based hold-out: weeks up to and including 36 are used for training,
# later weeks for testing.
in_train = df['week_no'] <= 36
in_test = df['week_no'] > 36
train_data, test_data = df[in_train], df[in_test]

X_train, y_train = features[in_train], target[in_train]
X_test, y_test = features[in_test], target[in_test]

# Gradient-boosted regression trees with a squared-error objective.
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
)
model.fit(X_train, y_train)

# Score the held-out weeks.
predictions = model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))

print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Square Error: {rmse}')

# The fitted `model` can score new rows with model.predict(new_data), as long
# as new_data has the same columns as X_train.

Mean Absolute Error: 34.03501728604507
Root Mean Square Error: 53.775282117909185


In [None]:
# Importance score of each training column, as reported by the fitted model.
# NOTE(review): same code as the importance cell in the "without clusters"
# section; it reports on whichever `model` was fitted most recently.
importance = model.feature_importances_

# Pair every column name with its score.
feature_importance = {
    name: weight for name, weight in zip(X_train.columns, importance)
}

# Highest score first; sorting on the negated score is stable, so features
# with equal scores keep their column order.
sorted_features = sorted(feature_importance.items(), key=lambda pair: -pair[1])

# One "name: score" line per feature.
for feature, score in sorted_features:
    print(f"{feature}: {score}")

length: 0.32432883977890015
rt(w-3): 0.18996544182300568
rt(w-1): 0.13857701420783997
rt(t-1): 0.13345922529697418
rt(w-2): 0.07970735430717468
rt(t-2): 0.05785393342375755
rt(n-3): 0.013545498251914978
dt(n-1): 0.007554370444267988
month: 0.006062169559299946
segment: 0.005370727740228176
Sunday/holiday: 0.005176009610295296
rt(n-1): 0.005118425469845533
trip_id: 0.004405011422932148
day: 0.004114991519600153
start_float: 0.003477453952655196
rt(n-2): 0.0033529289066791534
temp: 0.0031491739209741354
day_of_week: 0.002973935566842556
conditions_encoded: 0.0026598237454891205
precip: 0.0025684149004518986
windspeed: 0.0017608441412448883
time_of_day: 0.0017533233622089028
saturday: 0.0016550726722925901
deviceid: 0.0014099752297624946
direction: 0.0
weekday/end: 0.0
week_no: 0.0
hour_of_day: 0.0
year: 0.0
