In [50]:
import pandas as pd
#import osmnx as ox
#import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb


In [51]:

df = pd.read_parquet(r'C:\Users\wikto\OneDrive\Dokumenty\AA_projects\road-optimization\data\preprocessed-dataset')

# Changing datatype to category to be sure it's not 'datetime64[us]'
for i in df:
    if df[i].dtype == 'object':
        df[i] = df[i].astype('category')

# Train & Test variables
Y = df['time_diffrence h']
X = df.drop(columns=['time_diffrence h'])
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=123)
print(X_train.dtypes)


trip_distance km                   float64
RatecodeID                         float64
congestion_surcharge               float64
PULocationID                         int32
PULBorough                        category
DOLocationID                         int32
DOLBorough                        category
fare_amount                        float64
extra                              float64
tolls_amount                       float64
Airport_fee                        float64
cbd_congestion_fee                 float64
average_speed km/h                 float64
Temperature                        float64
Snowfall                           float64
Showers                            float64
Rain                               float64
Precipitation                      float64
Wind_speed_10m                     float64
pickup_month                       float32
pickup_day                         float32
pickup_minutes_after_midnight      float32
dropoff_month                      float32
dropoff_day

In [52]:
print(df.dtypes)
# Converting dataset into DMatrix structure
xgb_train = xgb.DMatrix(X_train, Y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, Y_test, enable_categorical=True)

# Specifying params for our trees
params = {'objective': 'reg:squarederror',
          'learning_rate': 0.02,
          'max_depth': 4,
          'tree_method': 'hist',
          'min_child_weight': 8
          }

# Training model
nb = 700
evals_result = {}
watchlist = [(xgb_test, "test"), (xgb_train, "train")]
model = xgb.train(params=params, dtrain=xgb_train, num_boost_round=nb, evals=watchlist,
                  verbose_eval=100, early_stopping_rounds=50, evals_result=evals_result)

# Watching performance for evaluating

#metric_name = list(evals_result['test'].keys())[0]  
metric_name = 'rmse'

train_score = evals_result['train'][metric_name][-1]
test_score = evals_result['test'][metric_name][-1]

# Best last result
log_data = {
    "best_iteration": model.best_iteration,
    "best_score": model.best_score,
    "train_score": round(train_score, 4),
    "test_score": round(test_score, 4),
    "features": ", ".join(X.columns.tolist()),
    "params": str(params)
}

# Saving to *.txt
with open('model_summary.txt', 'a', encoding='utf-8') as txt_file:
    for key, value in log_data.items():
        txt_file.write(f"{key}: {value}\n")

y_pred_test = model.predict(xgb_test)

time_diffrence h                   float64
trip_distance km                   float64
RatecodeID                         float64
congestion_surcharge               float64
PULocationID                         int32
PULBorough                        category
DOLocationID                         int32
DOLBorough                        category
fare_amount                        float64
extra                              float64
tolls_amount                       float64
Airport_fee                        float64
cbd_congestion_fee                 float64
average_speed km/h                 float64
Temperature                        float64
Snowfall                           float64
Showers                            float64
Rain                               float64
Precipitation                      float64
Wind_speed_10m                     float64
pickup_month                       float32
pickup_day                         float32
pickup_minutes_after_midnight      float32
dropoff_mon

In [53]:
print(f"Średnia z czasu przejazdu {Y_train.mean()}")
print(f"Odcyhelnie standardowe: {Y_train.std()}")

Średnia z czasu przejazdu 0.24059080054268075
Odcyhelnie standardowe: 0.18843015232788624


In [60]:
# Residual analysis
print(f'Shape of Y_test: {Y_test.shape}')
print(f'Shape of y_pred_test: {y_pred_test.shape}')
print('-'*30)

residuals_test = round((Y_test - y_pred_test), 4)
print(f'Residuals for Test data: ')
residuals_test = residuals_test.to_frame(name='residuals in h')
print(residuals_test.head())
print('-'*30)

# Top 10 largest residuals
n = 800
print(f'Top {n} largest residuals:')
top_largest = residuals_test.nlargest(n, 'residuals in h')
print(top_largest)

target_user_id = 1249791
#row_position = Y_test.index.get_loc(target_user_id)
#print(f"Prediction for user {target_user_id}: {y_pred_test[row_position]}")


Shape of Y_test: (564789,)
Shape of y_pred_test: (564789,)
------------------------------
Residuals for Test data: 
         residuals in h
user_id                
1090813          0.0060
1999329          0.0075
2803333          0.0193
595057           0.0039
269474          -0.0021
------------------------------
Top 800 largest residuals:
         residuals in h
user_id                
572447           1.1191
1815856          1.0265
2693379          0.9635
2495106          0.9630
1209574          0.9058
...                 ...
472524           0.0943
2533710          0.0942
473638           0.0942
2145478          0.0942
147488           0.0941

[800 rows x 1 columns]


In [57]:
print(df.iloc[1815856])

time_diffrence h                   0.087222
trip_distance km                        0.7
RatecodeID                              1.0
congestion_surcharge                    0.0
PULocationID                            152
PULBorough                        Manhattan
DOLocationID                            166
DOLBorough                        Manhattan
fare_amount                             6.5
extra                                   0.0
tolls_amount                            0.0
Airport_fee                             0.0
cbd_congestion_fee                      0.0
average_speed km/h                     8.03
Temperature                            -1.5
Snowfall                                0.0
Showers                                 0.0
Rain                                    0.0
Precipitation                           0.0
Wind_speed_10m                         19.8
pickup_month                            1.0
pickup_day                             14.0
pickup_minutes_after_midnight   