## Machine Learning Pipeline

In [6]:
import modin.pandas as pd
from modin.config import Engine
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score
from xgboost import XGBRegressor

# Configure Modin (imported above as `pd`) to run on the Dask engine.
Engine.put("dask")

In [7]:
# Directory holding the yellow-taxi trip parquet files.
PATH = "../DATA"

# January files for the years 2011, 2012 and 2013.
PATHS = [
    PATH + "/yellow_tripdata_201" + str(year_digit) + "-01.parquet"
    for year_digit in range(1, 4)
]

# Only the 2011 file is loaded for now.
temporary = "../DATA/yellow_tripdata_2011-01.parquet"
df = pd.read_parquet(temporary)



ArrowMemoryError: malloc of size 134217728 failed

In [None]:
# Display the dtype of every column in df.
df.dtypes

VendorID                          int64
tpep_pickup_datetime     datetime64[us]
tpep_dropoff_datetime    datetime64[us]
passenger_count                   int64
trip_distance                   float64
RatecodeID                        int64
store_and_fwd_flag               object
PULocationID                      int64
DOLocationID                      int64
payment_type                      int64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
airport_fee                     float64
dtype: object

In [None]:
# Summarise the loaded frame: row count, column labels, a blank separator
# line, then the first 20 rows.
print(f"File has {len(df)} entries")
print(df.columns, end="\n\n")
print(df.head(20))

File has 13464997 entries
Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

    VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0          2  2011-01-01 00:10:00   2011-01-01 00:12:00                4   
1          2  2011-01-01 00:04:00   2011-01-01 00:13:00                4   
2          2  2011-01-01 00:14:00   2011-01-01 00:16:00                4   
3          2  2011-01-01 00:04:00   2011-01-01 00:06:00                5   
4          2  2011-01-01 00:08:00   2011-01-01 00:08:00                5   
5          2  2011-01-01 00:23:00   2011-01-01 00:23:00                1   
6          2  2011-01-01 00:25:00   2011-01-01 00:25:00  

In [None]:
# Trip duration in seconds, derived from the pickup and dropoff timestamps.
trip_seconds = (df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']).dt.total_seconds()

# df is mutated in place, so this cell must be safe to re-run. Inserting again
# with allow_duplicates=True would add a second 'duration' column, and
# df[['duration', ...]] would then return duplicated columns. Overwrite the
# existing column instead; on the first run insert it at position 0 as before.
if 'duration' in df.columns:
    df['duration'] = trip_seconds
else:
    df.insert(loc=0, column='duration', value=trip_seconds)

In [None]:
# Columns not used as model features or target. The drop result is only
# displayed as the cell output; df itself is not reassigned.
unused_columns = [
    "VendorID",
    "passenger_count",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
    "airport_fee",
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
]
df.drop(columns=unused_columns)

Unnamed: 0,duration,trip_distance,fare_amount
0,120.0,0.0,2.9
1,540.0,0.0,5.7
2,120.0,0.0,2.9
3,120.0,0.0,2.9
4,0.0,0.0,2.5
...,...,...,...
13464992,142.0,0.3,3.3
13464993,665.0,1.8,7.7
13464994,639.0,1.8,7.7
13464995,705.0,3.1,10.5


In [None]:
# Model inputs are the trip duration and distance; the target is the fare.
feature_columns = ['duration', 'trip_distance']
target_column = 'fare_amount'

X = df[feature_columns]
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=28,
)

In [None]:
# Baseline regressor constructed with no explicit hyperparameters.
xgb_model = XGBRegressor()

# 5-fold cross-validation on the training split, scored by negative MSE.
cv_scores = cross_val_score(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

In [None]:
# Train the baseline XGBRegressor on the training split.
xgb_model.fit(X_train, y_train)

##### Hyperparameter tuning

In [None]:
# Candidate hyperparameter values; every combination is evaluated (3 x 3 x 3 = 27).
param_grid = {
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Exhaustive search with 3-fold cross-validation, scored by negative MSE.
grid_search = GridSearchCV(
    estimator=XGBRegressor(),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
)
grid_search.fit(X_train, y_train)

# Keep the winning estimator and its parameter set for the testing section.
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

MemoryError: Unable to allocate 27.4 MiB for an array with shape (3590665,) and data type float64

##### Testing

In [None]:
# Predict fares for the held-out rows with the tuned model.
y_pred = best_model.predict(X_test)

# Calculating metrics
# Only regression metrics apply here: fare_amount is a continuous target, and
# the classification metrics (accuracy / precision / recall) raise ValueError
# on continuous values, which stopped this cell before anything was reported.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
# RMSE is derived from the MSE rather than via the `squared=False` argument,
# which is deprecated and removed in recent scikit-learn releases.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("Best Parameters:", best_params)

print(f'MAE (Mean Absolute Error): {mae}')
print(f'MSE (Mean Squared Error): {mse}')
print(f'RMSE: {rmse}')
print(f'R^2: {r2}')