In [24]:
# Downloading the dataset
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet -P /workspaces/mlops-learning/01-intro/data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet -P /workspaces/mlops-learning/01-intro/data


# Download 2021 data
!wget https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet -P /workspaces/mlops-learning/01-intro/data

--2024-08-29 16:26:34--  https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2021-01.parquet
Resolving d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)... 3.164.82.112, 3.164.82.197, 3.164.82.40, ...
Connecting to d37ci6vzurychx.cloudfront.net (d37ci6vzurychx.cloudfront.net)|3.164.82.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 21686067 (21M) [application/x-www-form-urlencoded]
Saving to: ‘/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-01.parquet’


2024-08-29 16:26:35 (15.5 MB/s) - ‘/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2021-01.parquet’ saved [21686067/21686067]



To check the indepth version of Training and Prediction, check 01-intro folder

In [2]:
# importing all the necesasry libraries

import pickle
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import root_mean_squared_error


from sklearn.model_selection import cross_val_score, KFold

In [12]:
# a function to read the data, preprocess it and return it
def read_and_preprocess(filename):
    data = pd.read_parquet(filename)
    
    # create the target variable
    data['ride_duration'] = data['tpep_dropoff_datetime'] - data['tpep_pickup_datetime'] 
    data['ride_duration'] = data['ride_duration'].apply(lambda x: x.total_seconds()/60) 

    # take only the data below 1 hour
    data = data[(data['ride_duration'] >= 1) & (data['ride_duration'] <= 60)]

    # sample the data to 70k rows
    sampled_data = data.iloc[:70000,:].copy()
    
    # chosing categorical
    categorical = ['PULocationID', 'DOLocationID']

    # convert these numerical categorical features to string categorical features
    sampled_data[categorical] = sampled_data[categorical].astype(str)


    return sampled_data



In [13]:
sampled_data = read_and_preprocess('/workspaces/mlops-learning/01-intro/data/yellow_tripdata_2024-01.parquet')

In [14]:
# chosing categorical and numerical features
categorical = ['PULocationID', 'DOLocationID']
numerical = ['trip_distance']

# creating the training data with the features
train_data = sampled_data[categorical + numerical]

# storing our target variable
target = 'ride_duration'
y_train = sampled_data[target].values

In [15]:
# to use the DictVectorizer, we need to convert the dataframe to dict
train_dicts = train_data.to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_train

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 210000 stored elements and shape (70000, 447)>

In [16]:
# train a LinearRegression Model

lr = LinearRegression()

lr.fit(X_train, y_train)

# make predictions on train data
y_pred = lr.predict(X_train)

# calculate the metrics
root_mean_squared_error(y_train, y_pred) # squared set to False implies we are using RMSE instead MSE

np.float64(5.797042749628479)

In [3]:
# sns.displot(y_train, kde=True, stat='density',kde_kws=dict(cut=3), label='Actual')
# sns.displot(y_pred, kde=True, stat='density',kde_kws=dict(cut=3), label='Predicted') 

# sns.distplot(y_train, kde=True, label='Actual')
# sns.distplot(y_pred, kde=True, label='Predicted') 

sns.kdeplot(y_train,label='Actual')
sns.kdeplot(y_pred,label='Predicted') 

plt.legend()


NameError: name 'y_train' is not defined

### K-Fold Cross Valiation

In [24]:
# Convert the dataset to dictionaries
train_dicts = train_data.to_dict(orient='records')

# Vectorize the data
dv = DictVectorizer()
X = dv.fit_transform(train_dicts)
y = sampled_data['ride_duration'].values

# Initialize the Linear Regression model
lr = LinearRegression()

# Set up 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
cv_rmse = np.sqrt(-cross_val_score(lr, X, y, cv=kf, scoring='neg_mean_squared_error'))

print(f'Cross-Validation RMSE: {cv_rmse}')
print(f'Average Cross-Validation RMSE: {cv_rmse.mean()}')

Cross-Validation RMSE: [5.87214604 5.94142789 5.96554232 5.80563044 5.80900893]
Average Cross-Validation RMSE: 5.87875112379174


`scoring='neg_mean_squared_error'`: This tells cross_val_score to evaluate the model using the Negative Mean Squared Error (MSE) as the scoring metric. The reason for using "negative" MSE is that cross_val_score by default assumes higher scores are better, so negative values allow it to maintain consistency (since MSE is a loss function, lower is better).

`-cross_val_score(...):`

The scores returned by cross_val_score are negative because of the `neg_mean_squared_error` scoring. By negating them (multiplying by -1), you get the actual Mean Squared Error (MSE) values for each fold. If it's negative we cannot take the square root of it

MSE is the average of the squared differences between the predicted and actual values. RMSE is the square root of the MSE, which is more interpretable because it’s in the same units as the target variable (ride_duration in this case).


In [26]:
# Perform cross-validation - Without Scoring Variable
# if scoring is not mentioned, the default score func of estimator will be used
# the default scorer for LinearReg is R Square i.e. (1 - u/v), where
# "u" is he residual sum of squares ((y_true - y_pred)** 2).sum() and is the total sum of squares ((y_true - y_true.mean()) ** 2).sum()
# and the best possible score is 1.0 and worst is 0
cv_rmse = np.sqrt(cross_val_score(lr, X, y, cv=kf))

print(f'Cross-Validation RMSE: {cv_rmse}')
print(f'Average Cross-Validation RMSE: {cv_rmse.mean()}')

Cross-Validation RMSE: [0.82801636 0.81976742 0.82515868 0.82878804 0.83299871]
Average Cross-Validation RMSE: 0.826945840395376
