In [1]:
import sklearn

In [2]:
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import numpy as np

In [3]:
# Display the version of pandas loaded in this kernel
pd.__version__


'2.0.3'

In [4]:
# Display the version of scikit-learn loaded in this kernel
sklearn.__version__

'1.2.2'

In [5]:
df = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet')

In [6]:
!pip install pyarrow



In [7]:
# Number of columns in the raw January frame (second entry of the shape tuple)
col = df.shape[1]
print(col)

19


In [8]:
# --- Trip duration ---------------------------------------------------------
# 'duration' holds the drop-off minus pick-up timedelta; 'duration_min' is the
# same quantity expressed as float minutes.
df['duration'] = df['tpep_dropoff_datetime'].sub(df['tpep_pickup_datetime'])
df['duration_min'] = df['duration'].dt.total_seconds().div(60)

# Spread of the raw (unfiltered) January durations, in minutes
std_duration_january = df['duration_min'].std()
print("Standard deviation of trips duration in January (in minutes):", std_duration_january)

# --- Outlier removal -------------------------------------------------------
# Keep only trips lasting between 1 and 60 minutes (both bounds included).
# .copy() detaches the subset so the column assignments below do not touch df.
original_count = df.shape[0]
df_filtered = df.loc[df['duration_min'].between(1, 60)].copy()
filtered_count = df_filtered.shape[0]

# Share of the original rows that survive the filter
fraction_left = filtered_count / original_count
print("Fraction of records left after dropping outliers:", fraction_left)

# --- Feature encoding ------------------------------------------------------
# Location IDs are cast to strings so DictVectorizer one-hot encodes them
# instead of treating them as numeric values.
for location_col in ('PULocationID', 'DOLocationID'):
    df_filtered[location_col] = df_filtered[location_col].astype(str)

# From here on 'duration' in the filtered frame means minutes, not a timedelta
df_filtered['duration'] = df_filtered['duration_min']

# One dict per trip: {'PULocationID': ..., 'DOLocationID': ...}
data_dicts = df_filtered[['PULocationID', 'DOLocationID']].to_dict('records')

# Fit the vectorizer on the January trips and encode them in one pass
vectorizer = DictVectorizer(sparse=True)
feature_matrix = vectorizer.fit_transform(data_dicts)

# Width of the encoded matrix = number of distinct (feature, value) pairs seen
print("Dimensionality of the feature matrix (number of columns):", feature_matrix.shape[1])

# --- Model -----------------------------------------------------------------
# Regression target: trip duration in minutes
y = df_filtered['duration']

# 80/20 split with a fixed seed; the 20% hold-out (X_test, y_test) is not used
# anywhere in the cells visible here.
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    y,
    test_size=0.2,
    random_state=42,
)

# Ordinary least squares on the training portion (fit returns the estimator)
model = LinearRegression().fit(X_train, y_train)

# In-sample predictions and their root-mean-squared error
y_train_pred = model.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
print("RMSE on the train data:", rmse_train)


Standard deviation of trips duration in January (in minutes): 34.851053592212814
Fraction of records left after dropping outliers: 0.9778326020432945
Dimensionality of the feature matrix (number of columns): 518
RMSE on the train data: 7.948999928312118


In [9]:
# Load the February 2024 yellow-taxi trips (the file is yellow_tripdata_2024-02);
# they are used as validation data for the model fitted on January.
df_feb = pd.read_parquet('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet')

# Calculate 'duration' as the difference between drop-off and pick-up times
df_feb['duration'] = (df_feb['tpep_dropoff_datetime'] - df_feb['tpep_pickup_datetime'])

# Convert duration from timedelta to minutes
df_feb['duration_min'] = df_feb['duration'].dt.total_seconds() / 60

# Apply the same outlier rule as for the January data: keep trips lasting
# between 1 and 60 minutes (inclusive). .copy() avoids modifying df_feb below.
df_feb_filtered = df_feb[(df_feb['duration_min'] >= 1) & (df_feb['duration_min'] <= 60)].copy()

# Convert location IDs to strings so they are one-hot encoded, as in training
df_feb_filtered['PULocationID'] = df_feb_filtered['PULocationID'].astype(str)
df_feb_filtered['DOLocationID'] = df_feb_filtered['DOLocationID'].astype(str)

# Encode with the vectorizer fitted on the January data. Only transform() is
# called here (never fit), so the columns line up with what the model was
# trained on; location IDs not seen during fit are silently ignored.
data_dicts_feb = df_feb_filtered[['PULocationID', 'DOLocationID']].to_dict(orient='records')
X_feb = vectorizer.transform(data_dicts_feb)

# Target variable for validation dataset (duration in minutes)
y_feb = df_feb_filtered['duration_min']

# Use the already trained model to make predictions on the February data
y_feb_pred = model.predict(X_feb)

# Calculate RMSE on the February data
rmse_feb = np.sqrt(mean_squared_error(y_feb, y_feb_pred))
print("RMSE on the validation data (February 2024):", rmse_feb)


RMSE on the validation data (February 2023): 8.124011701161189
