In [1]:
# Libraries
import numpy as np
import pandas as pd
import sklearn

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Record the library versions this notebook was executed with
for lib in (pd, sklearn):
    print(lib.__name__, lib.__version__)

pandas 1.4.2
sklearn 1.0.2


In [3]:
# January 2023 yellow-taxi trip records, read straight from the remote parquet file
JAN_TRIPS_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet"
taxi_df = pd.read_parquet(JAN_TRIPS_URL)
print("Dataset:", taxi_df.shape)

Dataset: (3066766, 19)


#### 1-Read the data for January. How many columns are there?

In [4]:
# Number of columns in the raw January frame
print("Columns:", len(taxi_df.columns))

Columns: 19


#### 2-What's the standard deviation of the trip durations in January?

In [5]:
# Show the column names to locate the pick-up and drop-off timestamp columns
taxi_df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'RatecodeID', 'store_and_fwd_flag',
       'PULocationID', 'DOLocationID', 'payment_type', 'fare_amount', 'extra',
       'mta_tax', 'tip_amount', 'tolls_amount', 'improvement_surcharge',
       'total_amount', 'congestion_surcharge', 'airport_fee'],
      dtype='object')

In [6]:
# Create duration variable: trip length in minutes (drop-off time minus pick-up time)
trip_length = taxi_df["tpep_dropoff_datetime"] - taxi_df["tpep_pickup_datetime"]
taxi_df["duration"] = trip_length.dt.total_seconds() / 60
print("Standard derivation:", taxi_df["duration"].std())

Standard derivation: 42.594351241920904


#### 3-What fraction of the records is left after you drop the outliers?

In [7]:
# Drop outliers: keep only trips lasting from 1 to 60 minutes (both bounds included).
# `drop=True` discards the old row labels instead of inserting them as a stray
# "index" column, matching how the February data is cleaned further down.
taxi_clean_df = taxi_df.loc[
    (taxi_df["duration"] >= 1) & (taxi_df["duration"] <= 60)
].copy().reset_index(drop=True)
# Share of the original rows that survive the filter
print("Proportion remain:", taxi_clean_df.shape[0]/taxi_df.shape[0])

Proportion remain: 0.9812202822125979


#### 4-One-hot encoding

In [8]:
# Cast both location ID columns to strings
for location_col in ("PULocationID", "DOLocationID"):
    taxi_clean_df[location_col] = taxi_clean_df[location_col].astype(str)

# Build one dict per trip holding just the pick-up and drop-off location IDs
train_dicts = taxi_clean_df[["PULocationID", "DOLocationID"]].to_dict(orient="records")

In [9]:
# Apply one-hot encoding
# The vectorizer is fitted on the training records here; the same fitted `dv`
# is reused later to transform the February records.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)

In [10]:
# Shape of the encoded training matrix: (rows, columns)
x_train.shape

(3009173, 515)

#### 5-Linear regression

In [11]:
# Regression target: trip duration (minutes) of the cleaned January trips, as a NumPy array
y_train = taxi_clean_df.loc[:, "duration"].to_numpy()

In [12]:
# Train model
# Fit a linear regression with default settings on the encoded features (x_train)
# against the trip durations (y_train)
model = LinearRegression()
model.fit(x_train, y_train)

LinearRegression()

In [13]:
# Get predictions
# Predicts on the same matrix the model was fitted on, so any error computed
# from y_pred is a training error
y_pred = model.predict(x_train)

In [14]:
# Training RMSE. The square root is taken explicitly instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, while this form works on every version and yields the same value.
np.sqrt(mean_squared_error(y_train, y_pred))

7.649261027826866

#### 6-Check February results

In [15]:
# February 2023 yellow-taxi trip records, used as the validation set
FEB_TRIPS_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-02.parquet"
taxi_val_df = pd.read_parquet(FEB_TRIPS_URL)
print("Dataset:", taxi_val_df.shape)

Dataset: (2913955, 19)


In [16]:
# Create duration variable: trip length in minutes (drop-off time minus pick-up time)
val_trip_length = taxi_val_df["tpep_dropoff_datetime"] - taxi_val_df["tpep_pickup_datetime"]
taxi_val_df["duration"] = val_trip_length.dt.total_seconds() / 60

# Clean outliers: keep only trips lasting from 1 to 60 minutes (both bounds included)
is_typical_trip = (taxi_val_df["duration"] >= 1) & (taxi_val_df["duration"] <= 60)
taxi_val_clean_df = taxi_val_df.loc[is_typical_trip].copy().reset_index(drop=True)

# Cast the location IDs to strings, then build one feature dict per trip
for location_col in ("PULocationID", "DOLocationID"):
    taxi_val_clean_df[location_col] = taxi_val_clean_df[location_col].astype(str)
val_dicts = taxi_val_clean_df[["PULocationID", "DOLocationID"]].to_dict(orient="records")

# Apply one-hot encoding with the vectorizer already fitted on the training data
# (transform only, no refit)
x_val = dv.transform(val_dicts)

In [18]:
# Predictions of the January-trained model on the encoded February features
y_val_pred = model.predict(x_val)
# Ground truth: observed trip durations (minutes) of the cleaned February trips
y_val = taxi_val_clean_df["duration"].to_numpy()

In [19]:
# Measure performance
# Validation RMSE. The square root is taken explicitly instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, while this form works on every version and yields the same value.
np.sqrt(mean_squared_error(y_val, y_val_pred))

7.811832641626525