In [1]:
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_parquet("data/fhv_tripdata_2021-01.parquet")

In [3]:
print(f"Number of records: {len(df)}")

Number of records: 1154112


In [4]:
df["duration"] = df.dropOff_datetime - df.pickup_datetime
df["duration"] = df.duration.dt.total_seconds() / 60

In [5]:
print(f"Average trip duration in January: {df['duration'].mean()}")

Average trip duration in January: 19.1672240937939


In [6]:
print(f"Number of records to drop: {((df.duration >= 1) & (df.duration <= 60)).sum()}")

Number of records to drop: 1109826


In [7]:
df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

In [8]:
categorical = ["PUlocationID", "DOlocationID"]

df[categorical] = df[categorical].fillna(-1).astype("int")

In [9]:
print(f"Fractions of missing values for the pickup location ID: {(df['PUlocationID'] == -1).mean()}")

Fractions of missing values for the pickup location ID: 0.8352732770722617


In [10]:
df[categorical] = df[categorical].astype("str")

In [11]:
train_dicts = df[categorical].to_dict(orient="records")

In [12]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts) 

In [13]:
X_train.shape

(1109826, 525)

In [14]:
y_train = df.duration.values

In [15]:
print(f"Dimensionality of matrix: {len(dv.feature_names_)}")

Dimensionality of matrix: 525


In [16]:
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [17]:
y_pred = lr.predict(X_train)

In [18]:
print(f"RMSE on train: {mean_squared_error(y_train, y_pred, squared=False)}")

RMSE on train: 10.528519107213992


In [19]:
categorical = ["PUlocationID", "DOlocationID"]

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df["duration"] = df.dropOff_datetime - df.pickup_datetime
    df["duration"] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype("int").astype("str")
    
    return df

In [20]:
df_val = read_data("data/fhv_tripdata_2021-02.parquet")

In [21]:
val_dicts = df_val[categorical].to_dict(orient="records")

In [22]:
X_val = dv.transform(val_dicts) 

In [23]:
y_pred = lr.predict(X_val)

In [24]:
y_val = df_val.duration.values

In [25]:
print(f"RMSE on valid: {mean_squared_error(y_val, y_pred, squared=False)}")

RMSE on valid: 11.014283216489057
