In [1]:
# !pip install pyarrow

In [47]:
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [70]:
def import_dataset(filename):
    """Load one month of trip records and prepare them for modelling.

    Steps, in order:

    1. Read the parquet file at ``filename``.
    2. Add a ``duration`` column: ``dropOff_datetime - pickup_datetime``
       expressed in minutes.
    3. Print the mean duration over *all* rows (before any filtering).
    4. Keep only rows whose duration is between 1 and 60 minutes inclusive.
    5. Replace missing ``PUlocationID`` / ``DOlocationID`` values with ``-1``.
    6. Print the percentage of the remaining rows whose ``PUlocationID``
       was missing (i.e. is now ``-1``).
    7. Cast the two location ID columns to ``str`` so that a downstream
       ``DictVectorizer`` one-hot encodes them instead of treating them as
       continuous numbers.

    Parameters
    ----------
    filename : str or path-like
        Passed straight to ``pd.read_parquet``.

    Returns
    -------
    pandas.DataFrame
        The filtered frame with the extra ``duration`` column and
        string-typed location ID columns.
    """
    df = pd.read_parquet(filename)

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    df['duration'] = (df.dropOff_datetime - df.pickup_datetime).dt.total_seconds() / 60
    print(f"mean duration: {df.duration.mean()}")

    df = df[(df.duration >= 1) & (df.duration <= 60)]

    impute_values = {
        'PUlocationID': -1,
        'DOlocationID': -1,
    }
    df = df.fillna(value=impute_values)

    # Share of rows (after filtering) whose pickup location had to be imputed.
    # Guard against an empty frame so the function does not raise
    # ZeroDivisionError when the duration filter removes every row.
    n_rows = df.shape[0]
    n_missing = df[df.PUlocationID == -1].shape[0]
    missing_values_percentage = n_missing / n_rows * 100 if n_rows else float('nan')
    print(f"% of missing values: {missing_values_percentage}")

    # `astype` returns a new frame; the original code discarded that result,
    # leaving the ID columns numeric. Return the converted frame instead.
    categorical_cols = ['PUlocationID', 'DOlocationID']
    return df.astype({col: str for col in categorical_cols})

In [71]:
# January is the training month, February the validation month.
# The tuple is evaluated left to right, so the summary lines printed by
# import_dataset appear for the training file first, then the validation file.
train_df, val_df = (
    import_dataset('./data/fhv_tripdata_2021-01.parquet'),
    import_dataset('./data/fhv_tripdata_2021-02.parquet'),
)
# One shape per line: training first, validation second.
print(train_df.shape, val_df.shape, sep='\n')

mean duration: 19.1672240937939
% of missing values: 83.52732770722618
mean duration: 20.70698622520125
% of missing values: 85.71354986754038
(1109826, 8)
(990113, 8)


In [65]:
categorical_cols = ['PUlocationID', 'DOlocationID']

# DictVectorizer only one-hot encodes *string* values; numeric values are
# passed through as a single continuous feature per key. Cast the location
# IDs to str right here so the encoding does not depend on the dtype the
# frames happen to arrive with (the cast is a no-op if they are already str).
train_dicts = train_df[categorical_cols].astype(str).to_dict(orient='records')
val_dicts = val_df[categorical_cols].astype(str).to_dict(orient='records')

# Fit the vocabulary on the training month only; the validation month is
# transformed with that same vocabulary.
dv = DictVectorizer()
x_train = dv.fit_transform(train_dicts)
x_val = dv.transform(val_dicts)

In [None]:
# Fit a plain linear regression on the training features and report the
# root-mean-squared error on the validation set.
target_col = 'duration'
y_train = train_df[target_col]
y_val = val_df[target_col]

lr = LinearRegression()
lr.fit(x_train, y_train)

y_pred = lr.predict(x_val)

# RMSE = sqrt(MSE). The `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly
# gives the same number on every version.
mean_squared_error(y_val, y_pred) ** 0.5