In [1]:
# load packages
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

## Homework MLOps 1

In [2]:
def read_f(file:str) -> pd.DataFrame:
    """
    Reads a parquet file and adds a 'duration' column (in minutes).

    The duration is the difference between the 'dropOff_datetime' and
    'pickup_datetime' columns, so both must be present in the file as
    datetime columns.

    Arguments:
        file: path to the parquet file (string)
    Returns:
        pandas dataframe holding the file's columns plus 'duration'
        (float, minutes)
    """
    df = pd.read_parquet(file)
    # Vectorised conversion of the timedelta column: avoids invoking a Python
    # lambda once per row, which is slow on files with many records.
    elapsed = df['dropOff_datetime'] - df['pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    return df

In [3]:
# Load the 2021-01 trip file; read_f also adds the 'duration' column (minutes).
df_train = read_f('./data/fhv_tripdata_2021-01.parquet')

## Q1

In [4]:
# Q1: number of rows in the unfiltered January frame.
print("There are {} records in January.".format(len(df_train)))

There are 1154112 records in January.


## Q2

In [5]:
# Q2: mean of the 'duration' column (minutes), computed before any filtering.
print("The average trip duration is: {}.".format(df_train.duration.mean()))

The average trip duration is: 19.1672240937939.


In [6]:
# Keep only trips lasting between 1 and 60 minutes (both bounds inclusive).
df_train = df_train[df_train['duration'].between(1, 60)]

In [7]:
# (rows, columns) of the frame after the duration filter.
df_train.shape

(1109826, 8)

In [8]:
# Replace every missing value in the frame (all columns) with the sentinel -1.
df_train = df_train.fillna(value=-1)

In [9]:
# Store PUlocationID as strings; the filled-in sentinel is later looked up
# under the string key "-1.0".
df_train['PUlocationID'] = df_train['PUlocationID'].astype(str)

## Q3

In [10]:
# Q3: share of rows whose PUlocationID is the filled-in sentinel ("-1.0" after
# the fillna(-1) and astype(str) steps). Counting with a boolean mask instead
# of value_counts()["-1.0"] avoids a KeyError when no row is missing.
# Note: the printed value is a percentage (0-100) of the duration-filtered rows.
print("Fraction of missing values: {}.".format((df_train.PUlocationID == "-1.0").sum()*100/len(df_train)))

Fraction of missing values: 83.52732770722618.


In [11]:
# Narrow the frame to the two location columns and the target.
df_train = df_train.loc[:, ['PUlocationID', 'DOlocationID', 'duration']]

In [12]:
# Columns used as categorical model inputs; both are stored as strings.
categorical = ['PUlocationID', 'DOlocationID']
df_train = df_train.astype(dict.fromkeys(categorical, str))

In [13]:
# Confirm which columns remain after the selection.
df_train.columns

Index(['PUlocationID', 'DOlocationID', 'duration'], dtype='object')

In [14]:
# One {column: value} dict per row, restricted to the categorical columns.
train_dicts = df_train[categorical].to_dict(orient='records')

# Vectorizer that will turn those dicts into the feature matrix.
dv = DictVectorizer()

In [15]:
# Fit the vectorizer on the training dicts and encode them in a single step.
X_train = dv.fit_transform(train_dicts)

## Q4

In [16]:
# Q4: width (second dimension) of the encoded training matrix.
print("The number of columns is: {}.".format(X_train.shape[1]))

The number of columns is: 525.


In [17]:
# Regression target: the 'duration' column as a plain array.
target = 'duration'
y_train = df_train[target].to_numpy()

In [18]:
# Fit a linear regression on the encoded training data (fit returns the
# estimator itself), then predict on that same data.
lr = LinearRegression().fit(X_train, y_train)
y_pred = lr.predict(X_train)

## Q5

In [19]:
# Q5: root-mean-squared error on the training data. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form runs on both
# older and newer releases.
print("RSME on training data is: {}.".format(mean_squared_error(y_train, y_pred) ** 0.5))

RSME on training data is: 10.528519107206316.


In [20]:
# Build the 2021-02 validation set, mirroring the training preprocessing:
# load, keep trips of 1-60 minutes (inclusive), keep the model columns,
# fill missing values with -1 and cast the categorical columns to strings.
df_val = read_f('./data/fhv_tripdata_2021-02.parquet')
df_val = df_val.loc[
    df_val['duration'].between(1, 60),
    ['PUlocationID', 'DOlocationID', 'duration'],
].fillna(-1)
df_val = df_val.astype(dict.fromkeys(categorical, str))

# Encode with the vectorizer already fitted on the training data
# (transform only, no refit).
val_dicts = df_val[categorical].to_dict(orient='records')
X_val = dv.transform(val_dicts)
y_val = df_val[target].to_numpy()

# Note: this rebinds y_pred, replacing the training-set predictions.
y_pred = lr.predict(X_val)

## Q6

In [21]:
# Q6: root-mean-squared error on the validation data. The square root is taken
# explicitly because the `squared=False` argument of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6; this form runs on both
# older and newer releases.
print("RSME on validation data is: {}.".format(mean_squared_error(y_val, y_pred) ** 0.5))

RSME on validation data is: 11.014283149347039.
