In [1]:
!python -V

Python 3.9.7


In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge

from sklearn.metrics import mean_squared_error

In [3]:
data_folder = Path.cwd() / 'data'
model_folder = Path.cwd() / 'models'
model_folder.mkdir(exist_ok=True)

In [56]:
df = pd.read_parquet(data_folder / 'fhv_tripdata_2021-01.parquet')

Q1

In [60]:
print(len(df))

1154112


In [64]:
print(df.dtypes)

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                          float64
Affiliated_base_number            object
duration                         float64
dtype: object


Q2

In [65]:
df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).map(lambda x: x.total_seconds() / 60)

In [66]:
print(df.duration.describe())
print("\nMean:")
print(f"{df.duration.mean():.3f} minutes")

count    1.154112e+06
mean     1.916722e+01
std      3.986922e+02
min      1.666667e-02
25%      7.766667e+00
50%      1.340000e+01
75%      2.228333e+01
max      4.233710e+05
Name: duration, dtype: float64

Mean:
19.167 minutes


In [67]:
cond = (df.duration >= 1) & (df.duration <= 60)
dropped = len(cond) - cond.sum()
print(f"Dropped: {dropped} records")
assert dropped + cond.sum() == len(df)

print(f"Before records: {len(df):_}")
df = df[cond]
print(f"After records: {len(df):_}")

Dropped: 44286 records
Before records: 1_154_112
After records: 1_109_826


Q3

In [30]:
for col in df.columns:
    print(col)

dispatching_base_num
pickup_datetime
dropOff_datetime
PUlocationID
DOlocationID
SR_Flag
Affiliated_base_number
duration


In [68]:
locations = ['PUlocationID', 'DOlocationID']
df[locations] = df[locations].fillna('-1')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[locations] = df[locations].fillna('-1')


In [69]:
missing = (df['PUlocationID'] == '-1').sum() / len(df)
print(f"Missing PU locations: {missing:.3%}\n")
print(df['PUlocationID'].value_counts(normalize=1).head() * 100)

Missing PU locations: 83.527%

-1       83.527328
221.0     0.750568
206.0     0.612438
129.0     0.484671
115.0     0.367805
Name: PUlocationID, dtype: float64


Q4

In [73]:
df[locations] = df[locations].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[locations] = df[locations].astype(str)


In [75]:
train_dicts = df[locations].to_dict(orient='records')

dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [101]:
print(X_train.shape[-1])

525


Q5

In [81]:
target = 'duration'
y_train = df[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mse = mean_squared_error(y_train, y_pred, squared=False)

In [82]:
print(f"MSE: {mse:.3f}")

MSE: 10.529


Q6

In [95]:
LOCATIONS = ['PUlocationID', 'DOlocationID']

def read_process_data(path, categorical=None):
    if categorical is None:
        categorical = LOCATIONS
    df = pd.read_parquet(path)
    df['duration'] = (df['dropOff_datetime'] - df['pickup_datetime']).map(lambda x: x.total_seconds() / 60)
    cond = (df.duration >= 1) & (df.duration <= 60)
    df = df[cond]
    df[categorical] = df[categorical].fillna('-1')
    df[categorical] = df[categorical].astype(str)
    return df

In [96]:
def build_features(train_data, val_data, target='duration', features=None):
    if features is None:
        features = LOCATIONS
    train_dicts = train_data[features].to_dict(orient='records')
    val_dicts = val_data[features].to_dict(orient='records')
    dv = DictVectorizer()
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    return X_train, train_data[target].values, X_val, val_data[target].values

In [97]:
train_path = data_folder / 'fhv_tripdata_2021-01.parquet'
val_path = data_folder / 'fhv_tripdata_2021-02.parquet'

train_data = read_process_data(train_path)
val_data = read_process_data(val_path)

In [98]:
X_train, y_train, X_val, y_val = build_features(train_data, val_data)

In [99]:
lr = LinearRegression()
lr.fit(X_train, y_train)

y_train_pred = lr.predict(X_train)
y_pred = lr.predict(X_val)

train_mse = mean_squared_error(y_train, y_train_pred, squared=False)
val_mse = mean_squared_error(y_val, y_pred, squared=False)

In [102]:
print(f"Train MSE: {train_mse:.3f}\nValidation MSE: {val_mse:.3f}")

Train MSE: 10.529
Validation MSE: 11.014
