In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression

import pickle

import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including pandas chained-assignment warnings and library
# deprecation notices. Consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

In [2]:
# January 2022 trips are used for fitting, February 2022 trips for evaluation.
train_path = '../data/yellow_tripdata_2022-01.parquet'
test_path = '../data/yellow_tripdata_2022-02.parquet'

train_df = pd.read_parquet(train_path)
test_df = pd.read_parquet(test_path)

In [3]:
def get_target(dataframe):
    """Return a copy of ``dataframe`` with a ``duration`` column added.

    ``duration`` is the time between ``tpep_pickup_datetime`` and
    ``tpep_dropoff_datetime`` expressed in minutes, rounded to two decimals.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain datetime columns ``tpep_pickup_datetime`` and
        ``tpep_dropoff_datetime``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched, so the function is safe to
        call on a filtered slice and safe to re-run on the same input.
    """
    elapsed = dataframe['tpep_dropoff_datetime'] - dataframe['tpep_pickup_datetime']
    minutes = (elapsed.dt.total_seconds() / 60).round(2)

    # `assign` builds a new frame instead of writing into the caller's object,
    # which avoids chained-assignment problems when `dataframe` is a slice.
    return dataframe.assign(duration=minutes)

In [None]:
# train_df = get_target(train_df)
# test_df = get_target(test_df)
# train_df['duration'].describe().round(2)
# train_df['duration'].describe(percentiles=[0.85, 0.95, 0.99]).round(2)

In [4]:
# Column groups handed to `clean_data`. Names use the raw parquet spelling;
# `clean_data` lower-cases them when it selects columns.
categorical_cols = [
    'PULocationID',
    'DOLocationID',
]
numerical_cols = [
    'trip_distance',
]
# Kept as a one-element list so it can be concatenated with the feature lists.
target = [
    'duration',
]

In [5]:
def clean_data(dataframe, cat_cols, num_cols, target, vendor_id=2, max_duration=50.85):
    """Filter raw trip records and keep only the modelling columns.

    Processing order:

    1. keep rows whose ``VendorID`` equals ``vendor_id``;
    2. add ``duration`` (minutes) through ``get_target``;
    3. keep rows with ``0 <= duration <= max_duration`` (both ends inclusive);
    4. lower-case all column names;
    5. select ``cat_cols + num_cols + target`` and cast the categorical
       columns to strings.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Raw trips containing ``VendorID`` and the columns ``get_target``
        reads. It is not modified.
    cat_cols, num_cols, target : list of str
        Column names in any letter case; all three lists are lower-cased so
        they match the renamed columns.
    vendor_id : int, default 2
        Value of ``VendorID`` to keep.
    max_duration : float, default 50.85
        Upper duration bound in minutes. The default is the previously
        hard-coded cutoff.
        NOTE(review): presumably taken from a percentile of the training
        durations -- confirm.

    Returns
    -------
    pandas.DataFrame
        The selected columns with a fresh 0..n-1 index.
    """
    cat_cols = [name.lower() for name in cat_cols]
    num_cols = [name.lower() for name in num_cols]
    # Lower-cased like the feature lists; otherwise a mixed-case target name
    # would not be found after the columns are renamed below.
    target = [name.lower() for name in target]

    # Explicit copy so nothing downstream writes into a view of the caller's frame.
    trips = dataframe.loc[dataframe['VendorID'] == vendor_id].copy()
    trips = get_target(trips)

    in_range = trips['duration'].between(0, max_duration)
    trips = trips.loc[in_range].rename(columns=str.lower)

    trips = trips[cat_cols + num_cols + target].astype(
        {name: 'str' for name in cat_cols}
    )

    return trips.reset_index(drop=True)
    

In [6]:
# Run both months through the same cleaning pipeline so train and test share
# identical filters and column layout.
train_df, test_df = [
    clean_data(raw, categorical_cols, numerical_cols, target)
    for raw in (train_df, test_df)
]

In [7]:
# Preview the first rows of the cleaned training frame.
train_df.head()

Unnamed: 0,pulocationid,dolocationid,trip_distance,duration
0,166,166,0.97,8.97
1,114,68,1.09,10.03
2,68,163,4.3,37.53
3,233,87,5.07,14.13
4,238,152,2.02,9.68


In [8]:
# Separate the regression target from the features. `pop` removes the column
# from the frame in place and returns it.
y_train = train_df.pop('duration').values
y_test = test_df.pop('duration').values

# From here on `train_df` / `test_df` hold one {column: value} dict per row
# rather than DataFrames.
train_df = train_df.to_dict(orient='records')
test_df = test_df.to_dict(orient='records')


In [None]:
# with open('../data/train_df_dict', 'wb') as f_out:
    
#     pickle.dump(train_df, f_out)
    
# with open('../data/test_df_dict', 'wb') as f_out:
    
#     pickle.dump(test_df, f_out)
    
# del train_df

# del test_df

In [None]:
# with open('../data/train_df_dict', 'rb') as f_in:
    
#     train_df = pickle.load(f_in)
    
# with open('../data/test_df_dict', 'rb') as f_in:
    
#     test_df = pickle.load(f_in)

In [9]:
# Learn the feature mapping from the training records only, then reuse the
# fitted vectorizer for the test records so both matrices share columns.
# NOTE(review): sparse=False materialises dense arrays; memory use grows with
# rows x distinct feature values.
dv = DictVectorizer(sparse=False)
X_train, X_test = dv.fit_transform(train_df), dv.transform(test_df)

In [None]:
# Fit an ordinary least-squares baseline on the training matrix.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Score on the held-out month: compare the true test durations with the
# model's predictions (not with the training targets).
preds = lr.predict(X_test)

# mean_squared_error returns the MSE; take the square root to report RMSE in
# the same unit as the target.
rmse = np.sqrt(mean_squared_error(y_test, preds))

print(f'RMSE: {rmse:.2f}')