# Import libraries

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# With one-hot encoding (`OneHotEncoder`)

In [4]:
def read_transform_data(data, col_cat, col_num, target):
    """Load a green-taxi trip parquet file and return a model-ready frame.

    The trip duration in minutes is derived from the `lpep_pickup_datetime`
    and `lpep_dropoff_datetime` columns, trips shorter than 1 minute or longer
    than 60 minutes are discarded, and only the requested columns are kept.

    Parameters
    ----------
    data : str or path-like
        Location of the parquet file, passed straight to `pd.read_parquet`.
    col_cat : list of str
        Categorical feature columns; returned as strings.
    col_num : list of str
        Numerical feature columns; returned unchanged.
    target : list of str
        Target column(s), e.g. `['duration']`.

    Returns
    -------
    pandas.DataFrame
        Columns `col_cat + col_num + target`, without missing values and with
        a fresh 0..n-1 index.
    """
    df = pd.read_parquet(data)
    print(f'The size of the dataset is: {len(df)}')

    # Convert to datetime
    df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
    df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])

    # Create duration column, expressed in minutes
    trip_length = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_length.dt.total_seconds() / 60

    # Keep trips lasting between 1 and 60 minutes (both bounds inclusive)
    valid_trip = df['duration'].between(1, 60)

    df = (
        df.loc[valid_trip, col_cat + col_num + target]
        # Drop nulls BEFORE casting: astype(str) would turn a missing category
        # into the literal string 'nan', which dropna() can no longer detect.
        .dropna()
        # astype returns a new frame, so nothing is assigned into a filtered
        # slice (that assignment is what raises SettingWithCopyWarning).
        .astype({col: str for col in col_cat})
        # The original called reset_index() and discarded the result; actually
        # apply it, and drop the old index instead of adding it as a column.
        .reset_index(drop=True)
    )
    return df

In [5]:
# Column roles shared by every cell below.
col_cat = ['PULocationID', 'DOLocationID']
col_num = ['trip_distance']
target = ['duration']

# January 2021 feeds df_train, February 2021 feeds df_test.
df_train = read_transform_data(
    data='../data/green_tripdata_2021-01.parquet',
    col_cat=col_cat,
    col_num=col_num,
    target=target,
)
df_test = read_transform_data(
    data='../data/green_tripdata_2021-02.parquet',
    col_cat=col_cat,
    col_num=col_num,
    target=target,
)


The size of the dataset is: 76518
The size of the dataset is: 64572


In [6]:
# Count missing values per column of the training frame (expected: all zeros after dropna)
print(df_train.isna().sum())

PULocationID     0
DOLocationID     0
trip_distance    0
duration         0
dtype: int64


In [7]:
# One hot encoding
# A dense array is requested because the encoded matrix is wrapped in a
# pandas DataFrame further down; categories unseen during fit are encoded as
# all-zero rows instead of raising.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the current name first and fall back for older releases.
try:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [8]:
# NOTE: this cell overwrites df_train / df_test. Re-running it without first
# re-running the loading cell fails, because the categorical columns are gone.

# Fit the encoder on the training data only, then encode both data sets with it
enc_features_train = enc.fit_transform(df_train[col_cat])
enc_features_test = enc.transform(df_test[col_cat])
ohe_columns = enc.get_feature_names_out(col_cat)

# Wrap the encoded arrays in DataFrames that reuse the source index, so the
# column-wise concat below lines the rows up correctly
df_train_ohe = pd.DataFrame(enc_features_train, index=df_train.index, columns=ohe_columns)
df_test_ohe = pd.DataFrame(enc_features_test, index=df_test.index, columns=ohe_columns)

# Replace the raw categorical columns with their one-hot encoded counterparts
df_train = pd.concat([df_train.drop(columns=col_cat), df_train_ohe], axis=1)
df_test = pd.concat([df_test.drop(columns=col_cat), df_test_ohe], axis=1)

# Check whether any null values appeared after the transformation
print(df_train.isna().sum())



trip_distance       0
duration            0
PULocationID_10     0
PULocationID_100    0
PULocationID_101    0
                   ..
DOLocationID_94     0
DOLocationID_95     0
DOLocationID_96     0
DOLocationID_97     0
DOLocationID_98     0
Length: 508, dtype: int64


In [9]:
# Separate the feature matrix from the target column(s) for each data set
X_train = df_train.drop(target, axis=1)
y_train = df_train[target]

X_test = df_test.drop(target, axis=1)
y_test = df_test[target]


In [10]:
# Linear regression with default settings, fitted on the one-hot encoded features below
lr = LinearRegression()

In [11]:
# Fit on the training frame only; the test frame is used purely for evaluation
lr.fit(X_train, y_train)

y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE in train: {rmse_train}')
print(f'RMSE in test: {rmse_test}')

RMSE in train: 9.775258694628427
RMSE in test: 1470183.2562417628


# With `DictVectorizer`

In [12]:
# Reload both months from disk: the one-hot section replaced the categorical
# columns of df_train / df_test, and this section needs them back.
train_path = '../data/green_tripdata_2021-01.parquet'
test_path = '../data/green_tripdata_2021-02.parquet'

df_train = read_transform_data(train_path, col_cat, col_num, target)
df_test = read_transform_data(test_path, col_cat, col_num, target)

The size of the dataset is: 76518
The size of the dataset is: 64572


In [13]:
# Turn every row into a {column: value} record for both data sets
feature_cols = col_cat + col_num
train_dicts = df_train[feature_cols].to_dict(orient='records')
test_dicts = df_test[feature_cols].to_dict(orient='records')

# Fit the vectorizer on the training records only, then reuse it for the test records
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)
X_test = dv.transform(test_dicts)

In [14]:
# Target values as plain NumPy arrays (one column per entry in `target`)
y_train = df_train[target].to_numpy()
y_test = df_test[target].to_numpy()

In [15]:
# Second linear regression, fitted on the DictVectorizer features
lr2 = LinearRegression()
lr2.fit(X_train, y_train)

y_pred_train = lr2.predict(X_train)
y_pred_test = lr2.predict(X_test)

# RMSE = sqrt(MSE). The square root is taken explicitly because the
# `squared=False` keyword of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse_train_2 = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test_2 = np.sqrt(mean_squared_error(y_test, y_pred_test))

print(f'RMSE in train: {rmse_train_2}')
print(f'RMSE in test: {rmse_test_2}')

RMSE in train: 9.838799799829626
RMSE in test: 10.499110710360293


In [16]:
# Side-by-side summary of both encodings; one line per entry, same order as computed above
print(
    'OHE',
    f'RMSE in train ohe: {rmse_train}',
    f'RMSE in test ohe: {rmse_test}',
    'VECT',
    f'RMSE in train vect: {rmse_train_2}',
    f'RMSE in test vect: {rmse_test_2}',
    sep='\n',
)

OHE
RMSE in train ohe: 9.775258694628427
RMSE in test ohe: 1470183.2562417628
VECT
RMSE in train vect: 9.838799799829626
RMSE in test vect: 10.499110710360293
