In [1]:
# Load a small sample of the taxi-fare training set from the S3 URL below.
import pandas as pd
url = "s3://wagon-public-datasets/taxi-fare-train.csv"
# nrows=1000: only the first 1000 rows of the CSV are read.
df = pd.read_csv(url, nrows=1000)
df.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [2]:
# prepare X and y
# Select / drop instead of df.pop(): pop() removes the target from `df` in place,
# so re-running this cell raised a KeyError and any later use of `df`
# (e.g. saving it to disk) silently lost the fare_amount column.
y = df["fare_amount"]
X = df.drop("fare_amount", axis=1)
X.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [3]:
# Hold out
from sklearn.model_selection import train_test_split

# Keep 20% of the rows for testing. The seed is fixed so that the split, and
# every metric computed from it, is identical after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Peek at the training rows after the shuffle performed by train_test_split.
X_train.head()

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
593,2013-12-04 08:44:06.0000001,2013-12-04 08:44:06 UTC,-73.982061,40.775128,-73.974957,40.759715,1
667,2014-06-25 23:08:56.0000004,2014-06-25 23:08:56 UTC,-73.977865,40.784274,-73.962362,40.767125,2
588,2012-09-19 09:22:56.0000002,2012-09-19 09:22:56 UTC,-73.952827,40.772524,-74.000287,40.75857,1
510,2010-10-23 18:39:00.000000208,2010-10-23 18:39:00 UTC,-73.782215,40.644272,-73.782217,40.64427,2
966,2015-06-08 01:59:17.0000001,2015-06-08 01:59:17 UTC,0.0,0.0,0.0,0.0,1


In [5]:
# Save the 1k-row sample to a local CSV (the DataFrame index is written as the first column).
# NOTE(review): absolute, user-specific path — this cell fails on any other machine;
# consider a path relative to a configurable data directory.
# NOTE(review): `df` is written in whatever state earlier cells left it in — confirm it
# still holds every column you expect (e.g. the target).
df.to_csv(r'/home/louisedantas/code/LouiseDantas/taxi-fare/raw_data/train_1k.csv')

In [6]:
# Same source URL as above, larger sample: the first 10000 rows.
# NOTE(review): `df2` says nothing about its content; a stage-specific name would read better.
df2 = pd.read_csv(url, nrows=10000)

In [7]:
# Save the 10k-row sample to a local CSV (the DataFrame index is written as the first column).
# NOTE(review): absolute, user-specific path — not portable to another machine.
df2.to_csv(r'/home/louisedantas/code/LouiseDantas/taxi-fare/raw_data/train_10k.csv')

# Functions to compute the ride distance and the time and day of the ride

In [9]:
import numpy as np

def haversine_vectorized(df,
                         start_lat="pickup_latitude",
                         start_lon="pickup_longitude",
                         end_lat="dropoff_latitude",
                         end_lon="dropoff_longitude"):
    """
        Calculates the great circle distance between two points
        on the earth (specified in decimal degrees).
        Vectorized version of the haversine distance for pandas df.
        Computes the distance in kms.

        Parameters:
            df: DataFrame holding the four coordinate columns.
            start_lat, start_lon: names of the columns with the start point.
            end_lat, end_lon: names of the columns with the end point.

        Returns:
            One distance per row of df, in kilometres. Rows with a missing
            coordinate yield NaN.
    """
    earth_radius_km = 6371

    lat_1_rad, lon_1_rad = np.radians(df[start_lat].astype(float)), np.radians(df[start_lon].astype(float))
    lat_2_rad, lon_2_rad = np.radians(df[end_lat].astype(float)), np.radians(df[end_lon].astype(float))
    dlon = lon_2_rad - lon_1_rad
    dlat = lat_2_rad - lat_1_rad

    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat_1_rad) * np.cos(lat_2_rad) * np.sin(dlon / 2.0) ** 2
    # Mathematically a is in [0, 1], but floating-point rounding can push it a hair
    # above 1 for (near-)antipodal points, which would turn arcsin into NaN.
    a = a.clip(0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return earth_radius_km * c

In [10]:
# create a DistanceTransformer
from sklearn.base import BaseEstimator, TransformerMixin

class DistanceTransformer(BaseEstimator, TransformerMixin):
    """
        Stateless transformer that turns start/end GPS coordinates into a
        single haversine-distance feature.
        transform() returns a new DataFrame with exactly one column, 'distance',
        indexed like the input.
    """

    def __init__(self,
                 start_lat="pickup_latitude",
                 start_lon="pickup_longitude",
                 end_lat="dropoff_latitude",
                 end_lon="dropoff_longitude"):
        # Column names are stored as given; they are only read in transform().
        self.start_lat, self.start_lon = start_lat, start_lon
        self.end_lat, self.end_lon = end_lat, end_lon

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        coordinate_columns = {
            "start_lat": self.start_lat,
            "start_lon": self.start_lon,
            "end_lat": self.end_lat,
            "end_lon": self.end_lon,
        }
        # Work on a copy so the caller's frame never gains a 'distance' column.
        frame = X.copy()
        frame["distance"] = haversine_vectorized(frame, **coordinate_columns)
        return frame[["distance"]]

In [11]:
# create a TimeFeaturesEncoder
class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """
        Extracts the day of week (dow), the hour, the month and the year from a time column.
        Returns a DataFrame with only four columns: 'dow', 'hour', 'month', 'year'.

        The timestamps are parsed as UTC and converted to `time_zone_name`
        before the features are extracted, so the features are local-time values.
        Note that the returned frame is indexed by those localized timestamps,
        not by the index of the input X.

        Parameters:
            time_column: name of the column of X holding the timestamps.
            time_zone_name: time zone the timestamps are converted to.
    """

    def __init__(self, time_column, time_zone_name='America/New_York'):
        self.time_column = time_column
        self.time_zone_name = time_zone_name

    def fit(self, X, y=None):
        # Nothing to learn from the data.
        return self

    def transform(self, X, y=None):
        assert isinstance(X, pd.DataFrame)
        X_ = X.copy()
        # utc=True makes the parsed index tz-aware in every case: values carrying an
        # offset are converted to UTC and tz-naive values are interpreted as UTC.
        # Without it, tz-naive input made the tz_convert() call below raise.
        X_.index = pd.to_datetime(X[self.time_column], utc=True)
        X_.index = X_.index.tz_convert(self.time_zone_name)
        X_["dow"] = X_.index.weekday
        X_["hour"] = X_.index.hour
        X_["month"] = X_.index.month
        X_["year"] = X_.index.year
        return X_[['dow', 'hour', 'month', 'year']]

# Standardize the DataFrame: build the preprocessing pipelines

In [15]:
# visualizing pipelines in HTML
from sklearn import set_config

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# create distance pipeline:
# the four coordinate columns -> one 'distance' column -> standardized distance
dist_pipe = Pipeline([
    ('dist_trans', DistanceTransformer()),
    ('stdscaler', StandardScaler())
])

# display distance pipeline
dist_pipe

Pipeline(memory=None,
         steps=[('dist_trans',
                 DistanceTransformer(end_lat='dropoff_latitude',
                                     end_lon='dropoff_longitude',
                                     start_lat='pickup_latitude',
                                     start_lon='pickup_longitude')),
                ('stdscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True))],
         verbose=False)

In [18]:
from sklearn.preprocessing import OneHotEncoder

# create time pipeline:
# 'pickup_datetime' -> dow / hour / month / year -> one-hot encoded columns
# handle_unknown='ignore' is passed so values not seen during fit do not raise at predict time.
time_pipe = Pipeline([
    ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

# display time pipeline
time_pipe

Pipeline(memory=None,
         steps=[('time_enc',
                 TimeFeaturesEncoder(time_column='pickup_datetime',
                                     time_zone_name='America/New_York')),
                ('ohe',
                 OneHotEncoder(categories='auto', drop=None,
                               dtype=<class 'numpy.float64'>,
                               handle_unknown='ignore', sparse=True))],
         verbose=False)

In [19]:
from sklearn.compose import ColumnTransformer

# create preprocessing pipeline
# Each sub-pipeline only receives the columns listed next to it;
# remainder="drop" discards every column that is not listed here.
preproc_pipe = ColumnTransformer([
    ('distance', dist_pipe, ["pickup_latitude", "pickup_longitude", 'dropoff_latitude', 'dropoff_longitude']),
    ('time', time_pipe, ['pickup_datetime'])
], remainder="drop")

# display preprocessing pipeline
preproc_pipe

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('distance',
                                 Pipeline(memory=None,
                                          steps=[('dist_trans',
                                                  DistanceTransformer(end_lat='dropoff_latitude',
                                                                      end_lon='dropoff_longitude',
                                                                      start_lat='pickup_latitude',
                                                                      start_lon='pickup_longitude')),
                                                 ('stdscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 wi...
                                

In [20]:
from sklearn.linear_model import LinearRegression

# Add the model of your choice to the pipeline
# Preprocessing and estimator are chained so that fit/predict run both steps in one call.
pipe = Pipeline([
    ('preproc', preproc_pipe),
    ('linear_model', LinearRegression())
])

# display the pipeline with model
pipe

Pipeline(memory=None,
         steps=[('preproc',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('distance',
                                                  Pipeline(memory=None,
                                                           steps=[('dist_trans',
                                                                   DistanceTransformer(end_lat='dropoff_latitude',
                                                                                       end_lon='dropoff_longitude',
                                                                                       start_lat='pickup_latitude',
                                                                                       start_lon='pickup_longitude')),
                                                                  ('stdscaler',
     

# Training 

In [21]:
# train the pipelined model (preprocessing + linear regression) on the training split only
pipe.fit(X_train, y_train)

# compute y_pred on the test set
y_pred = pipe.predict(X_test)

In [22]:
def compute_rmse(y_pred, y_true):
    """Return the root mean squared error between predictions and true values.

    Both inputs may be any 1-D array-like (list, ndarray, pandas Series) of the
    same length. Values are compared by position: subtracting two pandas
    objects directly would first align them on their index labels, which gives
    a wrong error as soon as the two indexes differ.
    A NaN in either input propagates to the result.
    """
    errors = np.asarray(y_pred, dtype=float) - np.asarray(y_true, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

In [23]:
# RMSE of the predictions on the held-out test split
compute_rmse(y_pred, y_test)

8.076908114744029

# Refactor code into functions:

In [24]:
# implement get_data() function
def get_data(nrows=10000):
    '''Read the first `nrows` rows of the taxi-fare training CSV from S3 and return them as a DataFrame.'''
    return pd.read_csv("s3://wagon-public-datasets/taxi-fare-train.csv", nrows=nrows)

In [25]:
# implement clean_data() function
def clean_data(df, test=False):
    '''Return a copy of df without rows that have missing values or outliers.

    A row is kept only if:
      - it has no missing value in any column;
      - neither the pickup nor the dropoff point is exactly (0, 0);
      - fare_amount is within [0, 4000] (only checked when the column exists);
      - passenger_count is >= 0 and < 8;
      - pickup and dropoff both fall inside the same bounding box
        (latitude 40 to 42, longitude -74.3 to -72.9).

    The `test` flag is accepted for interface compatibility; it is not used.
    The input DataFrame is not modified.
    '''
    lat_min, lat_max = 40, 42
    # The same longitude range applies to both ends of the ride. The dropoff lower
    # bound used to be -74, which discarded valid dropoffs west of that meridian
    # even though pickups were accepted down to -74.3.
    lon_min, lon_max = -74.3, -72.9

    df = df.dropna(how='any')

    keep = (df["dropoff_latitude"] != 0) | (df["dropoff_longitude"] != 0)
    keep = keep & ((df["pickup_latitude"] != 0) | (df["pickup_longitude"] != 0))
    if "fare_amount" in df.columns:
        keep = keep & df["fare_amount"].between(0, 4000)
    keep = keep & (df["passenger_count"] < 8) & (df["passenger_count"] >= 0)
    for lat_col in ("pickup_latitude", "dropoff_latitude"):
        keep = keep & df[lat_col].between(left=lat_min, right=lat_max)
    for lon_col in ("pickup_longitude", "dropoff_longitude"):
        keep = keep & df[lon_col].between(left=lon_min, right=lon_max)
    return df[keep]

In [26]:
# implement set_pipeline() function
def set_pipeline():
    '''Build and return the unfitted model pipeline.

    Two preprocessing branches feed a linear regression:
      - 'distance': coordinate columns -> DistanceTransformer -> StandardScaler
      - 'time': 'pickup_datetime' -> TimeFeaturesEncoder -> OneHotEncoder
    Columns not listed for either branch are dropped.
    '''
    coordinate_columns = ["pickup_latitude", "pickup_longitude", 'dropoff_latitude', 'dropoff_longitude']

    distance_branch = Pipeline(steps=[
        ('dist_trans', DistanceTransformer()),
        ('stdscaler', StandardScaler()),
    ])
    time_branch = Pipeline(steps=[
        ('time_enc', TimeFeaturesEncoder('pickup_datetime')),
        ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ])
    features = ColumnTransformer(
        transformers=[
            ('distance', distance_branch, coordinate_columns),
            ('time', time_branch, ['pickup_datetime']),
        ],
        remainder="drop",
    )
    return Pipeline(steps=[
        ('preproc', features),
        ('linear_model', LinearRegression()),
    ])

In [27]:
# implement train() function
def train(X_train, y_train, pipeline):
    '''Fit `pipeline` on the training data and return that same pipeline object.'''
    model = pipeline
    # The return value of fit() is ignored; the caller gets back the object it passed in.
    model.fit(X_train, y_train)
    return model

In [28]:
# implement evaluate() function
def evaluate(X_test, y_test, pipeline):
    '''Predict on X_test with the fitted pipeline, print the RMSE against y_test and return it.'''
    rmse = compute_rmse(pipeline.predict(X_test), y_test)
    print(rmse)
    return rmse

# Testing

In [29]:
# store the data in a DataFrame
df = get_data()

# set X and y
y = df["fare_amount"]
X = df.drop("fare_amount", axis=1)

# hold out (seeded so the validation split and the reported RMSE are reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=42)

# build pipeline
pipeline = set_pipeline()

# train the pipeline
train(X_train, y_train, pipeline)

# evaluate the pipeline
rmse = evaluate(X_val, y_val, pipeline)

8.567329162783583
