In [1]:
import pandas as pd # for data analysis
import numpy as np # for scientific calculation
import seaborn as sns # for statistical plotting
import datetime # for working with date fields
import matplotlib.pyplot as plt # for plotting
%matplotlib inline
import math # for mathematical calculation

In [2]:
# Load the raw trip records from the CSV under ./data (per the file name,
# presumably the March 2016 yellow-taxi extract -- confirm the provenance).
nydata = pd.read_csv('./data/yellow_tripdata_2016-03.csv')

In [3]:

# Work on a random 100-row subset of the loaded trips. random_state pins the
# draw so that Restart Kernel -> Run All reproduces the same rows (and therefore
# the same downstream outputs) instead of a fresh sample on every run.
nydata = nydata.sample(n=100, random_state=42)
from math import radians, cos, sin, asin, sqrt


def distance(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : float
        Longitude and latitude of the first point, in decimal degrees.
    lon2, lat2 : float
        Longitude and latitude of the second point, in decimal degrees.
    radius : float, optional
        Sphere radius; the result is expressed in the same unit.
        Defaults to 6371 (kilometers). Use 3956 for miles.

    Returns
    -------
    float
        Haversine distance between the two points. NaN inputs propagate
        to a NaN result.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2

    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin() raise a math domain error. Written as a comparison
    # rather than min() so that a NaN `a` is left untouched and still
    # propagates as NaN.
    if a > 1.0:
        a = 1.0

    c = 2 * asin(sqrt(a))
    return c * radius


# Parse both trip timestamps into pandas datetimes (pickup first, then drop-off).
for ts_col in ("tpep_pickup_datetime", "tpep_dropoff_datetime"):
    nydata[ts_col] = pd.to_datetime(nydata[ts_col])

# Trip duration in seconds: drop-off time minus pickup time.
trip_elapsed = nydata["tpep_dropoff_datetime"] - nydata["tpep_pickup_datetime"]
nydata["duration"] = trip_elapsed.dt.total_seconds()


def _row_distance(row):
    """Return the great-circle distance between one trip's pickup and drop-off."""
    return distance(
        row["pickup_longitude"],
        row["pickup_latitude"],
        row["dropoff_longitude"],
        row["dropoff_latitude"],
    )


# Applied row by row because `distance` operates on scalar coordinates.
nydata["distance"] = nydata.apply(_row_distance, axis=1)

# Average speed as distance per hour; duration is in seconds, hence the / 3600.
nydata["speed"] = nydata["distance"] / (nydata["duration"] / 3600)


# Columns not carried forward: the vendor id, the store-and-forward flag, and
# the four raw coordinate columns. `drop` returns a new frame, so `nydata`
# itself keeps every column.
dropped_raw_columns = [
    "VendorID",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "store_and_fwd_flag",
]
nyc_taxi_final = nydata.drop(columns=dropped_raw_columns)


# Calendar features derived from the two trip timestamps.
#
# The vectorised `.dt` accessor replaces the per-element `.apply(lambda ...)`
# calls: it matches the `.dt.day_name()` / `.dt.month_name()` calls below,
# avoids a Python-level call per row, and yields NaN for a missing timestamp
# instead of raising inside `int(...)`.
#
# Columns are assigned in the same order as before, because the positional
# order of these columns carries through to the feature matrix built later.
pickup_ts = nyc_taxi_final["tpep_pickup_datetime"]
dropoff_ts = nyc_taxi_final["tpep_dropoff_datetime"]

# Pickup-side features.
nyc_taxi_final["pickup_min"] = pickup_ts.dt.minute
nyc_taxi_final["pickup_hour"] = pickup_ts.dt.hour
nyc_taxi_final["pickup_day"] = pickup_ts.dt.day  # day of month
nyc_taxi_final["pickup_month"] = pickup_ts.dt.month
nyc_taxi_final["pickup_weekday"] = pickup_ts.dt.day_name()
nyc_taxi_final["pickup_month_name"] = pickup_ts.dt.month_name()

# Drop-off-side features.
nyc_taxi_final["drop_hour"] = dropoff_ts.dt.hour
nyc_taxi_final["drop_month"] = dropoff_ts.dt.month
nyc_taxi_final["drop_day"] = dropoff_ts.dt.day  # day of month
nyc_taxi_final["drop_min"] = dropoff_ts.dt.minute

# Outlier removal. Each rule selects the offending rows into `df` and then
# drops them from `nyc_taxi_final` in place, one rule after another.
# Units follow from how the columns were built: `duration` is in seconds,
# `distance` in kilometers and `speed` in kilometers per hour.

# Rule 1: trips with zero distance and a speed below 1.
not_moving = nyc_taxi_final["speed"].lt(1) & nyc_taxi_final["distance"].eq(0)
df = nyc_taxi_final.loc[not_moving]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Rule 2: trips whose drop-off day-of-month is later than the pickup day,
# lasting more than 10000 s, covering under 5 km, and picked up before 23:00.
# NOTE(review): the day-of-month comparison does not catch a trip that rolls
# over a month boundary -- confirm whether that matters for this data.
overnight_short = (
    nyc_taxi_final["pickup_day"].lt(nyc_taxi_final["drop_day"])
    & nyc_taxi_final["duration"].gt(10000)
    & nyc_taxi_final["distance"].lt(5)
    & nyc_taxi_final["pickup_hour"].lt(23)
)
df = nyc_taxi_final.loc[overnight_short]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Rule 3: trips slower than 1 km/h that also covered less than 1 km.
crawling = nyc_taxi_final["speed"].lt(1) & nyc_taxi_final["distance"].lt(1)
df = nyc_taxi_final.loc[crawling]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Inspection leftovers: trips longer than 10000 minutes. These are bare
# expressions in the middle of the cell, so their results are neither stored
# nor displayed.
very_long = (nyc_taxi_final["duration"] / 60).gt(10000)
nyc_taxi_final.loc[very_long, ["duration", "distance"]]
nyc_taxi_final.loc[very_long, "duration"]

# Rule 4: trips shorter than 0.2 km.
too_short = nyc_taxi_final["distance"].lt(0.2)
df = nyc_taxi_final.loc[too_short]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Rule 5: trips recorded with no passengers.
no_passengers = nyc_taxi_final["passenger_count"].eq(0)
df = nyc_taxi_final.loc[no_passengers]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Rule 6: trips lasting less than 120 seconds.
too_brief = nyc_taxi_final["duration"].lt(120)
df = nyc_taxi_final.loc[too_brief]
nyc_taxi_final.drop(index=df.index, inplace=True)

# Rule 7: trips averaging more than 50 km/h. `df` ends up as the `speed`
# Series of those rows.
too_fast = nyc_taxi_final["speed"].gt(50)
df = nyc_taxi_final.loc[too_fast, "speed"]
nyc_taxi_final.drop(index=df.index, inplace=True)



In [7]:
# Import Sklearn and models
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, KFold


In [12]:
# Build the feature frame by removing everything that is not a model input:
# the raw timestamps, the metered trip distance, all monetary columns, the
# derived duration and speed, and the two text label columns.
# NOTE(review): presumably `duration` is the prediction target -- confirm
# against the notebook that trained the saved model.
non_feature_columns = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "duration",
    "speed",
    "pickup_weekday",
    "pickup_month_name",
]
X2 = nyc_taxi_final.drop(columns=non_feature_columns)

X2.head()

Unnamed: 0,passenger_count,RatecodeID,payment_type,distance,pickup_min,pickup_hour,pickup_day,pickup_month,drop_hour,drop_month,drop_day,drop_min
11513406,1,1,1,5.116023,14,9,30,3,9,3,30,38
5847145,2,1,2,1.033144,6,21,18,3,21,3,18,16
9521039,2,1,1,6.156661,17,20,13,3,20,3,13,29
7070685,2,2,2,20.851836,6,17,21,3,17,3,21,47
10115451,1,1,2,0.381908,21,12,26,3,12,3,26,25


In [None]:
# Truncate the computed distance to a whole number (fraction discarded).
X2["distance"] = X2["distance"].astype("int64")

# Standardise the feature columns into a plain array for the model.
# NOTE(review): `preprocessing.scale` derives its statistics from this sample
# alone -- confirm that this matches how the features were scaled when the
# loaded model was trained.
X1 = preprocessing.scale(X2)

array([[-0.46128524, -0.16795701, -0.71842121, ...,  0.        ,
         1.66142419,  0.53867846],
       [ 0.29943077, -0.16795701,  1.39194109, ...,  0.        ,
         0.33978351, -0.75364991],
       [ 0.29943077, -0.16795701, -0.71842121, ...,  0.        ,
        -0.21090011,  0.00999867],
       ...,
       [-0.46128524, -0.16795701, -0.71842121, ...,  0.        ,
        -1.53254079, -0.98861871],
       [ 0.29943077, -0.16795701, -0.71842121, ...,  0.        ,
        -1.20213062,  0.42119406],
       [ 3.34229481, -0.16795701, -0.71842121, ...,  0.        ,
        -0.76158373, -0.28371232]], shape=(94, 12))

In [None]:
import joblib

# Load the previously saved model and predict on the scaled features.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load a model file from a trusted source.
model_path = './nyc_taxi_model.pkl'
reg = joblib.load(model_path)
Y1 = reg.predict(X1)

In [17]:
# Show the prediction and feature-matrix shapes side by side; the row counts
# should agree (one prediction per feature row).
Y1.shape, X1.shape

((94,), (94, 12))

In [16]:
# Display the raw predictions returned by reg.predict(X1).
# NOTE(review): the displayed output contains negative values -- confirm
# whether that is plausible for the quantity the model predicts.
Y1

array([ 3.21649785e+03,  1.29916889e+03,  7.20423002e+02,  3.41757828e+03,
        1.60107123e+03, -1.50354831e+03,  2.56995601e+02,  1.55916067e+03,
       -5.28956211e+02, -5.17919599e+02, -6.18953810e+02, -1.89897954e+02,
        1.19008541e+03,  4.37792061e+03,  3.47599981e+03,  3.36716930e+03,
        1.36826381e+03,  2.25133153e+03,  1.70475579e+03,  1.94699294e+03,
        1.00308298e+03, -9.35638103e+01,  4.16886454e+03,  9.92973243e+02,
        1.64604220e+03,  2.25530810e+03, -3.39570857e+02, -7.19228600e+02,
        5.51153075e+02,  2.39916405e+03, -3.89097452e+02,  8.75908999e+02,
        8.00667794e+02,  2.04541782e+03, -3.72415178e+02,  2.04509218e+03,
       -1.42169220e+03,  1.59246628e+03, -5.50674245e+02, -1.13512916e+03,
        1.47557580e+03, -1.92850146e+02, -1.36979666e+04,  3.87732768e+02,
       -5.08136709e+02,  1.56494851e+03,  2.05115776e+03,  2.28860798e+02,
        1.48505657e+03,  2.62021388e+03,  2.24258848e+03,  6.56539040e+02,
       -9.32464754e+02, -