In [1]:
import torch
import numpy as np
import pandas as pd
import torch.nn.functional as F
import torch.nn as nn

In [2]:
from sklearn.model_selection import train_test_split

In [3]:
# Load the taxi-fare records. The path is relative, so the CSV has to sit in
# the directory the notebook kernel was started from.
df = pd.read_csv("NYCTaxiFares.csv")

In [4]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [5]:
# Regression target: the fare column as a plain NumPy array.
y_label = df["fare_amount"].to_numpy()

In [6]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df.

    Parameters
    ----------
    df : object indexable by column label (e.g. a pandas DataFrame)
        Holds the coordinate columns; values are converted from degrees
        to radians internally, so they must be given in degrees.
    lat1, long1 : column labels
        Latitude / longitude of the first point.
    lat2, long2 : column labels
        Latitude / longitude of the second point.

    Returns
    -------
    Element-wise great-circle distance in kilometers, in whatever container
    type the arithmetic on ``df[...]`` produces (a Series for a DataFrame).
    """
    r = 6371  # average radius of Earth in kilometers

    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])

    delta_phi = np.radians(df[lat2] - df[lat1])
    delta_lambda = np.radians(df[long2] - df[long1])

    a = np.sin(delta_phi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2) ** 2
    # Floating-point rounding can push `a` marginally above 1 for
    # (near-)antipodal points, which would turn sqrt(1 - a) into NaN.
    # np.minimum still propagates NaN coming from missing coordinates.
    a = np.minimum(a, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return r * c  # in kilometers

In [7]:
# Derive the pickup-to-dropoff great-circle distance (km) as an extra feature
# column, then preview the augmented frame.
df['dist_km'] = haversine_distance(
    df,
    lat1='pickup_latitude',
    long1='pickup_longitude',
    lat2='dropoff_latitude',
    long2='dropoff_longitude',
)
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [8]:
# Pull every model input out of the frame as a plain NumPy array, one
# variable per column, in the order they are stacked into X below.
p_longitude, p_latitude, d_longitude, d_latitude, passenger_count, distance = (
    df[column].to_numpy()
    for column in (
        "pickup_longitude",
        "pickup_latitude",
        "dropoff_longitude",
        "dropoff_latitude",
        "passenger_count",
        "dist_km",
    )
)

In [27]:
# Feature matrix: one row per trip, one column per input (6 columns).
# For 1-D inputs, stacking on the last axis is the same as axis=1.
X = np.stack(
    (
        p_longitude,
        p_latitude,
        d_longitude,
        d_latitude,
        passenger_count,
        distance,
    ),
    axis=-1,
)

In [28]:
# Turn the targets into a single-column 2-D array so they line up with the
# model's one-value-per-row output when the loss is computed.
y_label = np.reshape(y_label, (-1, 1))

In [29]:
# Hold out part of the data for evaluation (scikit-learn's default split
# size). The shuffle is seeded so that Restart & Run All reproduces the same
# train/test rows and therefore the same metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y_label, random_state=42)

In [30]:
# Convert the feature arrays to float32 tensors. torch.as_tensor with an
# explicit dtype replaces the legacy type-specific FloatTensor constructor
# and returns the input unchanged when it is already a float32 tensor, so
# re-running this cell is harmless.
X_train = torch.as_tensor(X_train, dtype=torch.float32)
X_test = torch.as_tensor(X_test, dtype=torch.float32)

In [31]:
# Convert the targets to float32 tensors. torch.as_tensor with an explicit
# dtype replaces the legacy type-specific FloatTensor constructor and returns
# the input unchanged when it is already a float32 tensor, so re-running this
# cell is harmless.
y_train = torch.as_tensor(y_train, dtype=torch.float32)
y_test = torch.as_tensor(y_test, dtype=torch.float32)

In [32]:
class Model(nn.Module):
    """Fully connected regressor: three ReLU hidden layers and a linear head.

    Parameters
    ----------
    in_features : width of each input row.
    h1, h2, h3 : widths of the three hidden layers.
    out_features : width of the (un-activated) output.
    """

    def __init__(self, in_features=6, h1=100, h2=100, h3 = 100, out_features=1):
        super().__init__()
        # Hidden stack: in_features -> h1 -> h2 -> h3
        self.fc1 = nn.Linear(in_features=in_features, out_features=h1)
        self.fc2 = nn.Linear(in_features=h1, out_features=h2)
        self.fc3 = nn.Linear(in_features=h2, out_features=h3)
        # Output head: h3 -> out_features, no activation
        self.out = nn.Linear(in_features=h3, out_features=out_features)

    def forward(self, x):
        """Run ``x`` through the three ReLU layers, then the linear head."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))
        return self.out(hidden)

In [33]:
# Seed PyTorch's global RNG before the layers are created so the initial
# weights, and with them the training curve, are the same on every
# Restart & Run All.
torch.manual_seed(42)
model = Model()

In [34]:
# Mean-squared-error objective; take the square root of a reported loss to
# read it as an RMSE in the same units as fare_amount.
criterion = nn.MSELoss()
# Adam over all trainable parameters of the network.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [36]:
epochs = 150

# Full-batch training: every epoch is one forward/backward pass over the
# whole training set.
model.train()  # make sure the module is in training mode before optimising
for i in range(epochs):
    # Call the module itself rather than .forward(): Module.__call__ is what
    # runs any registered hooks around the forward pass.
    y_pred = model(X_train)
    loss = criterion(y_pred, y_train)

    # Report the pre-update loss on epochs 1, 11, 21, ...
    if i % 10 == 1:
        print(f'epoch: {i:2}  loss: {loss.item():10.8f}')

    optimizer.zero_grad()  # clear gradients accumulated by the previous step
    loss.backward()
    optimizer.step()

epoch:  1  loss: 10.97501469
epoch: 11  loss: 11.04236603
epoch: 21  loss: 10.80784225
epoch: 31  loss: 10.76502991
epoch: 41  loss: 10.74826527
epoch: 51  loss: 10.73707867
epoch: 61  loss: 10.73199940
epoch: 71  loss: 10.76047039
epoch: 81  loss: 10.73238564
epoch: 91  loss: 10.73316956
epoch: 101  loss: 10.72959137
epoch: 111  loss: 10.72743130
epoch: 121  loss: 10.72658634
epoch: 131  loss: 10.72527027
epoch: 141  loss: 10.73643398


In [37]:
# Evaluate on the held-out split: switch to inference mode and disable
# gradient tracking, since no backward pass is needed here.
model.eval()
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks run.
    y_val = model(X_test)
    loss = criterion(y_val, y_test)  # MSE on the test set
    print(loss)

tensor(11.7121)


In [39]:
# Print the first predictions next to their true fares for a quick visual check.
model.eval()
with torch.no_grad():
    # Call the module itself rather than .forward() so registered hooks run.
    y_val = model(X_test)
    # Slicing (instead of indexing with range(10)) shows up to 10 rows and
    # cannot raise IndexError when the test split is smaller than that.
    for pred, actual in zip(y_val[:10], y_test[:10]):
        print(f'{pred!s}   {actual!s}')

tensor([18.5812])   tensor([18.9000])
tensor([4.7373])   tensor([3.7000])
tensor([5.3142])   tensor([5.3000])
tensor([7.0902])   tensor([4.9000])
tensor([5.4543])   tensor([5.7000])
tensor([5.0427])   tensor([4.5000])
tensor([12.7880])   tensor([7.3000])
tensor([8.6924])   tensor([7.3000])
tensor([5.0401])   tensor([3.7000])
tensor([5.7923])   tensor([5.7000])
