In [1]:
import torch
import torch.nn as nn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df= pd.read_csv('/home/mikaeil/Desktop/Dataset/Taxi Dataset/NYCTaxiFares.csv')

In [3]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1


In [4]:
df['fare_amount'].describe()

count    120000.000000
mean         10.040326
std           7.500134
min           2.500000
25%           5.700000
50%           7.700000
75%          11.300000
max          49.900000
Name: fare_amount, dtype: float64

In [5]:
def haversine_distance(df, lat1, long1, lat2, long2):
    """
    Calculates the haversine distance between 2 sets of GPS coordinates in df
    """
    r = 6371  # average radius of Earth in kilometers
       
    phi1 = np.radians(df[lat1])
    phi2 = np.radians(df[lat2])
    
    delta_phi = np.radians(df[lat2]-df[lat1])
    delta_lambda = np.radians(df[long2]-df[long1])
     
    a = np.sin(delta_phi/2)**2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    d = (r * c) # in kilometers

    return d

In [6]:
df['dist_km'] = haversine_distance(df,'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km
0,2010-04-19 08:17:56 UTC,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312
1,2010-04-17 15:43:53 UTC,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307
2,2010-04-17 11:23:26 UTC,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763
3,2010-04-11 21:25:03 UTC,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129
4,2010-04-17 02:19:01 UTC,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   pickup_datetime    120000 non-null  object 
 1   fare_amount        120000 non-null  float64
 2   fare_class         120000 non-null  int64  
 3   pickup_longitude   120000 non-null  float64
 4   pickup_latitude    120000 non-null  float64
 5   dropoff_longitude  120000 non-null  float64
 6   dropoff_latitude   120000 non-null  float64
 7   passenger_count    120000 non-null  int64  
 8   dist_km            120000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 8.2+ MB


In [8]:
df['pickup_datetime']= pd.to_datetime(df['pickup_datetime'])

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120000 entries, 0 to 119999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   pickup_datetime    120000 non-null  datetime64[ns, UTC]
 1   fare_amount        120000 non-null  float64            
 2   fare_class         120000 non-null  int64              
 3   pickup_longitude   120000 non-null  float64            
 4   pickup_latitude    120000 non-null  float64            
 5   dropoff_longitude  120000 non-null  float64            
 6   dropoff_latitude   120000 non-null  float64            
 7   passenger_count    120000 non-null  int64              
 8   dist_km            120000 non-null  float64            
dtypes: datetime64[ns, UTC](1), float64(6), int64(2)
memory usage: 8.2 MB


In [10]:
 df['EDTdate']= df['pickup_datetime']- pd.Timedelta(hours=4)

In [11]:
df['Hour']= df['EDTdate'].dt.hour

In [12]:
df['AMorPM']= np.where(df['Hour']<12, 'am', 'pm')

In [13]:
df['Weekday']= df['EDTdate'].dt.strftime('%a')#  dt.dayofweek

In [14]:
df.head()

Unnamed: 0,pickup_datetime,fare_amount,fare_class,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,dist_km,EDTdate,Hour,AMorPM,Weekday
0,2010-04-19 08:17:56+00:00,6.5,0,-73.992365,40.730521,-73.975499,40.744746,1,2.126312,2010-04-19 04:17:56+00:00,4,am,Mon
1,2010-04-17 15:43:53+00:00,6.9,0,-73.990078,40.740558,-73.974232,40.744114,1,1.392307,2010-04-17 11:43:53+00:00,11,am,Sat
2,2010-04-17 11:23:26+00:00,10.1,1,-73.994149,40.751118,-73.960064,40.766235,2,3.326763,2010-04-17 07:23:26+00:00,7,am,Sat
3,2010-04-11 21:25:03+00:00,8.9,0,-73.990485,40.756422,-73.971205,40.748192,1,1.864129,2010-04-11 17:25:03+00:00,17,pm,Sun
4,2010-04-17 02:19:01+00:00,19.7,1,-73.990976,40.734202,-73.905956,40.743115,1,7.231321,2010-04-16 22:19:01+00:00,22,pm,Fri


In [15]:
cat_cols= ['Hour', 'AMorPM', 'Weekday']
cont_cols= ['pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'passenger_count', 'dist_km']

In [16]:
 y_col= ['fare_amount']

In [17]:
for cat in cat_cols:
    df[cat]= df[cat].astype('category')

In [18]:
df.dtypes

pickup_datetime      datetime64[ns, UTC]
fare_amount                      float64
fare_class                         int64
pickup_longitude                 float64
pickup_latitude                  float64
dropoff_longitude                float64
dropoff_latitude                 float64
passenger_count                    int64
dist_km                          float64
EDTdate              datetime64[ns, UTC]
Hour                            category
AMorPM                          category
Weekday                         category
dtype: object

In [19]:
df['Hour'].head()

0     4
1    11
2     7
3    17
4    22
Name: Hour, dtype: category
Categories (24, int64): [0, 1, 2, 3, ..., 20, 21, 22, 23]

In [20]:
df['AMorPM'].head()

0    am
1    am
2    am
3    pm
4    pm
Name: AMorPM, dtype: category
Categories (2, object): ['am', 'pm']

In [21]:
df['Weekday'].head()

0    Mon
1    Sat
2    Sat
3    Sun
4    Fri
Name: Weekday, dtype: category
Categories (7, object): ['Fri', 'Mon', 'Sat', 'Sun', 'Thu', 'Tue', 'Wed']

In [22]:
df['AMorPM'].cat.codes

0         0
1         0
2         0
3         1
4         1
         ..
119995    0
119996    0
119997    1
119998    0
119999    1
Length: 120000, dtype: int8

In [23]:
df['AMorPM'].cat.codes.values

array([0, 0, 0, ..., 1, 0, 1], dtype=int8)

In [24]:
hr= df['Hour'].cat.codes.values
ampm= df['AMorPM'].cat.codes.values
wkdy= df['Weekday'].cat.codes.values

In [25]:
hr

array([ 4, 11,  7, ..., 14,  4, 12], dtype=int8)

In [26]:
cats= np.stack([hr, ampm, wkdy], axis=1)

In [27]:
cats

array([[ 4,  0,  1],
       [11,  0,  2],
       [ 7,  0,  2],
       ...,
       [14,  1,  3],
       [ 4,  0,  5],
       [12,  1,  2]], dtype=int8)

In [28]:
#cats=np.stack([df[col].cat.codes.values for col in cat_cols],1)

In [29]:
cats= torch.tensor(cats, dtype= torch.int64)

In [30]:
conts=np.stack([df[col].values for col in cont_cols], axis=1)

In [31]:
conts= torch.tensor(conts, dtype= torch.float)

In [32]:
y=torch.tensor(df[y_col].values,dtype= torch.float).reshape(-1, 1)

In [33]:
# calculate category size
cat_szs= [len(df[col].cat.categories) for col in cat_cols]

In [34]:
cat_szs

[24, 2, 7]

In [35]:
# calculate embedding size
emb_szs= [(size, min(50, (size+1)//2))for size in cat_szs]

In [36]:
emb_szs

[(24, 12), (2, 1), (7, 4)]

In [37]:
catz=cats[:2]

In [38]:
catz

tensor([[ 4,  0,  1],
        [11,  0,  2]])

In [39]:
# make a list of embedding layers
selfembeds= nn.ModuleList([nn.Embedding(ni,nf) for ni,nf in emb_szs])

In [40]:
selfembeds

ModuleList(
  (0): Embedding(24, 12)
  (1): Embedding(2, 1)
  (2): Embedding(7, 4)
)

In [41]:
embeddingz= []

for i,e in enumerate(selfembeds):
    embeddingz.append(e(catz[:,i]))

In [42]:
embeddingz

[tensor([[ 1.4325, -0.3552, -0.3853, -0.5851, -0.7315, -2.4092,  0.9159,  0.3791,
          -0.1599,  1.5418, -0.8957, -2.4161],
         [ 0.1542,  0.2536,  0.4741, -0.4527,  0.6275,  1.7840,  0.0418,  2.4309,
          -0.9698, -1.3570,  0.8834,  0.3976]], grad_fn=<EmbeddingBackward>),
 tensor([[0.9250],
         [0.9250]], grad_fn=<EmbeddingBackward>),
 tensor([[-1.8671, -0.2842, -0.5907, -0.7124],
         [ 1.1225,  0.3230,  1.5499,  1.4941]], grad_fn=<EmbeddingBackward>)]

In [43]:
z= torch.cat(embeddingz, 1)

In [44]:
z

tensor([[ 1.4325, -0.3552, -0.3853, -0.5851, -0.7315, -2.4092,  0.9159,  0.3791,
         -0.1599,  1.5418, -0.8957, -2.4161,  0.9250, -1.8671, -0.2842, -0.5907,
         -0.7124],
        [ 0.1542,  0.2536,  0.4741, -0.4527,  0.6275,  1.7840,  0.0418,  2.4309,
         -0.9698, -1.3570,  0.8834,  0.3976,  0.9250,  1.1225,  0.3230,  1.5499,
          1.4941]], grad_fn=<CatBackward>)

In [45]:
selfembdrop= nn.Dropout(0.4)

In [46]:
z= selfembdrop(z) # pass to droupout layer

In [47]:
z

tensor([[ 0.0000, -0.5920, -0.6421, -0.0000, -0.0000, -0.0000,  0.0000,  0.6318,
         -0.2664,  2.5697, -1.4928, -0.0000,  0.0000, -0.0000, -0.0000, -0.9845,
         -1.1874],
        [ 0.2569,  0.4226,  0.7902, -0.7545,  1.0458,  2.9734,  0.0697,  4.0514,
         -1.6163, -0.0000,  1.4723,  0.6626,  1.5417,  0.0000,  0.0000,  2.5832,
          0.0000]], grad_fn=<MulBackward0>)

In [48]:
class TabularModel(nn.Module):
    def __init__(self, emb_szs, n_cont, out_sz, layers, p=0.5):
        super().__init__()
        
        self.embeds= nn.ModuleList([nn.Embedding(ni,nf) for ni, nf in emb_szs])
        self.emb_drop= nn.Dropout(p)
        self.bn_cont= nn.BatchNorm1d(n_cont)
        
        layerlist=[]
        n_emb= sum(nf for ni,nf in emb_szs )
        n_in = n_emb + n_cont  # sum of all input features
        
        for i in layers:
            layerlist.append(nn.Linear(n_in, i))
            layerlist.append(nn.ReLU(inplace= True))
            layerlist.append(nn.BatchNorm1d(i))
            layerlist.append(nn.Dropout(p))
            n_in=i
            
        layerlist.append(nn.Linear(layers[-1], out_sz))
        
        
        self.layers= nn.Sequential(*layerlist)
    
    def forward(self, x_cat, x_cont):
        embeddings= []
        
        
        for i, e in enumerate(self.embeds):
            embeddings.append(e(x_cat[:,i]))
            
        x= torch.cat(embeddings, 1)
        x= self.emb_drop(x)
            
        x_cont= self.bn_cont(x_cont) # pass through nomalization layer
        x=torch.cat([x, x_cont], 1)  # adding up categorical and continous data 
        
        return x
        

In [49]:
torch.manual_seed(33)

model= TabularModel(emb_szs, conts.shape[1], 1,[200,100], p=0.4)

In [50]:
model

TabularModel(
  (embeds): ModuleList(
    (0): Embedding(24, 12)
    (1): Embedding(2, 1)
    (2): Embedding(7, 4)
  )
  (emb_drop): Dropout(p=0.4, inplace=False)
  (bn_cont): BatchNorm1d(6, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): Linear(in_features=23, out_features=200, bias=True)
    (1): ReLU(inplace=True)
    (2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=200, out_features=100, bias=True)
    (5): ReLU(inplace=True)
    (6): BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): Dropout(p=0.4, inplace=False)
    (8): Linear(in_features=100, out_features=1, bias=True)
  )
)

In [51]:
criterion= nn.MSELoss()
optimizer= torch.optim.Adam(model.parameters(), lr= 0.06)

In [52]:
batch_size= 120000
test_size= int(batch_size*0.2)

In [53]:
cat_train= cats[:batch_size - test_size]
cat_test= cats[batch_size - test_size: batch_size]
con_train= conts[:batch_size - test_size]
con_test= conts[batch_size - test_size : batch_size]

In [54]:
y_train= y[:batch_size - test_size]
y_test= y[batch_size - test_size: batch_size]

In [55]:
import time

start_time= time.time()

epochs= 500
losses= []


for i in range(epochs):
    i +=1
    
    
    y_pred= model(cat_train, con_train)
    loss= torch.sqrt(criterion(y_pred, y_train))
    losses.append(loss)
    
    
    if i%10==1:
        print(f'epoch: {i} loss is {loss}')
        
        
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    
duration= time.time() - start_time
print(f'Training took {duration/60} minutes')


  return F.mse_loss(input, target, reduction=self.reduction)


epoch: 1 loss is 12.626952171325684
epoch: 11 loss is 12.128745079040527
epoch: 21 loss is 11.66796588897705
epoch: 31 loss is 11.248017311096191
epoch: 41 loss is 10.874324798583984
epoch: 51 loss is 10.54636287689209
epoch: 61 loss is 10.26839542388916
epoch: 71 loss is 10.027616500854492
epoch: 81 loss is 9.833477020263672
epoch: 91 loss is 9.678089141845703
epoch: 101 loss is 9.545310020446777
epoch: 111 loss is 9.45255184173584
epoch: 121 loss is 9.3724365234375
epoch: 131 loss is 9.30604362487793
epoch: 141 loss is 9.259363174438477
epoch: 151 loss is 9.223464012145996
epoch: 161 loss is 9.198450088500977
epoch: 171 loss is 9.178739547729492
epoch: 181 loss is 9.16048812866211
epoch: 191 loss is 9.146690368652344
epoch: 201 loss is 9.137798309326172
epoch: 211 loss is 9.131519317626953
epoch: 221 loss is 9.12613582611084
epoch: 231 loss is 9.119963645935059
epoch: 241 loss is 9.120931625366211
epoch: 251 loss is 9.116294860839844
epoch: 261 loss is 9.118053436279297
epoch: 271 lo

In [57]:
 with torch.no_grad():
        y_val= model(cat_test, con_test)
        loss= torch.sqrt(criterion(y_val, y_test))

  return F.mse_loss(input, target, reduction=self.reduction)


In [58]:
loss

tensor(9.1082)