In [1]:
import pandas as pd
import numpy as np
from zipfile import ZipFile

In [2]:
# Load the raw train/test CSVs directly from their zip archives.
# Context managers close both the archive and the extracted member handle
# once pandas has finished parsing (previously the ZipFile objects and the
# file handles returned by .open() were left open).
with ZipFile('../data/train.zip') as train_zip, train_zip.open('train.csv') as train_csv:
    taxiTrainRaw = pd.read_csv(train_csv)
with ZipFile('../data/test.zip') as test_zip, test_zip.open('test.csv') as test_csv:
    taxiTestRaw = pd.read_csv(test_csv)

In [3]:
# Preview the first rows of the raw training data.
taxiTrainRaw.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
# Summarise the raw training frame: column dtypes, non-null counts and memory usage.
taxiTrainRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


In [5]:
# Preview the first rows of the raw test data.
taxiTestRaw.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id3004672,1,2016-06-30 23:59:58,1,-73.988129,40.732029,-73.990173,40.75668,N
1,id3505355,1,2016-06-30 23:59:53,1,-73.964203,40.679993,-73.959808,40.655403,N
2,id1217141,1,2016-06-30 23:59:47,1,-73.997437,40.737583,-73.98616,40.729523,N
3,id2150126,2,2016-06-30 23:59:41,1,-73.95607,40.7719,-73.986427,40.730469,N
4,id1598245,1,2016-06-30 23:59:33,1,-73.970215,40.761475,-73.96151,40.75589,N


In [6]:
# Summarise the raw test frame: column dtypes, non-null counts and memory usage.
taxiTestRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  625134 non-null  object 
 1   vendor_id           625134 non-null  int64  
 2   pickup_datetime     625134 non-null  object 
 3   passenger_count     625134 non-null  int64  
 4   pickup_longitude    625134 non-null  float64
 5   pickup_latitude     625134 non-null  float64
 6   dropoff_longitude   625134 non-null  float64
 7   dropoff_latitude    625134 non-null  float64
 8   store_and_fwd_flag  625134 non-null  object 
dtypes: float64(4), int64(2), object(3)
memory usage: 42.9+ MB


In [7]:
# Build working copies without the identifier, vendor, timestamp and
# store-and-forward columns. The test set has no 'dropoff_datetime' column,
# so its drop list is one entry shorter.
taxiTrain = taxiTrainRaw.drop(
    columns=[
        'id',
        'vendor_id',
        'pickup_datetime',
        'dropoff_datetime',
        'store_and_fwd_flag',
    ]
)
taxiTest = taxiTestRaw.drop(
    columns=[
        'id',
        'vendor_id',
        'pickup_datetime',
        'store_and_fwd_flag',
    ]
)

In [8]:
def calculateDistance(long1, lat1, long2, lat2, radius_km=6371):
    """Return the great-circle (haversine) distance between two points.

    All coordinate arguments are in decimal degrees. Because only NumPy
    ufuncs and arithmetic are used, each argument may be a scalar or an
    array-like (e.g. a NumPy array or pandas Series); the computation is
    applied element-wise.

    Parameters
    ----------
    long1, lat1 : float or array-like
        Longitude and latitude of the first point, in degrees.
    long2, lat2 : float or array-like
        Longitude and latitude of the second point, in degrees.
    radius_km : float, optional
        Radius of the sphere. The default, 6371, is the mean Earth radius
        in kilometres, so the result is in kilometres.

    Returns
    -------
    float or array-like
        Distance along the sphere's surface, in the units of ``radius_km``.
    """
    long1, lat1, long2, lat2 = map(np.radians, [long1, lat1, long2, lat2])
    dLong = long2 - long1
    dLat = lat2 - lat1
    a = np.sin(dLat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dLong / 2)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/arcsin return NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return c * radius_km
    

In [9]:
# Add a great-circle 'distance' feature to both frames.
# calculateDistance is built from NumPy ufuncs, so whole columns can be passed
# in a single call. This replaces the row-by-row DataFrame.apply(axis=1), which
# invoked the Python function once per row (roughly two million calls across
# the train and test sets).
for frame in (taxiTrain, taxiTest):
    frame['distance'] = calculateDistance(frame['pickup_longitude'], frame['pickup_latitude'],
                                          frame['dropoff_longitude'], frame['dropoff_latitude'])

In [10]:
# The pickup/dropoff coordinates have been condensed into 'distance', so
# remove them from both frames (each frame is modified in place).
for frame in (taxiTrain, taxiTest):
    frame.drop(
        columns=['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude'],
        inplace=True,
    )

In [11]:
# Check the reduced training frame: remaining columns, dtypes and memory usage.
taxiTrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 3 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   passenger_count  1458644 non-null  int64  
 1   trip_duration    1458644 non-null  int64  
 2   distance         1458644 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 33.4 MB


In [12]:
# Persist the reduced train and test frames as pickle files.
taxiTrain.to_pickle('../data/train.pickle')
taxiTest.to_pickle('../data/test.pickle')