## Imports

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime

def get_path(dataset_name,env_name='colab'):
    """Return where a dataset file lives for the given environment.

    When ``env_name`` is ``'colab'`` (the default) the result is the raw
    GitHub URL of the file; for any other value it is a path relative to
    ``../Datasets/``.
    """
    # Anything that is not 'colab' is served from the local Datasets folder.
    if env_name != 'colab':
        return f'../Datasets/{dataset_name}'
    remote_root = 'https://raw.githubusercontent.com/John-Ghaly88/Big_Data_and_NoSQL/main/Datasets/Assessment/'
    return remote_root + dataset_name

### Explore data

In [3]:
# Read the trip sample from the location get_path() returns (the GitHub raw URL
# by default) and preview the first rows.
df = pd.read_csv(get_path('taxi_trip_data(20k).csv'))

df.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,rate_code,store_and_fwd_flag,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,total_amount,pickup_location_id,dropoff_location_id
0,1,05-11-18 17:40,05-11-18 17:55,1,1.6,1,N,1,11.5,1.0,0.5,0.0,0.0,0.3,13.3,48,68
1,2,3/22/2018 23:01,3/22/2018 23:25,1,9.52,1,N,1,28.5,0.5,0.5,5.96,0.0,0.3,35.76,138,230
2,2,7/24/2018 9:58,7/24/2018 10:22,1,2.17,1,N,1,15.5,0.0,0.5,1.5,0.0,0.3,17.8,234,48
3,2,12/21/2018 18:28,12/21/2018 18:35,1,0.86,1,N,2,6.0,1.0,0.5,0.0,0.0,0.3,7.8,79,125
4,1,8/15/2018 13:58,8/15/2018 14:05,1,0.3,1,N,2,5.5,0.0,0.5,0.0,0.0,0.3,6.3,233,233


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19999 entries, 0 to 19998
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   vendor_id            19999 non-null  int64  
 1   pickup_datetime      19999 non-null  object 
 2   dropoff_datetime     19999 non-null  object 
 3   passenger_count      19999 non-null  int64  
 4   trip_distance        19999 non-null  float64
 5   rate_code            19999 non-null  int64  
 6   store_and_fwd_flag   19999 non-null  object 
 7   payment_type         19999 non-null  int64  
 8   fare_amount          19999 non-null  float64
 9   extra                19999 non-null  float64
 10  mta_tax              19999 non-null  float64
 11  tip_amount           19999 non-null  float64
 12  tolls_amount         19999 non-null  float64
 13  imp_surcharge        19999 non-null  float64
 14  total_amount         19999 non-null  float64
 15  pickup_location_id   19999 non-null 

In [5]:
# Remove three columns from the trip table before cleaning.
df = df.drop(columns=['store_and_fwd_flag', 'rate_code', 'total_amount'])

In [6]:
df['vendor_id'].unique()

array([1, 2, 4], dtype=int64)

In [8]:
# Keep only the trips whose vendor_id is not 4, then list the ids that remain.
df = df[df['vendor_id'].ne(4)]

df['vendor_id'].unique()

array([1, 2], dtype=int64)

In [9]:
len(df[df['passenger_count']==0])/len(df)

0.009686323713927227

In [11]:
# Discard trips recorded with zero passengers, then inspect the payment types.
df = df[df['passenger_count'].ne(0)]

df['payment_type'].unique()

array([1, 2, 3, 4], dtype=int64)

In [12]:
# Discard trips whose recorded distance is exactly zero.
df = df[df['trip_distance'].ne(0)]

In [13]:
# Discard trips whose recorded fare is exactly zero.
df = df[df['fare_amount'].ne(0)]

In [14]:
# Keep only the trips whose mta_tax equals 0.5.
df = df[df['mta_tax'].eq(0.5)]

In [None]:
# df.info()

In [16]:
# Read the zone table that is merged with the trips further down.
zone = pd.read_csv(get_path('taxi_zone_geo.csv'))

In [17]:
# Drop the fourth column of the zone table (selected by position, not by name).
zone = zone.drop(columns=zone.columns[3])

### Renaming the pickup location column to `zone_id` so the trips can be merged with the zone table to retrieve the location name

In [18]:
# Give the pickup location column the same name as the zone table's key.
df = df.rename(columns={'pickup_location_id': 'zone_id'})

In [19]:
# Inner join on zone_id: a trip whose zone_id has no match in `zone` is dropped,
# and a trip is repeated once for every zone row that shares its zone_id.
# NOTE(review): compare len(df) with len(df_merged) to confirm that no trips are
# lost or duplicated by this join.
df_merged= pd.merge(df,zone, on='zone_id', how='inner')

In [20]:
len(df_merged)

19220

In [21]:
df_merged.head()

Unnamed: 0,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,trip_distance,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,imp_surcharge,zone_id,dropoff_location_id,zone_name,borough
0,1,05-11-18 17:40,05-11-18 17:55,1,1.6,1,11.5,1.0,0.5,0.0,0.0,0.3,48,68,Clinton East,Manhattan
1,1,8/26/2018 10:24,8/26/2018 10:32,2,1.2,2,7.5,0.0,0.5,0.0,0.0,0.3,48,43,Clinton East,Manhattan
2,2,11/21/2018 22:25,11/21/2018 22:42,1,2.4,2,12.5,0.5,0.5,0.0,0.0,0.3,48,137,Clinton East,Manhattan
3,1,6/15/2018 6:26,6/15/2018 6:34,1,1.1,1,7.0,0.0,0.5,2.3,0.0,0.3,48,162,Clinton East,Manhattan
4,2,12-05-18 18:41,12-05-18 18:53,2,1.35,1,9.0,1.0,0.5,3.0,0.0,0.3,48,68,Clinton East,Manhattan


### Converting the pickup and dropoff columns to datetime format to calculate the duration of each trip

In [23]:
# Parse both timestamp columns, then subtract them to get each trip's elapsed time.
# NOTE(review): the previews above show two string layouts ('05-11-18 17:40' and
# '8/26/2018 10:24'). Confirm that the installed pandas parses both the way you
# intend; newer pandas versions may require format='mixed' for such columns.
df_merged['dropoff_datetime'] =  pd.to_datetime(df_merged['dropoff_datetime'])
df_merged['pickup_datetime'] =  pd.to_datetime(df_merged['pickup_datetime'])
df_merged['duration'] =df_merged['dropoff_datetime'] - df_merged['pickup_datetime']

def minutes(time):
    """Return the full length of a time difference in minutes.

    Parameters
    ----------
    time : datetime.timedelta or pandas.Timedelta
        The elapsed time to convert.

    Returns
    -------
    float
        Total duration in minutes. Negative differences give negative values.
    """
    # `.seconds` is only the seconds *component* (0-86399): it ignores whole
    # days and wraps negative durations to large positive numbers.
    # `total_seconds()` covers the entire span.
    return time.total_seconds()/60

# Replace each elapsed-time value with the number returned by minutes().
df_merged['duration'] = df_merged['duration'].apply(minutes)

In [None]:
# df_merged.head()

### Setting time of day according to pickup time

In [28]:
def time_of_day(date):
    """Label a timestamp with the part of the day its hour falls in.

    Hours 5-11 give 'morning', 12-16 'afternoon', 17-20 'evening', and every
    other hour (21 through 4) gives 'night'.
    """
    hour = date.hour
    if 5 <= hour < 12:
        return 'morning'
    if 12 <= hour < 17:
        return 'afternoon'
    if 17 <= hour < 21:
        return 'evening'
    return 'night'

In [30]:
# Tag every trip with the part of the day of its pickup, then list the labels found.
df_merged['time_of_day'] = df_merged['pickup_datetime'].apply(time_of_day)

df_merged['time_of_day'].unique()

array(['evening', 'morning', 'night', 'afternoon'], dtype=object)

In [None]:
# df_merged.head()

In [32]:
# Turn both timestamp columns back into text in "month/day/year, H:M:S" form.
df_merged = df_merged.assign(
    dropoff_datetime=df_merged['dropoff_datetime'].dt.strftime("%m/%d/%Y, %H:%M:%S"),
    pickup_datetime=df_merged['pickup_datetime'].dt.strftime("%m/%d/%Y, %H:%M:%S"),
)

### Exporting to csv file

In [34]:
# df_merged.to_csv('taxi_trip_cleaned.csv',index=False)

## Bonus

### The correlation coefficient (|r| ≈ 0.57) shows that trip distance and tip amount are moderately correlated

In [37]:
from scipy import stats

# Least-squares fit of tip_amount against trip_distance; per the SciPy docs the
# third returned value (r) is the correlation coefficient of the two columns.
slope, intercept, r, p, std_err = stats.linregress(df_merged['trip_distance'], df_merged['tip_amount'])

# Report only the magnitude of r; its sign is discarded by abs().
print(abs(r))

0.5684683896088778
