In [61]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

In [62]:
data = pd.read_csv('uber.csv', low_memory=False, index_col=0)

In [63]:
data

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


# Data Preprocessing

In [64]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 200000 entries, 24238194 to 11951496
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   key                200000 non-null  object 
 1   fare_amount        200000 non-null  float64
 2   pickup_datetime    200000 non-null  object 
 3   pickup_longitude   200000 non-null  float64
 4   pickup_latitude    200000 non-null  float64
 5   dropoff_longitude  199999 non-null  float64
 6   dropoff_latitude   199999 non-null  float64
 7   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(1), object(2)
memory usage: 13.7+ MB


In [65]:
data.dropna(inplace=True)

In [66]:
data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,199999.0,199999.0,199999.0,199999.0,199999.0,199999.0
mean,11.359892,-72.527631,39.935881,-72.525292,39.92389,1.684543
std,9.90176,11.437815,7.720558,13.117408,6.794829,1.385995
min,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


# IQR
### IQR = Q3 -Q1
### Min = Q1 - 1.5 X IQR

In [67]:
Q1 = data['pickup_longitude'].quantile(0.25)
Q3 = data['pickup_longitude'].quantile(0.75)
IQR = Q3 - Q1

Min = Q1 - 1.5 * IQR
Max = Q3 + 1.5 * IQR

In [68]:
print('Max = ', Max)
print('Max = ', Min)

Max =  -73.92978750000003
Max =  -74.02943149999999


In [69]:
data[(data['pickup_longitude'] < Min) | (data['pickup_longitude'] > Max)].shape[0]/data.shape[0]

0.06779533897669489

# Z score

In [70]:
mu = data['pickup_longitude'].mean()
std = data['pickup_longitude'].std()

z = np.abs(data['pickup_longitude'] - mu)

print(data[z>3].shape[0]/data.shape[0]*100)

data = data[z<=3]
data

1.9710098550492752


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [71]:
mu = data['pickup_latitude'].mean()
std = data['pickup_latitude'].std()

z = np.abs(data['pickup_latitude'] - mu)

print(data[z>3].shape[0]/data.shape[0]*100)

data = data[z<=3]
data

0.005100557490933758


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [72]:
mu = data['dropoff_longitude'].mean()
std = data['dropoff_longitude'].std()

z = np.abs(data['dropoff_longitude'] - mu)

print(data[z>3].shape[0]/data.shape[0]*100)

data = data[z<=3]
data

0.09589537202813611


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [73]:
mu = data['dropoff_latitude'].mean()
std = data['dropoff_latitude'].std()

z = np.abs(data['dropoff_latitude'] - mu)

print(data[z>3].shape[0]/data.shape[0]*100)

data = data[z<=3]
data

0.004595142423886572


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.994710,40.750325,1
44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.740770,-73.962565,40.772647,1
25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
42598914,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00 UTC,-73.987042,40.739367,-73.986525,40.740297,1
16382965,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00 UTC,-73.984722,40.736837,-74.006672,40.739620,1
27804658,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00 UTC,-73.986017,40.756487,-73.858957,40.692588,2
20259894,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25 UTC,-73.997124,40.725452,-73.983215,40.695415,1


In [74]:
data.reset_index(drop=True, inplace=True)

In [75]:
data.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,195850.0,195850.0,195850.0,195850.0,195850.0,195850.0
mean,11.341925,-73.975154,40.750908,-73.974189,40.751177,1.684825
std,9.798707,0.041479,0.032476,0.041409,0.0365,1.387662
min,-52.0,-75.426904,39.514527,-75.458979,39.514527,0.0
25%,6.0,-73.992273,40.736448,-73.991597,40.735323,1.0
50%,8.5,-73.982108,40.753308,-73.980537,40.753748,1.0
75%,12.5,-73.968355,40.76755,-73.965373,40.768335,2.0
max,499.0,-71.004193,42.478467,-71.004193,42.464187,208.0


In [76]:
data.shape[0]

195850

In [78]:
from geopy.distance import geodesic

In [79]:
Distance = []

for i in range(data.shape[0]):
    start = (data.loc[i, 'pickup_latitude'], data.loc[i, 'pickup_longitude'])
    dest = (data.loc[i, 'dropoff_latitude'], data.loc[i, 'dropoff_longitude'])
    Distance.append(geodesic(start, dest).kilometers)



In [80]:
df = {'distance': Distance,
           'n_passenger': data['passenger_count'].values,
           'fare': data['fare_amount'].values}

In [84]:
df

{'distance': [1.6811107421764293,
  2.4543632395347106,
  5.039602684044266,
  1.6614415240630966,
  4.483730087177181,
  0.0,
  11.73466714708299,
  2.33855965755225,
  4.891121357057089,
  2.2503607684447413,
  0.30187973981063815,
  3.579381355684481,
  1.3087360846158802,
  1.7190123091411413,
  0.7312133686589251,
  2.5210933890913685,
  1.7901528525644335,
  1.0345529350832423,
  2.493014083354037,
  0.9583104783394542,
  1.259752705953191,
  1.7500684549215246,
  6.190482758821909,
  2.733597059871603,
  0.7223132575720504,
  3.225516110805604,
  1.4302720789145387,
  2.23913351743337,
  13.040416375766188,
  1.8961713063881787,
  1.9097003504477117,
  3.180891779182723,
  11.355487331684865,
  2.9275620507531435,
  1.2007675423315685,
  2.6359329726661773,
  2.252064373503419,
  9.96792629234231,
  4.820210689614793,
  1.2498441727246834,
  0.8004236940701086,
  0.8398440040908891,
  0.38528358001204277,
  2.4316604878437453,
  3.134738815090711,
  3.7299542000993378,
  0.0,
  