#Predict the price of the Uber ride from a given pickup
point to the agreed drop-off location. Perform following
tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement linear regression and random forest regression models.
5. Evaluate the models and compare their respective scores like R2, RMSE, etc

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import preprocessing

In [2]:
df  = pd.read_csv("uber.csv")




















### 1. Pre-process the dataset.

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,24238194,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,27835199,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,44984355,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,25894730,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,17610152,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


In [4]:
df.describe()

Unnamed: 0.1,Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,200000.0,200000.0,200000.0,200000.0,199999.0,199999.0,200000.0
mean,27712500.0,11.359955,-72.527638,39.935885,-72.525292,39.92389,1.684535
std,16013820.0,9.901776,11.437787,7.720539,13.117408,6.794829,1.385997
min,1.0,-52.0,-1340.64841,-74.015515,-3356.6663,-881.985513,0.0
25%,13825350.0,6.0,-73.992065,40.734796,-73.991407,40.733823,1.0
50%,27745500.0,8.5,-73.981823,40.752592,-73.980093,40.753042,1.0
75%,41555300.0,12.5,-73.967154,40.767158,-73.963658,40.768001,2.0
max,55423570.0,499.0,57.418457,1644.421482,1153.572603,872.697628,208.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Unnamed: 0         200000 non-null  int64  
 1   key                200000 non-null  object 
 2   fare_amount        200000 non-null  float64
 3   pickup_datetime    200000 non-null  object 
 4   pickup_longitude   200000 non-null  float64
 5   pickup_latitude    200000 non-null  float64
 6   dropoff_longitude  199999 non-null  float64
 7   dropoff_latitude   199999 non-null  float64
 8   passenger_count    200000 non-null  int64  
dtypes: float64(5), int64(2), object(2)
memory usage: 13.7+ MB


In [6]:
df.columns

Index(['Unnamed: 0', 'key', 'fare_amount', 'pickup_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'passenger_count'],
      dtype='object')

In [7]:
df.shape

(200000, 9)

In [8]:
df.dtypes

Unnamed: 0             int64
key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

### Filling Missing Value

In [9]:
df.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    1
dropoff_latitude     1
passenger_count      0
dtype: int64

In [10]:
df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace = True)
df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(), inplace=True)

In [11]:
df.isnull().sum()

Unnamed: 0           0
key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [12]:
df = df.drop(['Unnamed: 0', 'key'], axis= 1) 

In [13]:
df.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,7.5,2015-05-07 19:52:06 UTC,-73.999817,40.738354,-73.999512,40.723217,1
1,7.7,2009-07-17 20:04:56 UTC,-73.994355,40.728225,-73.99471,40.750325,1
2,12.9,2009-08-24 21:45:00 UTC,-74.005043,40.74077,-73.962565,40.772647,1
3,5.3,2009-06-26 08:22:21 UTC,-73.976124,40.790844,-73.965316,40.803349,3
4,16.0,2014-08-28 17:47:00 UTC,-73.925023,40.744085,-73.973082,40.761247,5


### Column pickup_datetime is in wrong format (Object). Convert it to DateTime Format


In [14]:
# Assuming df is your DataFrame and 'pickup_datetime' is your column
df.pickup_datetime = pd.to_datetime(df.pickup_datetime, errors='coerce')

In [15]:
## Separating the date and time into separate columns for more usability
df= df.assign(
            second = df.pickup_datetime.dt.second,
            minute = df.pickup_datetime.dt.minute,
            hour = df.pickup_datetime.dt.hour,
            day= df.pickup_datetime.dt.day,
            month = df.pickup_datetime.dt.month,
            year = df.pickup_datetime.dt.year,
            dayofweek = df.pickup_datetime.dt.dayofweek
            )
df = df.drop('pickup_datetime',axis=1)

In [16]:
df.dtypes

fare_amount          float64
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
second                 int32
minute                 int32
hour                   int32
day                    int32
month                  int32
year                   int32
dayofweek              int32
dtype: object

In [17]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,second,minute,hour,day,month,year,dayofweek
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1,6,52,19,7,5,2015,3
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1,56,4,20,17,7,2009,4
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1,0,45,21,24,8,2009,0
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3,21,22,8,26,6,2009,4
4,16.0,-73.925023,40.744085,-73.973082,40.761247,5,0,47,17,28,8,2014,3


### Checking outliers and flling them

In [1]:
df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))

NameError: name 'df' is not defined

In [2]:
import numpy as np

def remove_outlier(df1 , col):
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1-1.5*IQR
    upper_whisker = Q3+1.5*IQR
    df1[col] = np.clip(df1[col] , lower_whisker , upper_whisker)
    return df1

def treat_outliers_all(df1 , col_list):
    for c in col_list:
        df1 = remove_outlier(df1 , c)
    return df1

# Assuming that 'df' is your DataFrame and 'df.columns' is the list of columns in your DataFrame
df = treat_outliers_all(df , df.columns)

# Plotting boxplots for all columns in the DataFrame
df.plot(kind = "box", subplots = True, layout = (7,2), figsize=(15,20))

NameError: name 'df' is not defined

### Function to find Corrrlation

In [20]:
corr = df.corr()

corr.style.background_gradient(cmap='BuGn')

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,second,minute,hour,day,month,year,dayofweek
fare_amount,1.0,0.154069,-0.110842,0.218675,-0.125898,0.015778,0.002976,-0.009246,-0.023623,0.004534,0.030817,0.141277,0.013652
pickup_longitude,0.154069,1.0,0.259497,0.425619,0.07329,-0.013213,-0.013308,-0.005792,0.011579,-0.003204,0.001169,0.010198,-0.024652
pickup_latitude,-0.110842,0.259497,1.0,0.048889,0.515714,-0.012889,0.006061,-0.003454,0.029681,-0.001553,0.001562,-0.014243,-0.04231
dropoff_longitude,0.218675,0.425619,0.048889,1.0,0.245667,-0.009303,-0.005254,-0.004847,-0.046558,-0.004007,0.002391,0.011346,-0.003336
dropoff_latitude,-0.125898,0.07329,0.515714,0.245667,1.0,-0.006308,0.005151,-0.003175,0.019783,-0.003479,-0.001193,-0.009603,-0.031919
passenger_count,0.015778,-0.013213,-0.012889,-0.009303,-0.006308,1.0,-0.194292,0.001115,0.020274,0.002712,0.010351,-0.009749,0.04855
second,0.002976,-0.013308,0.006061,-0.005254,0.005151,-0.194292,1.0,0.001987,-0.01324,-0.002107,-0.049937,0.083345,-0.000136
minute,-0.009246,-0.005792,-0.003454,-0.004847,-0.003175,0.001115,0.001987,1.0,0.001138,-0.001217,-0.001485,-0.002805,-0.002328
hour,-0.023623,0.011579,0.029681,-0.046558,0.019783,0.020274,-0.01324,0.001138,1.0,0.004677,-0.003926,0.002156,-0.086947
day,0.004534,-0.003204,-0.001553,-0.004007,-0.003479,0.002712,-0.002107,-0.001217,0.004677,1.0,-0.01736,-0.01217,0.005617


In [21]:
pip install haversine

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.3 -> 23.2.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [22]:
import haversine as hs

# Initialize an empty list to store travel distances
travel_dist = []

# Iterate through the DataFrame rows to calculate distances
for pos in range(len(df['pickup_longitude'])):
    long1, lati1, long2, lati2 = df['pickup_longitude'][pos], df['pickup_latitude'][pos], df['dropoff_longitude'][pos], df['dropoff_latitude'][pos]
    loc1 = (lati1, long1)
    loc2 = (lati2, long2)
    c = hs.haversine(loc1, loc2)
    travel_dist.append(c)

# Assign the calculated distances to a new column in the DataFrame
df['dist_travel_km'] = travel_dist


In [23]:
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,second,minute,hour,day,month,year,dayofweek,dist_travel_km
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1.0,6,52,19,7,5,2015,3,1.683325
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1.0,56,4,20,17,7,2009,4,2.457593
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1.0,0,45,21,24,8,2009,0,5.036384
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3.0,21,22,8,26,6,2009,4,1.661686
4,16.0,-73.929786,40.744085,-73.973082,40.761247,3.5,0,47,17,28,8,2014,3,4.116088


In [24]:
#Finding inccorect latitude (Less than or greater than 90) and longitude (greater than or less than 90)
incorrect_coordinates = df.loc[
(df.pickup_latitude > 90) |(df.pickup_latitude < -90) |
(df.dropoff_latitude > 90) |(df.dropoff_latitude < -90) |
(df.pickup_longitude > 180) |(df.pickup_longitude < -180) |
(df.dropoff_longitude > 90) |(df.dropoff_longitude < -90)
]
df.drop(incorrect_coordinates, inplace = True, errors = 'ignore')


In [25]:
df.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,second,minute,hour,day,month,year,dayofweek,dist_travel_km
0,7.5,-73.999817,40.738354,-73.999512,40.723217,1.0,6,52,19,7,5,2015,3,1.683325
1,7.7,-73.994355,40.728225,-73.99471,40.750325,1.0,56,4,20,17,7,2009,4,2.457593
2,12.9,-74.005043,40.74077,-73.962565,40.772647,1.0,0,45,21,24,8,2009,0,5.036384
3,5.3,-73.976124,40.790844,-73.965316,40.803349,3.0,21,22,8,26,6,2009,4,1.661686
4,16.0,-73.929786,40.744085,-73.973082,40.761247,3.5,0,47,17,28,8,2014,3,4.116088


### Dividing the dataset into features and target values

In [26]:
x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude']]
y = df['fare_amount']

### Dividing the dataset into training and testing 

In [27]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)


### Linear Regression

In [28]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

In [29]:
regression.fit(X_train,y_train)

In [30]:
regression.intercept_ #To find the linear intercept

7370.065363771459

In [31]:
regression.coef_ #To find the linear coeeficient

array([ 23.58939362, -13.61712975,  52.58660413, -28.70731194])

In [32]:
prediction = regression.predict(X_test) 
print(prediction)

[16.88087504  8.79396249  8.46765019 ...  8.61631542  9.21361182
  9.05284756]


In [33]:
y_test

116024     8.1
177432    11.5
82705     10.1
110434     8.5
142762    11.0
          ... 
84468      4.9
5668       8.9
110316     8.5
51960      5.7
185567    17.8
Name: fare_amount, Length: 66000, dtype: float64

### Metrics Evaluation using R2, Mean Squared Error, Root Mean Sqared Error

In [34]:
from sklearn.metrics import r2_score
r2_score (y_test,prediction)

0.08913712629724835

In [35]:
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test,prediction)
MSE

26.86261611663543

In [36]:
# The equation RMSE = np.sqrt(MSE) is used to calculate the Root Mean Square Error (RMSE) from the Mean Squared Error (MSE).
RMSE = np.sqrt(MSE)
RMSE

5.182915792933108

### Random Forest Regression

In [37]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_estimators=100)
rf.fit(X_train,y_train)

In [38]:
y_pred = rf.predict(X_test)
y_pred

array([10.57745445, 11.149     , 11.752     , ...,  8.843     ,
        9.314     , 17.837     ])

In [40]:
# metric evaluation for random forest
R2_Random = r2_score(y_test,y_pred)
print(R2_Random)

MSE_Random = mean_squared_error(y_test,y_pred)
print(MSE_Random)

RMSE_Random = np.sqrt(MSE_Random)
print(RMSE_Random)



0.7606985971387621
7.057332016511124
2.656563949260609
