In [796]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import geopy.distance
import haversine as hs   
from haversine import Unit
import matplotlib.pyplot as plt

In [797]:
# Load the dataset
df = pd.read_csv('Uber.csv')


In [798]:
#Remove unnecessary column
df.drop(['key','Unnamed: 0'], axis=1, inplace=True)
df.dropna(axis=0,inplace=True)

In [799]:
# Check for missing values
print("\nMissing values in the dataset:")
print(df.isnull().sum())


Missing values in the dataset:
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64


In [800]:
condition = df[
    (df['pickup_longitude'] < -180) | (df['pickup_longitude'] > 180) |
    (df['pickup_latitude'] < -90) | (df['pickup_latitude'] > 90) |
    (df['dropoff_longitude'] < -180) | (df['dropoff_longitude'] > 180) |
    (df['dropoff_latitude'] < -90) | (df['dropoff_latitude'] > 90)
].index
df.drop(condition, inplace=True)

In [801]:
# Display the first few rows of the dataset
print("First few rows of the dataset:")
print(df.head())

# Display the columns of the dataset
print("\nColumns of the dataset:")
print(df.columns)

First few rows of the dataset:
   fare_amount          pickup_datetime  pickup_longitude  pickup_latitude  \
0          7.5  2015-05-07 19:52:06 UTC        -73.999817        40.738354   
1          7.7  2009-07-17 20:04:56 UTC        -73.994355        40.728225   
2         12.9  2009-08-24 21:45:00 UTC        -74.005043        40.740770   
3          5.3  2009-06-26 08:22:21 UTC        -73.976124        40.790844   
4         16.0  2014-08-28 17:47:00 UTC        -73.925023        40.744085   

   dropoff_longitude  dropoff_latitude  passenger_count  
0         -73.999512         40.723217                1  
1         -73.994710         40.750325                1  
2         -73.962565         40.772647                1  
3         -73.965316         40.803349                3  
4         -73.973082         40.761247                5  

Columns of the dataset:
Index(['fare_amount', 'pickup_datetime', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude',

In [802]:
latitude, longitude = [], []

longitude.append(df["pickup_longitude"].tolist())
longitude.append(df["dropoff_longitude"].tolist())
latitude.append(df["pickup_latitude"].tolist())
latitude.append(df["dropoff_latitude"].tolist())

#Find the distance in meters
distance = [hs.haversine((latitude[0][i], longitude[0][i]), (latitude[1][i], longitude[1][i]), unit=Unit.METERS) for i in range(len(longitude[0]))]


In [803]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [804]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

df['pickup_day'] = df['pickup_datetime'].dt.day
df['pickup_month'] = df['pickup_datetime'].dt.month
df['pickup_hour'] = df['pickup_datetime'].dt.hour
df['pickup_minute'] = df['pickup_datetime'].dt.minute

In [805]:
df.drop(["pickup_datetime"], axis=1, inplace=True)
distance = pd.DataFrame({"distance": distance})
day = pd.DataFrame({"day": df['pickup_day']})
month = pd.DataFrame({"month": df['pickup_month']})
hour = pd.DataFrame({"hour": df['pickup_hour']})
minute = pd.DataFrame({"minute": df['pickup_minute']})

In [806]:
columns_extra = ["distance", "day", "month", "hour", "minute"]
df2 = pd.concat([distance, day, month, hour, minute], axis=1, ignore_index=True)
df2.columns = columns_extra
print(df2.head())

   distance   day  month  hour  minute
0  1.683325   7.0    5.0  19.0    52.0
1  2.457593  17.0    7.0  20.0     4.0
2  5.036384  24.0    8.0  21.0    45.0
3  1.661686  26.0    6.0   8.0    22.0
4  4.475456  28.0    8.0  17.0    47.0


In [807]:
columns = df.columns
df = df.reset_index(drop=True)
df.columns = columns
dataframe = pd.concat([df, df2], axis=1)
print(dataframe.head())

   fare_amount  pickup_longitude  pickup_latitude  dropoff_longitude  \
0          7.5        -73.999817        40.738354         -73.999512   
1          7.7        -73.994355        40.728225         -73.994710   
2         12.9        -74.005043        40.740770         -73.962565   
3          5.3        -73.976124        40.790844         -73.965316   
4         16.0        -73.925023        40.744085         -73.973082   

   dropoff_latitude  passenger_count  pickup_day  pickup_month  pickup_hour  \
0         40.723217              1.0         7.0           5.0         19.0   
1         40.750325              1.0        17.0           7.0         20.0   
2         40.772647              1.0        24.0           8.0         21.0   
3         40.803349              3.0        26.0           6.0          8.0   
4         40.761247              5.0        28.0           8.0         17.0   

   pickup_minute  distance   day  month  hour  minute  
0           52.0  1.683325   7.0    

In [808]:
#create x and y sets
Y = df["fare_amount"]
X = df.drop(["fare_amount"], axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [809]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Initialize and train the Linear Regression model
lr_regressor = LinearRegression()
lr_regressor.fit(x_train, y_train)

# Predict fare amounts for the test set
y_pred = lr_regressor.predict(x_test)

# Calculate regression metrics
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)
# Print regression metrics
print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared:", r2)
print(mse, " ", rmse)


Mean Squared Error: 91.64726651311395
Mean Absolute Error: 5.9938316093351105
R-squared: 0.0012664415953705932
91.64726651311395   9.573257883976277
