# Predict the price of an Uber ride from a given pickup
point to the agreed drop-off location. Perform the following
tasks:
1. Pre-process the dataset.
2. Identify outliers.
3. Check the correlation.
4. Implement linear regression and random forest regression models.
5. Evaluate the models and compare their respective scores, such as R2, RMSE, etc.

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn import preprocessing

In [None]:
# Load the ride records from uber.csv (path is relative to the working directory).
df  = pd.read_csv("uber.csv")




















### 1. Pre-process the dataset.

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.columns

In [None]:
df.shape

In [None]:
df.dtypes

### Filling Missing Values

In [None]:
df.isnull().sum()

In [None]:
# Impute the missing drop-off coordinates. The filled Series is assigned back
# to the column instead of calling fillna(inplace=True) on a column selection:
# that chained in-place form emits a FutureWarning in pandas 2.x and no longer
# updates df once copy-on-write is the default.
# NOTE(review): latitude is filled with the mean and longitude with the median,
# as before -- confirm the asymmetry is intentional.
df['dropoff_latitude'] = df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean())
df['dropoff_longitude'] = df['dropoff_longitude'].fillna(df['dropoff_longitude'].median())

In [None]:
df.isnull().sum()

In [None]:
# Discard the exported row counter and the record key columns.
df = df.drop(columns=['Unnamed: 0', 'key'])

In [None]:
df.head()

### Column pickup_datetime is stored as a string (object dtype). Convert it to datetime format


In [None]:
# Parse the pickup timestamps; errors='coerce' turns values that cannot be
# parsed into NaT instead of raising.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')

In [None]:
# Expand the pickup timestamp into separate clock and calendar columns, then
# discard the original datetime column in the same chain.
df = df.assign(
    second=df['pickup_datetime'].dt.second,
    minute=df['pickup_datetime'].dt.minute,
    hour=df['pickup_datetime'].dt.hour,
    day=df['pickup_datetime'].dt.day,
    month=df['pickup_datetime'].dt.month,
    year=df['pickup_datetime'].dt.year,
    dayofweek=df['pickup_datetime'].dt.dayofweek,
).drop(columns='pickup_datetime')

In [None]:
df.dtypes

In [None]:
df.head()

### Checking for outliers and capping them

In [None]:
df.plot(kind = "box",subplots = True,layout = (7,2),figsize=(15,20))

In [None]:
import numpy as np

def remove_outlier(df1, col, whisker_factor=1.5):
    """Cap one column of ``df1`` at its IQR whiskers.

    Despite the name, no rows are removed: values below
    ``Q1 - whisker_factor * IQR`` are raised to that bound and values above
    ``Q3 + whisker_factor * IQR`` are lowered to that bound.

    Args:
        df1: DataFrame containing ``col``. The column is overwritten in place.
        col: Label of the column to cap.
        whisker_factor: Multiple of the inter-quartile range used for the
            bounds. Defaults to 1.5, the value that was previously hard-coded.

    Returns:
        The same ``df1`` object that was passed in, with ``col`` capped.
    """
    first_quartile = df1[col].quantile(0.25)
    third_quartile = df1[col].quantile(0.75)
    iqr = third_quartile - first_quartile
    lower_whisker = first_quartile - whisker_factor * iqr
    upper_whisker = third_quartile + whisker_factor * iqr
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1

def treat_outliers_all(df1, col_list):
    """Run ``remove_outlier`` over each column label in ``col_list``.

    Args:
        df1: DataFrame to process; it is handed to ``remove_outlier`` once
            per column.
        col_list: Iterable of column labels to process, in order.

    Returns:
        Whatever the last ``remove_outlier`` call returned, or ``df1``
        itself when ``col_list`` is empty.
    """
    capped = df1
    for column in col_list:
        capped = remove_outlier(capped, column)
    return capped

# Cap every column of df at its IQR whiskers. df.columns is passed whole, so
# this includes the target fare_amount and the derived time columns.
df = treat_outliers_all(df , df.columns)

# Redraw the per-column box plots to inspect the capped distributions.
df.plot(kind = "box", subplots = True, layout = (7,2), figsize=(15,20))

### Computing the Correlation Matrix

In [None]:
# Pairwise correlation between all columns of df (DataFrame.corr defaults).
corr = df.corr()

# Render the matrix as a table shaded with the 'BuGn' colour map.
corr.style.background_gradient(cmap='BuGn')

In [None]:
pip install haversine

In [None]:
import haversine as hs

# Haversine distance between pickup and drop-off for every row. The columns
# are walked in lockstep with zip() rather than looked up by label via
# df[col][pos]: the label lookup only works while df still has its default
# 0..n-1 index and is much slower on large frames.
# hs.haversine expects each point as (latitude, longitude).
travel_dist = [
    hs.haversine((pickup_lat, pickup_lon), (dropoff_lat, dropoff_lon))
    for pickup_lon, pickup_lat, dropoff_lon, dropoff_lat in zip(
        df['pickup_longitude'],
        df['pickup_latitude'],
        df['dropoff_longitude'],
        df['dropoff_latitude'],
    )
]

# Store the distances as a new column of the DataFrame.
df['dist_travel_km'] = travel_dist


In [None]:
# Display the first few rows of the DataFrame
df.head()

In [None]:
# Find rows with impossible coordinates: latitude outside [-90, 90] or
# longitude outside [-180, 180]. The drop-off longitude was previously
# checked against +/-90, which disagreed with the pickup longitude check.
incorrect_coordinates = df.loc[
(df.pickup_latitude > 90) | (df.pickup_latitude < -90) |
(df.dropoff_latitude > 90) | (df.dropoff_latitude < -90) |
(df.pickup_longitude > 180) | (df.pickup_longitude < -180) |
(df.dropoff_longitude > 180) | (df.dropoff_longitude < -180)
]
# Drop by row label. Passing the DataFrame itself made drop() look for its
# column names among the row labels, and errors='ignore' hid the miss, so no
# row was ever removed.
# NOTE(review): this check runs after the outlier capping and the distance
# computation above; consider moving it before them.
df.drop(incorrect_coordinates.index, inplace=True)


In [None]:
df.head()

### Dividing the dataset into features and target values

In [None]:
# Model inputs are the four raw pickup/drop-off coordinates; the target is the fare.
x = df.loc[:, ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']]
y = df.fare_amount

### Dividing the dataset into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. random_state is fixed so the same
# rows land in the test set on every run; without it each run scored the
# models on a different split and the reported metrics were not reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.33, random_state=42
)


### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()

In [None]:
regression.fit(X_train,y_train)

In [None]:
regression.intercept_ #To find the linear intercept

In [None]:
regression.coef_ # Show the fitted linear coefficients

In [None]:
prediction = regression.predict(X_test) 
print(prediction)

In [None]:
y_test

### Metrics Evaluation using R2, Mean Squared Error, Root Mean Squared Error

In [None]:
from sklearn.metrics import r2_score
# R2 of the linear regression predictions against the held-out test targets.
r2_score (y_test,prediction)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error of the linear regression predictions on the test split.
MSE = mean_squared_error(y_test,prediction)
MSE

In [None]:
# RMSE is the square root of the MSE, which puts the error back in the same
# units as the target.
RMSE = np.sqrt(MSE)
RMSE

### Random Forest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest on the training split. random_state is fixed
# so that, for a given training split, the same forest is grown on every run;
# without it the reported scores changed from run to run.
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
y_pred = rf.predict(X_test)
y_pred

In [None]:
# Score the random forest on the held-out split: R2, MSE, then RMSE
# (the square root of the MSE).
R2_Random = r2_score(y_test, y_pred)
MSE_Random = mean_squared_error(y_test, y_pred)
RMSE_Random = np.sqrt(MSE_Random)

# One value per line, in the order R2, MSE, RMSE.
print(R2_Random, MSE_Random, RMSE_Random, sep='\n')

