# 🚖 Uber Fare Prediction using Machine Learning
This notebook predicts the price of an Uber ride using the provided dataset (`uber.csv`) and the Random Forest Regressor algorithm.

In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:

# Read the ride records from the CSV file located next to the notebook.
df = pd.read_csv(filepath_or_buffer="uber.csv")

# Report (rows, columns) and preview the first rows as a sanity check on the load.
print("Dataset Shape:", df.shape)
df.head(n=5)


In [None]:

# Data Cleaning
# Build a new frame instead of mutating in place:
#   * errors='ignore' lets this cell be re-run without a KeyError once the
#     bookkeeping columns are already gone;
#   * the result is an independent frame, so columns added later are not
#     written into a slice of the loaded data (avoids SettingWithCopyWarning).
df = df.drop(columns=['Unnamed: 0', 'key'], errors='ignore').dropna()

# Keep only plausible rides: a positive fare, 1-6 passengers, and coordinates
# that are valid degrees. The distance feature computed later from these
# coordinates is meaningless for latitudes/longitudes outside those ranges.
valid_rows = (
    (df['fare_amount'] > 0)
    & (df['passenger_count'] > 0)
    & (df['passenger_count'] <= 6)
    & df['pickup_latitude'].between(-90, 90)
    & df['dropoff_latitude'].between(-90, 90)
    & df['pickup_longitude'].between(-180, 180)
    & df['dropoff_longitude'].between(-180, 180)
)
df = df.loc[valid_rows].reset_index(drop=True)

print("Cleaned Data Shape:", df.shape)
df.head()


In [None]:

# Feature Engineering
# errors='coerce' turns any unparseable timestamp into NaT. Those rows would
# carry NaN into every calendar feature below (and on into the model input),
# so they are dropped immediately after parsing.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')
df = df.dropna(subset=['pickup_datetime']).reset_index(drop=True)

# Calendar components of the pickup time, used as model features.
pickup_time = df['pickup_datetime'].dt
df['hour'] = pickup_time.hour
df['day'] = pickup_time.day
df['month'] = pickup_time.month
df['year'] = pickup_time.year
df['day_of_week'] = pickup_time.dayofweek  # pandas convention: Monday=0 ... Sunday=6

def haversine(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Return the great-circle distance between two points using the haversine formula.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in degrees.
    radius_km : float, default 6371.0
        Sphere radius; the result is expressed in the same unit
        (the default gives kilometres on a mean-radius Earth).

    Returns
    -------
    float or array-like
        Distance along the sphere's surface. Inputs are combined element-wise,
        so scalars, NumPy arrays and pandas Series are all accepted.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Floating-point rounding can push `a` marginally above 1 for (near-)antipodal
    # points, which would make arcsin return NaN. Cap it at its mathematical
    # maximum; NaN inputs still propagate as NaN.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius_km * c

# Straight-line (great-circle) trip length from the pickup/dropoff coordinates.
pickup_lat, pickup_lon = df['pickup_latitude'], df['pickup_longitude']
dropoff_lat, dropoff_lon = df['dropoff_latitude'], df['dropoff_longitude']
df['distance_km'] = haversine(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon)

# The timestamp column has already been expanded into calendar features above,
# so it is removed from the frame here.
df.drop(columns=['pickup_datetime'], inplace=True)
df.head()


In [None]:

# Model inputs: raw coordinates, passenger count, calendar parts and trip distance.
feature_columns = [
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
    'passenger_count', 'hour', 'day', 'month', 'year', 'day_of_week', 'distance_km',
]
X = df[feature_columns]
# Regression target.
y = df['fare_amount']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:

# Fit a 100-tree random forest on the training split.
# n_jobs=-1 builds the trees on all available CPU cores; the fitted forest is
# still determined by random_state, so only the wall-clock time changes.
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)

# Predictions on the held-out rows, scored in the evaluation cell.
y_pred = model.predict(X_test)


In [None]:

# Score the held-out predictions.
# RMSE is in the same unit as fare_amount; R² is the fraction of variance explained.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")
# The label previously contained a mis-encoded superscript ("RÂ²"); it is restored to "R²".
print(f"R² Score: {r2:.3f}")


In [None]:

# Rank the input columns by the fitted forest's feature_importances_, largest first.
importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)

# Explicit figure/axes instead of the implicit pyplot state, with labelled axes
# so the chart can be read on its own.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=importances, y=importances.index, ax=ax)
ax.set_title("Feature Importance in Fare Prediction")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")
plt.show()


In [None]:

# A single hypothetical ride described with the same raw fields as the training data.
new_data = pd.DataFrame({
    'pickup_longitude': [-73.985130],
    'pickup_latitude': [40.758896],
    'dropoff_longitude': [-73.778139],
    'dropoff_latitude': [40.641311],
    'passenger_count': [1],
    'hour': [15],
    'day': [9],
    'month': [11],
    'year': [2025],
    'day_of_week': [6],  # 2025-11-09 is a Sunday, which is 6 when Monday=0
})

# distance_km must be derived from the coordinates exactly as it was for the
# training rows. It was previously hard-coded to 40.758896 (a copy of the pickup
# latitude), which fed the model a wrong trip length.
new_data['distance_km'] = haversine(
    new_data['pickup_latitude'], new_data['pickup_longitude'],
    new_data['dropoff_latitude'], new_data['dropoff_longitude'],
)

# Present the columns in the same order the model was fitted on.
new_data = new_data[list(X.columns)]

predicted_fare = model.predict(new_data)[0]
print(f"Predicted Uber Fare: ${predicted_fare:.2f}")
