# Ride Price prediction project


In [None]:
import kagglehub
import os
from matplotlib import pyplot as plt
import seaborn as sns

# Fetch the Uber/Lyft cab price dataset through kagglehub; `path` holds the
# location returned by the download call and is reused by later cells.
path = kagglehub.dataset_download("ravi72munde/uber-lyft-cab-prices")

# Show which files were delivered, then where they live.
dataset_files = os.listdir(path)
print(dataset_files)
print("Path to dataset files:", path)

In [None]:
import pandas as pd
# Read the ride records from the downloaded dataset directory.
rides_csv = os.path.join(path, "cab_rides.csv")
df = pd.read_csv(rides_csv)

# Summary statistics for the numeric columns (displayed as the cell output).
df.describe()

In [None]:
# Count the missing entries in each column (displayed as the cell output).
missing_per_column = df.isna().sum()
missing_per_column

# Data Preprocessing

1. Add a datetime column derived from the timestamp for easier processing.
2. Extract the hour of the day (to determine peak hours) and the day of the week.

In [None]:
# Parse the epoch-millisecond timestamps once and derive the time features
# from the parsed values.
parsed_timestamps = pd.to_datetime(df["time_stamp"], unit="ms")

df["date_time"] = parsed_timestamps
df["hour_of_day"] = parsed_timestamps.dt.hour
# pandas numbers the days of the week with Monday as 0 and Sunday as 6.
df["day"] = parsed_timestamps.dt.day_of_week

# Preview the first rows with the new columns (displayed as the cell output).
df.head()

## Understanding the data

## Trips per hour

In [None]:
# Number of rides recorded in each hour of the day.
trips_per_hour = df.groupby("hour_of_day").size()

# Draw the counts as a bar chart and decorate the axes it was drawn on.
ax = trips_per_hour.plot.bar(color="skyblue", edgecolor="black", figsize=(10, 4))
ax.set_title("Number of Trips per Hour")
ax.set_xlabel("Hour of Day")
ax.set_ylabel("Number of Trips")

# Keep the hour labels horizontal and add dashed horizontal guide lines.
plt.xticks(rotation=0)
ax.grid(axis="y", linestyle="--")
plt.show()



## Distance vs price

In [None]:


# Scatter plot of ride price against ride distance.
fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(df["distance"], df["price"])
ax.set_title('Distance vs price')
ax.set_xlabel("Distance")
ax.set_ylabel("Price")
plt.show()

## Trips per day of the week

In [None]:


# Bar chart of how many rides fall on each value of the 'day' column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x="day", ax=ax)
ax.set_title("Number of trips per day of the week")
plt.show()

# Cleaning

## Handling missing values

In [None]:
from sklearn.impute import SimpleImputer
# Replace missing prices with the median of the observed prices.
# NOTE(review): 'price' looks like the prediction target for this project;
# imputing a target is unusual — confirm this is preferred over dropping rows.
price_imputer = SimpleImputer(strategy="median")
df[['price']] = price_imputer.fit_transform(df[['price']])

# Remaining missing prices after imputation (displayed as the cell output).
df[['price']].isna().sum()

## Outliers

In [None]:
# Trim the extreme upper tail of distance and then of price.
# Rows are kept only when strictly below the quantile cutoff. The price
# cutoff is computed after the distance filter has been applied, so the two
# steps are sequential rather than independent.
OUTLIER_QUANTILE = 0.995

df = df[df["distance"] < df["distance"].quantile(OUTLIER_QUANTILE)]
# .copy() yields an independent frame, so adding columns to `df` afterwards
# does not trigger pandas' SettingWithCopyWarning.
df = df[df["price"] < df["price"].quantile(OUTLIER_QUANTILE)].copy()

## Feature Engineering

In [None]:
# Derive calendar and time-of-day indicator features from the columns created
# in the preprocessing step ('date_time' and 'hour_of_day').
# NOTE(review): 'date_time' is parsed from epoch milliseconds with no timezone
# conversion in the visible code, so these hours/days are presumably UTC
# rather than local time — confirm that suits the rush-hour windows.
WEEKEND_DAYS = [5, 6]  # pandas numbers Monday as 0, so 5/6 are Saturday/Sunday
RUSH_HOURS = [7, 8, 9, 16, 17, 18]

# The parsed timestamp column is named 'date_time'; reading 'datetime' raised
# a KeyError because no such column is created.
df["day_of_week"] = df["date_time"].dt.dayofweek
df["is_weekend"] = df["day_of_week"].isin(WEEKEND_DAYS).astype(int)
# The hour feature is stored as 'hour_of_day'; no 'hour' column is created in
# the visible preprocessing code.
df["is_rush_hour"] = df["hour_of_day"].isin(RUSH_HOURS).astype(int)