In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [2]:
# Load the cleaned ride data. The path is relative, so the CSV must sit in the
# kernel's working directory or pandas raises FileNotFoundError.
DATA_FILE = "Uber Cleaned Dataset.csv"
df = pd.read_csv(DATA_FILE)

# Preview the first rows.
df.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Uber Cleaned Dataset.csv'

In [None]:
# Column dtypes and non-null counts for the loaded frame.
df.info()

In [None]:
# Summary statistics for the frame's columns.
df.describe()

In [None]:
# Missing-value count per column.
df.isna().sum()

## Understanding the Target Variable — Fare

In this step, we aim to understand the distribution and behavior of the target variable `fare`. This includes:

- Plotting the histogram and boxplot to visually assess the distribution and detect potential outliers.
- Calculating key summary statistics such as mean, median, mode, and Interquartile Range (IQR).
- Checking skewness to understand the shape of the distribution (e.g., right-skewed or left-skewed).

In [None]:
fare = df["fare"]

# Histogram with a KDE overlay: overall shape of the fare distribution.
fig, ax = plt.subplots(figsize=(12, 5))
sns.histplot(fare, bins=50, kde=True, color="skyblue", ax=ax)
ax.set_title("Histogram of Fare")
ax.set_xlabel("Fare")
ax.set_ylabel("Frequency")
plt.show()

# Boxplot: spread of fares and potential outliers.
fig, ax = plt.subplots(figsize=(10, 2))
sns.boxplot(data=df, x="fare", color="salmon", ax=ax)
ax.set_title("Boxplot of Fare")
ax.set_xlabel("Fare")
plt.show()

# Central tendency and spread.
fare_mean, fare_median, fare_std = fare.mean(), fare.median(), fare.std()
# mode() may return several values; keep the first one.
fare_mode = fare.mode().iloc[0]

# Quartiles and interquartile range.
Q1, Q3 = fare.quantile([0.25, 0.75])
IQR = Q3 - Q1

# Asymmetry of the distribution (positive means a longer right tail).
fare_skewness = fare.skew()

print(f"Mean Fare: {fare_mean:.2f}")
print(f"Median Fare: {fare_median:.2f}")
print(f"Mode Fare: {fare_mode:.2f}")
print(f"Standard Deviation: {fare_std:.2f}")
print(f"Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
print(f"Skewness: {fare_skewness:.2f}")


## Step 3: Univariate Analysis of Features

In this step, we explore the distribution and characteristics of each individual feature in the dataset. The goal is to understand the data types, identify potential anomalies, and gain insights into the behavior of each variable.

We'll examine:
- **Numerical features** such as distance, temperature, humidity, etc.
- **Categorical features** such as vehicle type, source, destination, weather description, etc.






In [None]:
# Numerical predictors; the target 'fare' is deliberately left out.
num_cols = [
    'distance',
    'temperature_C',
    'humidity',
    'pressure',
    'wind_speed',
    'visibility',
    'cloud_cover',
    'uv_index',
    'surge_multiplier',
]
numeric_features = df[num_cols]

# One histogram panel per numerical feature.
numeric_features.hist(bins=30, figsize=(15, 10), color='skyblue', edgecolor='black')
plt.suptitle("Distribution of Numerical Features", fontsize=16)
plt.tight_layout()
plt.show()

# Summary statistics for the same columns.
numeric_features.describe()


In [None]:
# Categorical columns to profile.
cat_cols = ['vehicle', 'source', 'destination', 'weather_desc_short', 'weather_status']

# One count plot per column, bars ordered from most to least frequent category.
for col in cat_cols:
    categories_by_count = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 4))
    sns.countplot(data=df, x=col, order=categories_by_count, palette="viridis", ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


## **Bivariate Analysis**

## **Fare vs Distance**

In this step, we analyze the relationship between the target variable `fare` and other features to uncover potential patterns and associations.

We'll explore:
- Correlation between `fare` and numerical variables like `distance`, `temperature`, etc.
- How `fare` varies across different categories such as `vehicle type`, `source`, and `weather`.


In [None]:
# Pairwise correlations between numeric columns; keep the 'fare' column,
# strongest positive correlation first.
corr_matrix = df.corr(numeric_only=True)
corr_with_fare = corr_matrix["fare"].sort_values(ascending=False)
print("Correlation with Fare:\n", corr_with_fare)

# Fare against distance, one semi-transparent point per ride.
fig, ax = plt.subplots(figsize=(10, 5))
sns.scatterplot(data=df, x="distance", y="fare", alpha=0.5, ax=ax)
ax.set_title("Distance vs Fare")
plt.show()

To understand the relationship between fare and distance, we plotted distance against fare and calculated the correlation of fare with the other numerical features. No feature other than distance shows a meaningful correlation with fare. The overall fare–distance correlation coefficient is 0.33, indicating a weak-to-moderate linear relationship. This suggests that distance alone does not fully explain fare variations.
Several other factors influence the final fare, including the ride company and the type of vehicle used. For example, luxury vehicles typically charge higher rates for the same distance compared to standard vehicles. Therefore, it's important to consider these variables when analyzing fare patterns.

In [None]:
# Fare/distance correlation matrix for each ride company.
# The subset is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
for company in ("Lyft", "Uber"):
    company_rides = df.loc[df["ride_company"] == company, ["fare", "distance"]]
    print(f"{company}:\n {company_rides.corr()}")

The ride company makes little difference to the fare–distance correlation.

In [None]:
# Distinct vehicle categories present in the data.
df["vehicle"].unique()

In [None]:
# Fare/distance correlation within each vehicle type, rounded to 2 decimals.
# The lookup is done outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
for vehicle in df["vehicle"].unique():
    vehicle_rides = df.loc[df["vehicle"] == vehicle, ["fare", "distance"]]
    fare_distance_corr = vehicle_rides.corr().loc["distance", "fare"]
    print(f"{vehicle}: {fare_distance_corr.round(2)}")

All vehicle types show a strong positive correlation between fare and distance.

In [None]:
# Median fare per vehicle type, highest first; this fixes the box order below.
vehicle_fares = df.groupby("vehicle")["fare"]
median_fares = vehicle_fares.median().sort_values(ascending=False)

# Fare distribution per vehicle type, ordered by median fare.
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(data=df, x="vehicle", y="fare", order=median_fares.index, ax=ax)
ax.set_title("Fare Distribution by Vehicle Type (Sorted by Median Fare)")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Descriptive fare statistics per vehicle type: one column per vehicle,
# columns ordered by mean fare (highest first).
fare_by_vehicle = df.groupby("vehicle")["fare"]
vehicles_by_mean_fare = fare_by_vehicle.mean().sort_values(ascending=False).index
fare_stats = fare_by_vehicle.describe().T
fare_stats = fare_stats.loc[:, vehicles_by_mean_fare]

fare_stats

High end vehicles have higher min and mean fare.

## **Fare vs Destination & Source**

In [None]:
# One fare boxplot per location column (pickup, then drop-off), with boxes
# ordered by mean fare, highest first.
for location_col, location_label in (("source", "Source"), ("destination", "Destination")):
    order = df.groupby(location_col)["fare"].mean().sort_values(ascending=False).index
    plt.figure(figsize=(10, 4))
    sns.boxplot(data=df, x=location_col, y="fare", order=order)
    plt.title(f"Fare Distribution by {location_label}")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Keep only rides with a positive distance so the ratio below is always finite
# (a zero distance would give an infinite fare-per-distance and distort the means).
df_valid = df[df["distance"] > 0].copy()

# Fare per unit distance for every remaining ride.
df_valid["fare_per_distance"] = df_valid["fare"] / df_valid["distance"]

# Pivot table: source (rows) vs vehicle (columns), mean fare per unit distance.
fare_ratio_pivot = df_valid.pivot_table(
    index="source",
    columns="vehicle",
    values="fare_per_distance",
    aggfunc="mean",
)

# Sort rows by their total across vehicles, highest first.
row_sum = fare_ratio_pivot.sum(axis=1)
fare_ratio_pivot = fare_ratio_pivot.loc[row_sum.sort_values(ascending=False).index]

# Sort columns by their total across sources, highest first.
col_sum = fare_ratio_pivot.sum(axis=0)
fare_ratio_pivot = fare_ratio_pivot[col_sum.sort_values(ascending=False).index]

# Display the table as a heatmap-styled frame.
fare_ratio_pivot.style.background_gradient(cmap="YlOrRd")


We uncovered a consistent and meaningful pattern: the average price-to-distance ratio varies notably across different source locations. This disparity is not random—it holds true across all vehicle categories, indicating a systemic pricing behavior based on pickup location.

Source locations with higher price/distance ratios—such as Haymarket Square, Financial District, and South Station—are predominantly commercial zones. These areas are characterized by high traffic, dense transit access, office clusters, shopping centers, and tourist activity. The elevated fare in these zones likely reflects greater ride demand, frequent congestion, and limited transportation alternatives.

On the other hand, areas with lower price/distance ratios, including Fenway, Northeastern University, and Boston University, tend to be residential or academic neighborhoods. These locations typically have lower ride demand, less traffic, and a calmer urban environment, which keeps fare rates more stable and affordable.

This analysis highlights that location, traffic intensity, and local demand significantly influence fare behavior, beyond just the vehicle type or distance traveled.

In [None]:
# Keep only rides with a positive distance so the ratio below is always finite
# (a zero distance would give an infinite fare-per-distance and distort the means).
df_valid = df[df["distance"] > 0].copy()

# Fare per unit distance for every remaining ride.
df_valid["fare_per_distance"] = df_valid["fare"] / df_valid["distance"]

# Pivot table: destination (rows) vs vehicle (columns), mean fare per unit distance.
fare_ratio_pivot = df_valid.pivot_table(
    index="destination",
    columns="vehicle",
    values="fare_per_distance",
    aggfunc="mean",
)

# Sort rows by their total across vehicles, highest first.
row_sum = fare_ratio_pivot.sum(axis=1)
fare_ratio_pivot = fare_ratio_pivot.loc[row_sum.sort_values(ascending=False).index]

# Sort columns by their total across destinations, highest first.
col_sum = fare_ratio_pivot.sum(axis=0)
fare_ratio_pivot = fare_ratio_pivot[col_sum.sort_values(ascending=False).index]

# Display the table as a heatmap-styled frame.
fare_ratio_pivot.style.background_gradient(cmap="YlOrRd")

We observed a similar fare pattern when grouped by destination: locations with the highest average price-per-distance ratio as destinations were also those that ranked highest as sources. This consistent structure suggests that both pickup and drop-off locations influence fare significantly.

Busy commercial areas—such as business districts, transit hubs, and tourist spots—tend to have higher fares, regardless of whether they are the origin or destination of a ride. 

In [None]:
# Number of rides for every source/destination pair (rows = source, columns = destination).
ride_counts = pd.crosstab(index=df["source"], columns=df["destination"])

# Shade cells by count so dense and empty routes stand out.
ride_counts.style.background_gradient(cmap="Blues")


Another interesting insight from the dataset is the ride distribution across location pairs. The total number of rides between various source and destination combinations is remarkably balanced—most pairs have very similar ride counts, often in the range of 8,000 to 9,500.

However, rides are concentrated only among 72 specific source-destination pairs, while several combinations show zero rides, indicating that not all location pairs are equally connected in this dataset. This suggests a structured and limited travel pattern, possibly reflecting common commuting routes, urban layout, or user demand behavior.

## **Fare vs Time**

In [None]:
# Fare per unit distance for every ride. The column is stored on df because
# later cells (day-of-week and weather regressions) read it.
df["fare_per_km"] = df["fare"] / df["distance"]

# Mean fare per km for each hour of the day, without an error band.
# `errorbar=None` replaces `ci=None`, which seaborn deprecated in 0.12.
plt.figure(figsize=(10, 5))
sns.lineplot(data=df, x="hour", y="fare_per_km", estimator='mean', errorbar=None)
plt.title("Average Fare per km by Hour of Day")
plt.xlabel("Hour of Day")
plt.ylabel("Avg Fare per km")
plt.grid(True)
plt.tight_layout()
plt.show()

The average fare per kilometer remains consistent throughout the day, ranging narrowly between `$9.55` and `$9.90`. There is no significant deviation or discernible pattern across different hours, indicating that time of day does not substantially impact the fare rate per kilometer in this dataset.

In [None]:
# Fare per unit distance for every ride (recomputed so this cell runs on its own).
df["fare_per_km"] = df["fare"] / df["distance"]

# Mean fare per km for each day of the week, without an error band.
# The title and x-label now name the day-of-week axis that is actually plotted.
# `errorbar=None` replaces `ci=None`, which seaborn deprecated in 0.12.
plt.figure(figsize=(10, 5))
sns.lineplot(data=df, x="day_of_week", y="fare_per_km", estimator='mean', errorbar=None)
plt.title("Average Fare per km by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Avg Fare per km")
plt.grid(True)
plt.tight_layout()
plt.show()

The variation in fare across different days of the week is minimal, with average fares per kilometer ranging only from 9.65 to 9.77. Combined with the earlier observation that fare remains consistent across different hours of the day, we can conclude that neither the time of day nor the day of the week has any significant impact on fare in this dataset.

## **Fare vs Weather Description**

In [None]:
# One fare boxplot per categorical weather column.
weather_plots = (
    ("weather_desc_short", "Fare vs Weather Description Short"),
    ("weather_desc_long", "Fare vs Weather Description Long"),
    ("weather_status", "Fare vs Weather Status"),
)

for weather_col, plot_title in weather_plots:
    plt.figure(figsize=(10, 5))
    sns.boxplot(data=df, x=weather_col, y="fare")
    plt.title(plot_title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

In [None]:
# Descriptive fare statistics per short weather description: one column per
# category, columns ordered by mean fare (highest first).
fare_by_weather = df.groupby("weather_desc_short")["fare"]
weather_by_mean_fare = fare_by_weather.mean().sort_values(ascending=False).index
fare_stats = fare_by_weather.describe().T
fare_stats = fare_stats.loc[:, weather_by_mean_fare]

fare_stats

In [None]:
# Distribution of the surge multiplier for each short weather description.
# The title names the plotted quantity (surge multiplier, not fare).
plt.figure(figsize=(10, 5))
sns.boxplot(data=df, x="weather_desc_short", y="surge_multiplier")
plt.title("Surge Multiplier vs Weather Description")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Percentage of rides with a surge multiplier above 1, per short weather description.
is_surged = df["surge_multiplier"] > 1
surged_by_weather = df[is_surged].groupby("weather_desc_short")["surge_multiplier"].count()
rides_by_weather = df.groupby("weather_desc_short")["surge_multiplier"].count()
surged_by_weather / rides_by_weather * 100

Our analysis indicates that weather conditions have no significant impact on either fare amounts or surge pricing. The proportion of surge rides remained fairly consistent—ranging from 2.9% to 3.8%—across all weather types. Additionally, summary statistics and boxplot distributions of fare grouped by weather_desc_short, weather_desc_long, and weather_status showed no meaningful variation.

Although some influence from weather was expected, surprisingly, the data reveals no observable relationship between weather and fare behavior. Since weather_desc_short, weather_desc_long, and weather_status convey overlapping information, and none contribute to fare prediction, we can confidently drop the three columns.

In [None]:
# Drop the three overlapping weather text columns.
# `errors="ignore"` keeps the cell re-runnable: df is rebound here, so without it
# a second run would raise KeyError because the columns are already gone.
df = df.drop(
    columns=["weather_desc_long", "weather_desc_short", "weather_status"],
    errors="ignore",
)

## **Fare vs Weather Features**

In [None]:
# Continuous weather features to compare against fare per km.
features = ["temperature_C", "wind_speed", "humidity", "visibility", "uv_index", "cloud_cover"]

# Scatter plot with a fitted regression line for each feature.
# The y variable is df["fare_per_km"] (created in the Fare vs Time cells), so
# the title and y-label say "Fare per km" rather than "Fare".
for feature in features:
    feature_label = feature.capitalize()
    plt.figure(figsize=(6, 4))
    sns.regplot(data=df, x=feature, y="fare_per_km", scatter_kws={"alpha": 0.3}, line_kws={"color": "red"})
    plt.title(f"Fare per km vs {feature_label}")
    plt.xlabel(feature_label)
    plt.ylabel("Fare per km")
    plt.grid(True)
    plt.tight_layout()
    plt.show()

We analyzed the relationship between fare and several continuous weather-related features: temperature, wind speed, humidity, visibility, UV index, and cloud cover. Scatter plots with regression lines were generated for each feature.

Across all weather variables, the regression lines were nearly flat and parallel to the x-axis, indicating no linear relationship between fare and these weather conditions. The fare distribution remained constant regardless of changes in weather parameters. This reinforces earlier findings that weather does not influence fare pricing in this dataset.

In [None]:
# Columns remaining in df after the weather text columns were dropped.
df.columns

## **Explore Surge Multiplier**

In [None]:
# Summary statistics of the surge multiplier column.
df["surge_multiplier"].describe()

In [None]:
# Distinct surge multiplier values present in the data.
df["surge_multiplier"].unique()

In [None]:
# Number of rides at each surge multiplier value, most common first.
multiplier_counts = df["surge_multiplier"].value_counts()

# Bar chart of those counts.
ax = multiplier_counts.plot(kind='bar', color='skyblue', edgecolor='black')
ax.set_title('Surge Multiplier Distribution')
ax.set_xlabel('Surge Multiplier')
ax.set_ylabel('Count')

# Show the raw counts as the cell output.
multiplier_counts

The surge multipliers are multiples of 0.25.

In [None]:
# Rides that were actually surged (multiplier above 1); later cells reuse both names.
surge_rides = df[df["surge_multiplier"] > 1]
surge_vehicles = surge_rides["vehicle"].unique()

# Which companies and vehicle types appear among surged rides.
# (Printed label corrected from the misspelled "Comapny".)
print("Ride Company: ", surge_rides["ride_company"].unique())
print("Ride Vehicles: ", surge_vehicles)

surge_rides.head()

So, Uber doesn't have surge_multiplier. It is only for lyft. And, the surge is applied to only high end vehicles of Lyft.

In [None]:
# Share of all rides that were surged, in percent.
surge_share_pct = len(surge_rides) / len(df) * 100
print(f"{surge_share_pct}%")

Majority value is 1. Only 3.29% of the total data have surge_multiplier>1. And surge_multiplier is multiple of 0.25. 

In [None]:
# Share of surged rides among rides on vehicle types that ever surge, in percent.
# The subset is computed outside the f-string: reusing double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
eligible_rides = df[df["vehicle"].isin(surge_vehicles)]
print(f"{len(surge_rides) / len(eligible_rides) * 100}%")

Of the rides from vehicles eligible for surge pricing, 8.19% of the rides experienced surge in fares.

In [None]:
# Number of surged rides (surge_multiplier > 1) per vehicle type.
surge_rides["vehicle"].value_counts()

Surged rides are evenly distributed among these vehicle types.

In [None]:
# Total rides per vehicle type, restricted to the vehicle types that appear among surged rides.
is_surge_vehicle = df["vehicle"].isin(surge_vehicles)
df.loc[is_surge_vehicle, "vehicle"].value_counts()

In [None]:
# Percentage of rides that were surged, per surge-eligible vehicle type.
surged_per_vehicle = surge_rides["vehicle"].value_counts()
rides_per_vehicle = df.loc[df["vehicle"].isin(surge_vehicles), "vehicle"].value_counts()
surged_per_vehicle / rides_per_vehicle * 100

Surge fares are evenly distributed across the selected vehicle types, with approximately 8.19% of rides in each experiencing surge pricing.

In [None]:
# Correlation of every numeric column with the surge multiplier, among surged rides only.
surge_corr_matrix = surge_rides.corr(numeric_only=True)
surge_corr_matrix["surge_multiplier"]

`surge_multiplier` has essentially zero correlation with the weather data, so surge does not appear to be applied based on weather conditions.

In [None]:
# Grouped and sorted values for source location
grouped = df.groupby("source")["surge_multiplier"].apply(lambda x: (x > 1).mean()).sort_values(ascending=False) * 100
print(grouped)

plt.figure(figsize=(8, 4))
grouped.plot(kind='bar', color='orange', edgecolor='black')

plt.title('Average Surge Rate by Source')
plt.xlabel('Source')
plt.ylabel('Average Surge in %')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

While busy commercial areas like Back Bay and Theatre District do show higher surge percentages, others like Financial District and South Station have relatively low surge frequencies. Conversely, academic and residential zones like Northeastern University and Boston University exhibit higher surge rates. This suggests that surge pricing may be more influenced by vehicle availability than just demand—locations with fewer available drivers, despite being less busy overall, may trigger surge more frequently. In contrast, commercial zones have higher regular fare but less frequently have surge in fare.

In [None]:
# Grouped and sorted values for source location
grouped = df.groupby("destination")["surge_multiplier"].apply(lambda x: (x > 1).mean()).sort_values(ascending=False) * 100
print(grouped)
plt.figure(figsize=(8, 4))
grouped.plot(kind='bar', color='orange', edgecolor='black')

plt.title('Average Surge Rate by Destination')
plt.xlabel('Destination')
plt.ylabel('Average Surge in %')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

Surge rates by destination vary only slightly, ranging from 3.08% to 3.55%, showing no strong link to the type of area—whether commercial or residential. In contrast, source locations show more variation, suggesting that surge pricing is primarily driven by supply-demand imbalances at the pickup point rather than traffic or business activity at the destination. This may indicate limited vehicle availability in residential or university areas, leading to more frequent surge pricing there.

### **Surge vs Time**

In [None]:
# Distribution of the surge multiplier, among surged rides only, for each hour of the day.
# The title names the plotted quantity (surge multiplier, not fare).
plt.figure(figsize=(12, 6))
sns.boxplot(data=surge_rides, x="hour", y="surge_multiplier")
plt.title("Distribution of Surge Multipliers across Hours of the Day")
plt.show()

In [None]:
# Percentage of rides with a surge multiplier above 1 for each day of the week,
# highest rate first. The grouping key is "day_of_week" so that it matches the
# chart's title and axis label (grouping by "destination" here would just
# repeat the destination chart).
grouped = df.groupby("day_of_week")["surge_multiplier"].apply(lambda x: (x > 1).mean()).sort_values(ascending=False) * 100

plt.figure(figsize=(8, 4))
grouped.plot(kind='bar', color='orange', edgecolor='black')

plt.title('Average Surge Rate by Day of Week')
plt.xlabel('Day of Week')
plt.ylabel('Average Surge in %')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()

From the charts above, surge does not appear to depend on the hour of the day or the day of the week.

### **Surge vs Fare & Distance**

In [None]:
# Fare distribution for surged rides (True) versus non-surged rides (False).
has_surge = df["surge_multiplier"] > 1
sns.boxplot(data=df, x=has_surge, y="fare")
plt.title("Fare comparison: With Surge vs Without Surge")

For surged rides, the regular fare is multiplied by the surge multiplier, so higher fares overall are expected for surged rides.

In [None]:
# Distance distribution for surged rides (True) versus non-surged rides (False).
has_surge = df["surge_multiplier"] > 1
sns.boxplot(data=df, x=has_surge, y="distance")
plt.title("Distance comparison: With Surge vs Without Surge")

Surge is seen slightly more often on longer-distance rides, but no useful information can be derived from this.

## **Multivariate Analysis**

In [None]:
# Pairwise correlations between numeric columns, computed once and reused.
numeric_corr = df.corr(numeric_only=True)

# Correlation of each numeric column with the target, then the full matrix as a heatmap.
print(numeric_corr["fare"])
sns.heatmap(numeric_corr)

Our target variable `fare` shows no correlation with any numerical feature except distance.

# 📊 Exploratory Data Analysis Summary

## 🚖 1. Fare Patterns by Location
- **Source Matters:** Fare per kilometer is not constant across pickup locations. High-traffic commercial areas like Haymarket Square, Financial District, South Station, and Theatre District consistently show higher fare per distance.
- **Residential & Academic Areas:** Locations such as Fenway, Northeastern University, and Boston University have lower fare per km, possibly due to lower demand or reduced traffic.
- **Destination Impact:** Similar trends are observed in destination fares. Both pickup and drop-off locations influence overall fare, though source has a stronger effect.

## 📌 2. Route-Level Ride Patterns
- Only 72 unique source-destination pairs exist, meaning rides are limited to specific combinations.
- Ride counts between these combinations are very balanced, suggesting a controlled or engineered distribution
## ⏰ 3. Time of Day and Day of Week
- **Hourly Analysis:** Average fare per km remains consistent throughout the day, ranging from **9.55 to 9.9 USD/km**.
- **Weekly Trend:** Fare per km is also stable across days, ranging from **9.65 to 9.77 USD/km**.
- ➤ **Conclusion:** Neither time of day nor day of week significantly affects fare.

## 🌦️ 4. Weather Variables
- Features like **temperature, humidity, wind speed, visibility, UV index, and cloud cover** show **no linear correlation** with fare.
- Regression lines are nearly flat across all plots.
- ➤ Suggests weather conditions do not influence fare in this dataset.

## 💸 5. Surge Pricing Analysis
- Only **3.29%** of all rides had a surge multiplier > 1.
- Among surge-eligible vehicles (e.g., Lux, Lux Black, Lyft XL), about **8.19%** of rides experienced surge pricing.
- Surge application is evenly distributed across all high-end vehicle types.

## 🧭 6. Surge by Source & Destination
- **Surge by Source:** Pickup location strongly affects surge. For example:
  - **Back Bay:** 5.4%
  - **Northeastern University:** 5.0%
  - **North End:** 0.7%
- High-traffic commercial areas tend to have lower surge rates, likely due to better vehicle supply.
- Residential or university areas see more frequent surge, suggesting vehicle shortages in those zones.
- **Surge by Destination:** Rates are uniform (3.08%–3.55%) across drop-off locations with no clear pattern.
- ➤ Indicates surge pricing is influenced more by pickup location (supply-demand dynamics) than destination.

## 🗺️ 7. Geographic Patterns
- Ride locations are concentrated within a specific urban zone, with well-defined clusters.
- Indicates a compact city region with clear mobility hotspots and corridors.

---

## ✅ Key Takeaways
- Fare and surge pricing are location-dependent, especially influenced by the pickup point.
- Surge pricing is supply-driven, more common in areas with potentially fewer available vehicles.
- Time and weather do not play significant roles in determining fare or surge in this dataset.
- The ride network is well-defined and clustered, reflecting structured movement patterns within the city

<!-- 📊 Exploratory Data Analysis (EDA) Steps
Initial Data Inspection
Load the cleaned dataset.

df.head(), df.tail(), df.sample() – View samples of the data.

df.info() – Check column data types and non-null values.

df.describe() – Get numerical summary statistics.

df.isnull().sum() – Identify missing values.

**Understand the Target Variable (fare)**
Plot distribution: histogram, boxplot.

Check skewness & outliers.

Calculate summary stats: mean, median, IQR, mode. -->

<!-- **Univariate Analysis** -->
<!-- Analyze distribution of key variables: distance, hour, vehicle, surge_multiplier, weather, source, destination.

Use bar plots, histograms, or pie charts depending on the variable type (categorical vs numerical). -->

<!-- **Bivariate Analysis** -->
<!-- Fare vs Distance: -->

<!-- Scatter plot, correlation coefficient. -->

<!-- Group by vehicle type and re-check correlation. -->

<!-- Fare vs Time (hour, day, etc.): -->

<!-- Boxplots or line plots to observe fare changes throughout the day/week. -->

<!-- Fare vs Weather: -->

<!-- Boxplots or violin plots for weather categories. -->

<!-- Fare vs Vehicle Type: -->

<!-- Boxplots by vehicle type. -->

<!-- Surge Multiplier vs Time/Weather: -->

<!-- Check if surge pricing correlates with time/weather patterns. -->

**Multivariate Analysis**
<!-- Use sns.pairplot() or sns.heatmap() to explore relationships among multiple numeric features. -->

<!-- Group by combinations of features (e.g., source, destination, vehicle) to check fare patterns. -->

<!-- **Outlier Analysis**
Use boxplots and thresholds (e.g., IQR, domain knowledge) to detect outliers.

Investigate and handle fare outliers (already partially done).

Identify anomalies for distance, duration, or unexpected combinations (e.g., low distance but very high fare). -->

<!-- **Time-Based Patterns**
Plot fare patterns over hour, day_of_week, or month.

Examine how demand and pricing vary with time. -->

<!-- **Geospatial Trends (if location coordinates available)**
Plot source and destination densities on maps.

Cluster common routes. -->
<!-- 
**Missing/Invalid Data Handling**
Review how you handled missing or noisy entries.

Check if any columns still need imputation or transformation.
 -->
**Feature Engineering Ideas**
Create new variables (e.g., fare_per_mile, is_weekend, ride_duration_category, etc.)

Bin continuous variables (distance, hour) to analyze fare buckets.

**Summary & Insights**
Document key insights:

Which variables influence fare the most?

When do fares spike?

Which vehicle types are most expensive?

Are shared rides cheaper and shorter?