# %% [markdown]

# # Zomato Delivery Operations Analytics: Delivery Performance Optimization

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the raw delivery records into a DataFrame
df = pd.read_csv("Zomato Dataset.csv")


# ## 1. Data Cleaning


# The source file misspells this header; correct it before any later lookup
df.rename(columns={"Time_Orderd": "Time_Ordered"}, inplace=True)

# Parse the order date as day-month-year; values that do not match become NaT
df['Order_Date'] = pd.to_datetime(df['Order_Date'], format='%d-%m-%Y', errors='coerce')

# Force the numeric columns to numbers; unparseable entries become NaN
for numeric_col in ('Time_taken (min)', 'Delivery_person_Age', 'Delivery_person_Ratings'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

# Discard every row that is missing a field the analysis relies on
required_cols = [
    'Order_Date',
    'Time_Ordered',
    'Time_taken (min)',
    'Delivery_person_Age',
    'Delivery_person_Ratings',
]
df.dropna(subset=required_cols, inplace=True)


# ## 2. Feature Engineering


# Delivery distance: great-circle (haversine) distance in kilometres between
# the restaurant and the delivery location. A flat "degrees * 111" Euclidean
# estimate treats a degree of longitude as being as long as a degree of
# latitude, which overstates east-west distance away from the equator.
# Coordinates are taken to be decimal degrees.
EARTH_RADIUS_KM = 6371.0088  # mean Earth radius
restaurant_lat = np.radians(df['Restaurant_latitude'])
delivery_lat = np.radians(df['Delivery_location_latitude'])
half_dlat = (delivery_lat - restaurant_lat) / 2
half_dlon = np.radians(df['Delivery_location_longitude'] - df['Restaurant_longitude']) / 2
haversine_term = (
    np.sin(half_dlat)**2 +
    np.cos(restaurant_lat) * np.cos(delivery_lat) * np.sin(half_dlon)**2
)
# Clip so floating-point rounding cannot push the value outside arcsin's domain
df['delivery_distance_km'] = 2 * EARTH_RADIUS_KM * np.arcsin(np.sqrt(haversine_term.clip(0, 1)))

# %% [markdown]
# ## 3. Exploratory Data Analysis (EDA)

# %%
# Histogram of delivery durations across all orders
fig, ax = plt.subplots(figsize=(8, 5))
ax.hist(df['Time_taken (min)'], bins=20, color='skyblue', edgecolor='black')
ax.set_title("Delivery Time Distribution")
ax.set_xlabel("Time Taken (min)")
ax.set_ylabel("Number of Deliveries")
ax.grid(True)
fig.tight_layout()
plt.show()


# Boxplot: Delivery time by traffic density
# DataFrame.boxplot(by=...) opens a figure of its own unless it is handed an
# Axes, which would leave a separately created figure blank and ignore its
# size. Create the Axes up front and pass it in so the plot lands on it.
fig, ax = plt.subplots(figsize=(9, 5))
df.boxplot(column='Time_taken (min)', by='Road_traffic_density', grid=False, ax=ax)
ax.set_title("Delivery Time by Traffic Density")
fig.suptitle("")  # clear the super-title pandas adds to grouped boxplots
ax.set_xlabel("Traffic Density")
ax.set_ylabel("Time Taken (min)")
fig.tight_layout()
plt.show()


# Boxplot: Delivery time by weather
# DataFrame.boxplot(by=...) opens a figure of its own unless it is handed an
# Axes, which would leave a separately created figure blank and ignore its
# size. Create the Axes up front and pass it in so the plot lands on it.
fig, ax = plt.subplots(figsize=(9, 5))
df.boxplot(column='Time_taken (min)', by='Weather_conditions', grid=False, ax=ax)
ax.set_title("Delivery Time by Weather Conditions")
fig.suptitle("")  # clear the super-title pandas adds to grouped boxplots
ax.set_xlabel("Weather Conditions")
ax.set_ylabel("Time Taken (min)")
# Tilt the category names so longer labels do not overlap
ax.tick_params(axis='x', labelrotation=30)
fig.tight_layout()
plt.show()


# Scatter: each delivery plotted as (delivery person rating, time taken)
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(
    df['Delivery_person_Ratings'],
    df['Time_taken (min)'],
    alpha=0.6,
    color='coral',
)
ax.set_title("Delivery Time vs. Delivery Person Rating")
ax.set_xlabel("Rating")
ax.set_ylabel("Time Taken (min)")
ax.grid(True)
fig.tight_layout()
plt.show()

# %%
# Correlation heatmap (basic): pairwise correlations between the numeric
# features and the delivery time
numerics = df[['Delivery_person_Age', 'Delivery_person_Ratings', 'Vehicle_condition', 'delivery_distance_km', 'Time_taken (min)']]
correlation_matrix = numerics.corr()

plt.figure(figsize=(7, 5))
# Pin the colour scale to the full correlation range [-1, 1]. Left to
# autoscale, the diverging colormap is stretched over whatever values happen
# to occur, so its neutral midpoint no longer means "no correlation".
plt.imshow(correlation_matrix, cmap='coolwarm', interpolation='nearest', vmin=-1, vmax=1)
plt.xticks(range(len(correlation_matrix)), correlation_matrix.columns, rotation=45)
plt.yticks(range(len(correlation_matrix)), correlation_matrix.columns)
plt.colorbar(label='Correlation Coefficient')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()


# ## 4. Key Insights

# %%
# Mean delivery time per traffic level, slowest level first
grouped = (
    df.groupby('Road_traffic_density')['Time_taken (min)']
    .mean()
    .sort_values(ascending=False)
)
print("Average delivery time by traffic density:", grouped, sep="\n")

# Mean delivery time per weather condition, slowest condition first
grouped2 = (
    df.groupby('Weather_conditions')['Time_taken (min)']
    .mean()
    .sort_values(ascending=False)
)
print("\nAverage delivery time by weather condition:", grouped2, sep="\n")


# Hour of day at which each order was placed. Times that do not match HH:MM
# are coerced to NaT, so their hour comes out as NaN.
order_clock_time = pd.to_datetime(df['Time_Ordered'], format='%H:%M', errors='coerce')
df['Time_Ordered_hour'] = order_clock_time.dt.hour


# Hours of the day counted as rush: the 12-14 lunch window and the 18-21
# evening window.
_RUSH_HOURS = (12, 13, 14, 18, 19, 20, 21)


def is_rush_hour(hour):
    """Return 1 if *hour* is one of the rush hours, otherwise 0.

    Rush hours are 12, 13, 14 and 18, 19, 20, 21. Any value that equals none
    of them (including NaN) yields 0.
    """
    return int(hour in _RUSH_HOURS)

# Flag every order as rush hour (1) or not (0)
df['Rush_hour'] = df['Time_Ordered_hour'].apply(is_rush_hour)


# Orders with no usable hour (NaN) are flagged 0 by is_rush_hour. Leave them
# out of the comparison so they are not counted as genuine non-rush orders.
# Reindexing to [0, 1] keeps the bar order aligned with the tick labels below
# even if one of the groups has no rows.
has_order_hour = df['Time_Ordered_hour'].notna()
rush_comparison = (
    df.loc[has_order_hour]
    .groupby('Rush_hour')['Time_taken (min)']
    .mean()
    .reindex([0, 1])
)
print("Avg delivery time (Rush=1, Non-Rush=0):")
print(rush_comparison)

# Bar plot
plt.figure(figsize=(6, 4))
rush_comparison.plot(kind='bar', color=['steelblue', 'salmon'])
plt.xticks([0,1], ['Non-Rush Hour', 'Rush Hour'], rotation=0)
plt.ylabel('Avg Time Taken (min)')
plt.title('Rush Hour Impact on Delivery Time')
plt.tight_layout()
plt.show()
# Average delivery time for each delivery person, ordered fastest to slowest
person_perf = (
    df.groupby('Delivery_person_ID')['Time_taken (min)']
    .mean()
    .sort_values()
)

# The five lowest averages sit at the front of the sorted series
print("Top 5 Fastest Delivery Persons:", person_perf.iloc[:5], sep="\n")

# The five highest averages sit at the end (potential outliers)
print("\nSlowest (Potential Issue):", person_perf.iloc[-5:], sep="\n")
# Mean delivery time for each value of the Festival column
festival_analysis = df.groupby('Festival')['Time_taken (min)'].mean()
print("Average Delivery Time on Festival vs Non-Festival Days:")
print(festival_analysis)

# Build the wedge labels from the groups that are actually present, so they
# cannot be attached to the wrong wedge and a different number of categories
# does not break the plot.
# NOTE(review): assumes the column uses 'No' / 'Yes'; any other value is
# shown as-is. Confirm against the dataset.
festival_display_names = {'No': 'No Festival', 'Yes': 'Festival'}
festival_labels = [
    festival_display_names.get(str(value).strip(), str(value))
    for value in festival_analysis.index
]

# Pie chart (wedges compare the group averages against each other)
plt.figure(figsize=(5,5))
festival_analysis.plot(kind='pie', autopct='%1.1f%%', colors=['lightgreen', 'orange'], labels=festival_labels)
plt.title("Delivery Time Distribution: Festival vs Non-Festival")
plt.ylabel("")
plt.tight_layout()
plt.show()
# Create a new column: Is_Delivery_Slow (True when a delivery took longer
# than the overall mean delivery time)
average_time = df['Time_taken (min)'].mean()
df['Is_Delivery_Slow'] = df['Time_taken (min)'] > average_time

# The mean of a boolean column is the fraction of True values
slow_ratio = df['Is_Delivery_Slow'].mean()
print(f"Percentage of Deliveries Slower Than Average: {slow_ratio * 100:.2f}%")

# value_counts() orders its result by frequency, so the first bar would be
# whichever class is more common. Fix the order to [False, True] so the bars
# always line up with the tick labels below.
slow_counts = df['Is_Delivery_Slow'].value_counts().reindex([False, True], fill_value=0)

# Bar plot
plt.figure(figsize=(6, 4))
slow_counts.plot(kind='bar', color=['lightblue', 'salmon'])
# rotation=0 keeps the labels horizontal, matching the rush-hour bar plot
plt.xticks([0,1], ['Fast or Avg', 'Slow'], rotation=0)
plt.ylabel('Number of Deliveries')
plt.title('Deliveries Above Average Time')
plt.tight_layout()
plt.show()
