# Zomato Exploratory Data Analysis (EDA)

This notebook performs a comprehensive exploratory data analysis of the Zomato restaurant dataset to uncover insights about restaurant trends, customer preferences, and factors influencing ratings.

## 1. Import Required Libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Display options: show every column and use a wide console so frames are not truncated
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# NOTE(review): this silences *all* warnings, including pandas/matplotlib deprecation
# notices; consider narrowing it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

# Plotting style: matplotlib >= 3.6 ships the seaborn look as 'seaborn-v0_8', while
# older releases only know it as 'seaborn', so use whichever name is installed.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

print("Libraries imported successfully!")

## 2. Load and Explore the Dataset

In [None]:
# Read the Zomato CSV from the working directory into `df`.
# A missing file is reported with a hint and the error is re-raised so that
# later cells do not run against an undefined frame.
try:
    df = pd.read_csv('zomato.csv')
except FileNotFoundError:
    print("Error: zomato.csv file not found. Please ensure the dataset is in the current directory.")
    raise
else:
    print(f"Dataset loaded successfully! Shape: {df.shape}")

In [None]:
# Summarise the raw frame in one call: dimensions, column names, and the dtype
# pandas inferred for each column (each item is printed on its own line).
print(
    "Dataset Info:",
    f"Shape: {df.shape}",
    f"\nColumns: {df.columns.tolist()}",
    "\nData types:",
    df.dtypes,
    sep="\n",
)

In [None]:
# Preview the leading rows; the bare last expression gives the rich table display
print("First 5 rows of the dataset:")
df.head(n=5)

In [None]:
# Tabulate, per column, how many entries are null and what percentage of all rows
# that represents; only columns with at least one gap are printed.
print("Missing values in each column:")
missing_values = df.isna().sum()
missing_percent = missing_values.div(len(df)).mul(100)
missing_df = pd.concat(
    [missing_values, missing_percent],
    axis=1,
    keys=['Missing Count', 'Missing Percentage'],
)
has_gaps = missing_df['Missing Count'].gt(0)
print(missing_df.loc[has_gaps])

## 3. Data Preprocessing

In [None]:
# Work on a copy so the raw frame `df` stays untouched for later reference
df_clean = df.copy()

# Fill gaps only when the frame actually contains missing values
if df_clean.isnull().sum().sum() > 0:
    print("Handling missing values...")
    # The filled column is assigned back rather than calling
    # df_clean[col].fillna(..., inplace=True): that chained in-place form acts on an
    # intermediate object, emits a FutureWarning on pandas 2.x and does not update
    # df_clean at all once Copy-on-Write is active.
    if 'Cuisines' in df_clean.columns:
        df_clean['Cuisines'] = df_clean['Cuisines'].fillna('Unknown')

    # Missing ratings take the median of the observed ratings
    if 'Aggregate rating' in df_clean.columns:
        rating_median = df_clean['Aggregate rating'].median()
        df_clean['Aggregate rating'] = df_clean['Aggregate rating'].fillna(rating_median)

# Drop fully identical rows and report how many were removed
initial_rows = len(df_clean)
df_clean = df_clean.drop_duplicates()
final_rows = len(df_clean)
print(f"Removed {initial_rows - final_rows} duplicate rows")

print(f"Clean dataset shape: {df_clean.shape}")

## 4. Univariate Analysis

In [None]:
# Restaurant counts per country: a bar chart of the ten largest countries next to
# a pie chart of the five largest, followed by the underlying counts.
fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(12, 6))
country_counts = df_clean['Country'].value_counts().head(10)

# Left panel: top 10 countries
country_counts.plot(kind='bar', ax=bar_ax)
bar_ax.set_title('Top 10 Countries by Number of Restaurants')
bar_ax.set_xlabel('Country')
bar_ax.set_ylabel('Number of Restaurants')
plt.setp(bar_ax.get_xticklabels(), rotation=45)

# Right panel: share of each of the top 5 countries (relative to those five only)
country_counts.head(5).plot(kind='pie', autopct='%1.1f%%', ax=pie_ax)
pie_ax.set_title('Top 5 Countries Distribution')
pie_ax.set_ylabel('')

fig.tight_layout()
plt.show()

print("\nTop 10 Countries by Restaurant Count:")
print(country_counts)

In [None]:
# Four univariate views on one 2x2 figure: busiest cities (bar + pie), the spread of
# aggregate ratings, and how many restaurants fall into each price range.
fig, ((city_bar_ax, city_pie_ax), (rating_ax, price_ax)) = plt.subplots(2, 2, figsize=(14, 8))
city_counts = df_clean['City'].value_counts().head(10)

# Top-left: ten cities with the most restaurants
city_counts.plot(kind='bar', color='skyblue', ax=city_bar_ax)
city_bar_ax.set_title('Top 10 Cities by Number of Restaurants')
city_bar_ax.set_xlabel('City')
city_bar_ax.set_ylabel('Number of Restaurants')
plt.setp(city_bar_ax.get_xticklabels(), rotation=45)

# Top-right: share of each of the five largest cities (relative to those five only)
city_counts.head(5).plot(kind='pie', autopct='%1.2f%%', ax=city_pie_ax)
city_pie_ax.set_title('Top 5 Cities Distribution')
city_pie_ax.set_ylabel('')

# Bottom-left: histogram of aggregate ratings
rating_ax.hist(df_clean['Aggregate rating'], bins=20, edgecolor='black', alpha=0.7)
rating_ax.set_title('Distribution of Aggregate Ratings')
rating_ax.set_xlabel('Rating')
rating_ax.set_ylabel('Frequency')

# Bottom-right: restaurants per price range, ordered by the price-range value
price_counts = df_clean['Price range'].value_counts().sort_index()
price_counts.plot(kind='bar', color='lightcoral', ax=price_ax)
price_ax.set_title('Distribution of Price Ranges')
price_ax.set_xlabel('Price Range')
price_ax.set_ylabel('Number of Restaurants')

fig.tight_layout()
plt.show()

print("\nTop 10 Cities by Restaurant Count:")
print(city_counts)

## 5. Bivariate Analysis

In [None]:
# Three side-by-side views of what moves the aggregate rating:
# price range, average cost (India only), and online-delivery availability.
fig, (box_ax, cost_ax, delivery_ax) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: rating spread within each price range
sns.boxplot(data=df_clean, x='Price range', y='Aggregate rating', ax=box_ax)
box_ax.set_title('Rating Distribution by Price Range')
box_ax.set_xlabel('Price Range')
box_ax.set_ylabel('Aggregate Rating')

# Panel 2: cost against rating, restricted to India so all costs share one currency
india_data = df_clean[df_clean['Country'] == 'India']
cost_ax.scatter(india_data['Average Cost for two'], india_data['Aggregate rating'], alpha=0.6)
cost_ax.set_title('Cost vs Rating (India Only)')
cost_ax.set_xlabel('Average Cost for Two (INR)')
cost_ax.set_ylabel('Aggregate Rating')

# Panel 3: mean rating for each value of the online-delivery flag
delivery_rating = df_clean.groupby('Has Online delivery')['Aggregate rating'].mean()
delivery_rating.plot(kind='bar', color=['lightblue', 'orange'], ax=delivery_ax)
delivery_ax.set_title('Average Rating by Online Delivery Availability')
delivery_ax.set_xlabel('Has Online Delivery')
delivery_ax.set_ylabel('Average Rating')
plt.setp(delivery_ax.get_xticklabels(), rotation=0)

fig.tight_layout()
plt.show()

print("Average rating by online delivery:")
print(delivery_rating)

In [None]:
# Analyze table booking and online delivery patterns on one 2x2 figure.
# The axes are created up front and handed to every plotting call: DataFrame.plot()
# opens a brand-new figure when no `ax` is supplied, which would pull the two
# grouped/stacked bar charts out of this grid and leave their panels empty.
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
heatmap_ax, country_ax = axes[0]
price_ax, box_ax = axes[1]

# Top-left: how often the two service flags occur together
cross_tab = pd.crosstab(df_clean['Has Online delivery'], df_clean['Has Table booking'])
sns.heatmap(cross_tab, annot=True, fmt='d', cmap='Blues', ax=heatmap_ax)
heatmap_ax.set_title('Online Delivery vs Table Booking')

# Top-right: online delivery split within the five countries with most restaurants.
# fill_value=0 keeps combinations that never occur as zero-height bars instead of NaN.
top_countries = df_clean['Country'].value_counts().head(5).index
delivery_by_country = (
    df_clean[df_clean['Country'].isin(top_countries)]
    .groupby(['Country', 'Has Online delivery'])
    .size()
    .unstack(fill_value=0)
)
delivery_by_country.plot(kind='bar', stacked=True, ax=country_ax)
country_ax.set_title('Online Delivery by Top 5 Countries')
country_ax.set_xlabel('Country')
country_ax.set_ylabel('Number of Restaurants')
country_ax.tick_params(axis='x', labelrotation=45)
country_ax.legend(title='Has Online Delivery')

# Bottom-left: table booking availability within each price range
booking_by_price = (
    df_clean.groupby(['Price range', 'Has Table booking'])
    .size()
    .unstack(fill_value=0)
)
booking_by_price.plot(kind='bar', ax=price_ax)
price_ax.set_title('Table Booking by Price Range')
price_ax.set_xlabel('Price Range')
price_ax.set_ylabel('Number of Restaurants')
price_ax.legend(title='Has Table Booking')

# Bottom-right: rating spread with and without table booking
sns.boxplot(data=df_clean, x='Has Table booking', y='Aggregate rating', ax=box_ax)
box_ax.set_title('Rating Distribution by Table Booking')

fig.tight_layout()
plt.show()

## 6. Cuisine Analysis

In [None]:
# Analyze top cuisines.
# A restaurant can list several comma-separated cuisines, so each row is split into
# (country, cuisine) pairs in a single pass over the two columns; cells that are not
# strings (e.g. NaN) are skipped.
cuisine_pairs = [
    (country, cuisine.strip())
    for country, cuisines in zip(df_clean['Country'], df_clean['Cuisines'])
    if isinstance(cuisines, str)
    for cuisine in cuisines.split(',')
]

# Flat list of every cuisine mention and its frequency table (both reused by the summary cell)
all_cuisines = [cuisine for _country, cuisine in cuisine_pairs]
cuisine_counts = pd.Series(all_cuisines).value_counts()

fig = plt.figure(figsize=(15, 10))

# Top-left: 15 most frequently listed cuisines
top_ax = fig.add_subplot(2, 2, 1)
top_15_cuisines = cuisine_counts.head(15)
top_15_cuisines.plot(kind='barh', ax=top_ax)
top_ax.set_title('Top 15 Most Popular Cuisines')
top_ax.set_xlabel('Number of Restaurants')

# Top-right: share of each of the 10 most frequent cuisines (relative to those ten only)
pie_ax = fig.add_subplot(2, 2, 2)
top_10_cuisines = cuisine_counts.head(10)
pie_ax.pie(top_10_cuisines.values, labels=top_10_cuisines.index, autopct='%1.1f%%')
pie_ax.set_title('Top 10 Cuisines Distribution')

# Bottom (full width): India's ten most frequent cuisines, each compared with the
# count of the *same* cuisine in all other countries.
compare_ax = fig.add_subplot(2, 1, 2)
india_cuisines = [cuisine for country, cuisine in cuisine_pairs if country == 'India']
other_cuisines = [cuisine for country, cuisine in cuisine_pairs if country != 'India']

india_cuisine_counts = pd.Series(india_cuisines).value_counts().head(10)
# Align on India's cuisine labels so paired bars describe one cuisine; cuisines that
# never appear outside India get 0. (Taking the other group's own top 10 would pair
# unrelated cuisines under India's labels and can yield fewer than 10 values.)
other_cuisine_counts = (
    pd.Series(other_cuisines)
    .value_counts()
    .reindex(india_cuisine_counts.index, fill_value=0)
)

# Grouped bars: the two series sit half a bar-width either side of each tick
x = np.arange(len(india_cuisine_counts))
width = 0.35

compare_ax.bar(x - width / 2, india_cuisine_counts.values, width, label='India', alpha=0.8)
compare_ax.bar(x + width / 2, other_cuisine_counts.values, width, label='Other Countries', alpha=0.8)

compare_ax.set_xlabel('Cuisines')
compare_ax.set_ylabel('Number of Restaurants')
compare_ax.set_title('Top Cuisines: India vs Other Countries')
compare_ax.set_xticks(x)
compare_ax.set_xticklabels(india_cuisine_counts.index, rotation=45)
compare_ax.legend()

fig.tight_layout()
plt.show()

print("\nTop 10 Cuisines Overall:")
print(top_10_cuisines)

## 7. Rating Analysis

In [None]:
# Six rating-focused panels on one 2x3 figure, followed by summary statistics.
fig, ((text_ax, country_ax, votes_ax), (hist_ax, city_ax, cost_ax)) = plt.subplots(
    2, 3, figsize=(15, 10)
)

# (1) How many restaurants carry each rating label
rating_text_counts = df_clean['Rating text'].value_counts()
rating_text_counts.plot(kind='bar', color='lightgreen', ax=text_ax)
text_ax.set_title('Distribution by Rating Text')
text_ax.set_xlabel('Rating Text')
text_ax.set_ylabel('Count')
plt.setp(text_ax.get_xticklabels(), rotation=45)

# (2) Mean rating in the five countries with the most restaurants, best first
top_countries = df_clean['Country'].value_counts().head(5).index
in_top_countries = df_clean['Country'].isin(top_countries)
country_ratings = (
    df_clean[in_top_countries]
    .groupby('Country')['Aggregate rating']
    .mean()
    .sort_values(ascending=False)
)
country_ratings.plot(kind='bar', color='coral', ax=country_ax)
country_ax.set_title('Average Rating by Country (Top 5)')
country_ax.set_xlabel('Country')
country_ax.set_ylabel('Average Rating')
plt.setp(country_ax.get_xticklabels(), rotation=45)

# (3) Vote count against rating
votes_ax.scatter(df_clean['Votes'], df_clean['Aggregate rating'], alpha=0.6)
votes_ax.set_title('Votes vs Rating')
votes_ax.set_xlabel('Number of Votes')
votes_ax.set_ylabel('Aggregate Rating')

# (4) Finer-grained histogram of ratings
hist_ax.hist(df_clean['Aggregate rating'], bins=25, edgecolor='black', alpha=0.7, color='skyblue')
hist_ax.set_title('Detailed Rating Distribution')
hist_ax.set_xlabel('Rating')
hist_ax.set_ylabel('Frequency')

# (5) Mean rating in the ten cities with the most restaurants, best first
top_cities = df_clean['City'].value_counts().head(10).index
in_top_cities = df_clean['City'].isin(top_cities)
city_ratings = (
    df_clean[in_top_cities]
    .groupby('City')['Aggregate rating']
    .mean()
    .sort_values(ascending=False)
)
city_ratings.plot(kind='bar', color='gold', ax=city_ax)
city_ax.set_title('Average Rating by City (Top 10)')
city_ax.set_xlabel('City')
city_ax.set_ylabel('Average Rating')
plt.setp(city_ax.get_xticklabels(), rotation=45)

# (6) Cost against rating with their correlation, India only; the panel stays
#     blank when the data holds no Indian restaurants
india_data = df_clean[df_clean['Country'] == 'India']
if len(india_data):
    correlation = india_data['Average Cost for two'].corr(india_data['Aggregate rating'])
    cost_ax.scatter(india_data['Average Cost for two'], india_data['Aggregate rating'], alpha=0.6)
    cost_ax.set_title(f'Cost vs Rating (India)\nCorrelation: {correlation:.3f}')
    cost_ax.set_xlabel('Average Cost for Two (INR)')
    cost_ax.set_ylabel('Aggregate Rating')

fig.tight_layout()
plt.show()

print("\nRating Statistics:")
print(df_clean['Aggregate rating'].describe())
print("\nRating Text Distribution:")
print(rating_text_counts)

## 8. Key Insights and Summary

In [None]:
# Print a text summary of the headline numbers from the cleaned data.
# Relies on `all_cuisines` and `cuisine_counts` built in the cuisine-analysis cell.
separator = "=" * 80
print(separator)
print("ZOMATO DATASET - KEY INSIGHTS SUMMARY")
print(separator)

# Frequency tables are computed once and reused by several lines below
country_totals = df_clean['Country'].value_counts()
city_totals = df_clean['City'].value_counts()

# Dataset overview
print("\n📊 DATASET OVERVIEW:")
print(f"   • Total Restaurants: {len(df_clean):,}")
print(f"   • Countries Covered: {df_clean['Country'].nunique()}")
print(f"   • Cities Covered: {df_clean['City'].nunique()}")
print(f"   • Unique Cuisines: {len(set(all_cuisines))}")

# Top performers
print("\n🏆 TOP PERFORMERS:")
print(f"   • Country with Most Restaurants: {country_totals.index[0]} ({country_totals.iloc[0]:,} restaurants)")
print(f"   • City with Most Restaurants: {city_totals.index[0]} ({city_totals.iloc[0]:,} restaurants)")
print(f"   • Most Popular Cuisine: {cuisine_counts.index[0]} ({cuisine_counts.iloc[0]:,} restaurants)")

# Rating insights
highly_rated_count = int((df_clean['Aggregate rating'] >= 4.5).sum())
print("\n⭐ RATING INSIGHTS:")
print(f"   • Average Rating: {df_clean['Aggregate rating'].mean():.2f}")
print(f"   • Highest Rated Restaurants: {highly_rated_count} restaurants (≥4.5 rating)")
print(f"   • Most Common Rating Text: {df_clean['Rating text'].value_counts().index[0]}")

# Service insights: share of rows flagged 'Yes'. Comparing against 'Yes' and taking
# the mean yields 0.0 when no row has the flag, whereas value_counts()['Yes'] would
# raise KeyError in that case.
online_delivery_pct = df_clean['Has Online delivery'].eq('Yes').mean() * 100
table_booking_pct = df_clean['Has Table booking'].eq('Yes').mean() * 100

print("\n🚚 SERVICE INSIGHTS:")
print(f"   • Restaurants with Online Delivery: {online_delivery_pct:.1f}%")
print(f"   • Restaurants with Table Booking: {table_booking_pct:.1f}%")

# Price insights (the average cost is reported for India only, in rupees)
print("\n💰 PRICING INSIGHTS:")
most_common_price_range = df_clean['Price range'].value_counts().index[0]
print(f"   • Most Common Price Range: {most_common_price_range}")
if 'India' in df_clean['Country'].values:
    india_avg_cost = df_clean.loc[df_clean['Country'] == 'India', 'Average Cost for two'].mean()
    print(f"   • Average Cost in India: ₹{india_avg_cost:.0f} for two")

# Geographic insights; labels go through str() so non-string labels cannot break join()
print("\n🌍 GEOGRAPHIC INSIGHTS:")
print(f"   • Top 3 Countries: {', '.join(map(str, country_totals.head(3).index))}")
print(f"   • Top 3 Cities: {', '.join(map(str, city_totals.head(3).index))}")

print("\n" + separator)

## 9. Recommendations

Based on the analysis, here are key recommendations:

### For Restaurant Owners:
1. **Focus on Rating Improvement**: Restaurants with ratings above 4.0 tend to get more visibility and customers
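2. **Online Delivery**: Consider implementing online delivery; check the adoption share in Section 8 and the rating comparison in Section 5 first, because this single-snapshot dataset cannot by itself show that delivery is becoming more popular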
3. **Table Booking**: Higher-end restaurants (price range 3-4) benefit significantly from table booking facilities

### For Zomato Platform:
1. **Market Expansion**: Focus on underrepresented cities and countries for growth opportunities
2. **Cuisine Diversity**: Promote diverse cuisine options in areas dominated by popular cuisines
3. **Rating System**: Encourage more customer reviews to improve rating reliability

### For Customers:
1. **Price vs Quality**: Higher price ranges generally correlate with better ratings
2. **Service Features**: Look for restaurants with online delivery and table booking for convenience
3. **Popular Areas**: Major cities tend to have more restaurant options and competitive pricing

## 10. Conclusion

This exploratory data analysis of the Zomato dataset has revealed several important insights:

- **Market Concentration**: The restaurant market is heavily concentrated in major urban areas
- **Quality Correlation**: There's a clear relationship between price range and customer ratings
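- **Service Evolution**: Online delivery and table booking appear to be becoming standard features, although this dataset is a single snapshot and cannot show a trend over time (see the adoption percentages in Section 8)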
- **Cuisine Preferences**: Local cuisines dominate in most markets, with international options in metropolitan areas
- **Rating Distribution**: Most restaurants maintain good ratings, indicating quality standards

This analysis provides a solid foundation for data-driven decision making in the restaurant and food delivery industry.