In [None]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

# Cell 1 purpose: load the restaurant CSV, drop rows lacking the columns the
# geographical analysis relies on, make sure the coordinates are numeric and
# print summary statistics for them. Leaves the cleaned frame in `df`.
print("--- CELL 1: Data Preparation and Initial Exploration for Geographical Analysis ---")

# --- 1. Load the Dataset ---
print("--- Step 1: Loading the Dataset ---")
# NOTE(review): the name contains a space before '.csv' -- confirm it matches the file on disk.
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, inside a notebook, tears down the kernel without a traceback.
    raise FileNotFoundError(
        f"Error: '{file_path}' not found. Please ensure the dataset file is in the same directory."
    )

df = pd.read_csv(file_path)
print("Dataset loaded successfully!")

print(f"Initial dataset shape: {df.shape}")
print("Initial 5 rows:")
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))


# --- 2. Handle Missing Values relevant to Geographical Analysis ---
print("\n--- Step 2: Handling Missing Values ---")
print("Missing values before handling:")
# Count nulls once and reuse the result for both the filter and the report.
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0].to_markdown(numalign="left", stralign="left"))

# Drop rows where essential geographical or rating data might be missing
# For geographical analysis, Latitude, Longitude, City, Locality are crucial.
# Cuisines, Aggregate rating are also important for aggregated stats.
df.dropna(subset=['Latitude', 'Longitude', 'City', 'Locality', 'Cuisines', 'Aggregate rating'], inplace=True)
print(f"Dataset shape after dropping rows with missing geographical/key data: {df.shape}")
print("Missing values after handling:")
# Recount: the frame changed, so the earlier counts are stale.
missing_counts = df.isnull().sum()
print(missing_counts[missing_counts > 0].to_markdown(numalign="left", stralign="left"))


# --- 3. Convert Data Types (if necessary) ---
print("\n--- Step 3: Ensuring Correct Data Types ---")
# Ensure Latitude and Longitude are numeric; unparseable entries become NaN
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')
# Drop any rows that might have become NaN after coercion
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
print("Latitude and Longitude ensured as numeric.")


# --- 4. Initial Exploration of Latitude and Longitude ---
print("\n--- Step 4: Initial Exploration of Latitude and Longitude ---")
print("Descriptive statistics for Latitude and Longitude:")
print(df[['Latitude', 'Longitude']].describe().to_markdown(numalign="left", stralign="left"))

# Visualize their distribution (conceptual, as direct interactive map is not feasible here)
# If running locally, you would use libraries like matplotlib/seaborn for scatter plots
# or folium/plotly for interactive maps.

# Example: Scatter plot of Lat/Long (if running in a notebook that supports plots)
# plt.figure(figsize=(10, 8))
# sns.scatterplot(data=df, x='Longitude', y='Latitude', alpha=0.6, s=10, hue='Aggregate rating', palette='viridis')
# plt.title('Restaurant Distribution by Latitude and Longitude (Colored by Rating)')
# plt.xlabel('Longitude')
# plt.ylabel('Latitude')
# plt.grid(True, linestyle='--', alpha=0.7)
# plt.show()
# print("\n(A scatter plot of Latitude vs. Longitude would show the geographical spread of restaurants.)")

print("\nData preparation and initial exploration for geographical analysis complete.")


--- CELL 1: Data Preparation and Initial Exploration for Geographical Analysis ---
--- Step 1: Loading the Dataset ---
Dataset loaded successfully!
Initial dataset shape: (9551, 21)
Initial 5 rows:
| Restaurant ID   | Restaurant Name        | Country Code   | City             | Address                                                                 | Locality                                   | Locality Verbose                                             | Longitude   | Latitude   | Cuisines                         | Average Cost for two   | Currency         | Has Table booking   | Has Online delivery   | Is delivering now   | Switch to order menu   | Price range   | Aggregate rating   | Rating color   | Rating text   | Votes   |
|:----------------|:-----------------------|:---------------|:-----------------|:------------------------------------------------------------------------|:-------------------------------------------|:------------------------------------------------------------

In [None]:
import pandas as pd
import os
# Assuming df is available from Cell 1

# Cell 2 (first part) purpose: reload and clean the dataset so the cell can
# run on its own, count restaurants per city, and build `city_stats` with
# per-city rating, cost, price-range, vote and count aggregates.
print("--- CELL 2: Geographical Grouping and Aggregation ---")

# --- Re-run Data Preparation to ensure df is available ---
# This block is included for standalone execution if the user runs only this cell.
file_path = 'Dataset .csv'
if not os.path.exists(file_path):
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # and, inside a notebook, tears down the kernel without a traceback.
    raise FileNotFoundError(
        f"Error: '{file_path}' not found. Please ensure the dataset file is in the same directory."
    )
df = pd.read_csv(file_path)
df.dropna(subset=['Latitude', 'Longitude', 'City', 'Locality', 'Cuisines', 'Aggregate rating'], inplace=True)
# Coerce coordinates to numbers; anything unparseable becomes NaN and is dropped.
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df['Longitude'] = pd.to_numeric(df['Longitude'], errors='coerce')
df.dropna(subset=['Latitude', 'Longitude'], inplace=True)
print("Data reloaded and preprocessed for geographical analysis.")


# --- 1. Group by City and Analyze Concentration ---
print("\n--- Step 1: Grouping by City and Analyzing Concentration ---")
restaurants_by_city = df['City'].value_counts().reset_index()
# Set both column names explicitly so the frame has a predictable header.
restaurants_by_city.columns = ['City', 'Restaurant Count']
print("Top 10 Cities by Restaurant Count:")
print(restaurants_by_city.head(10).to_markdown(index=False, numalign="left", stralign="left"))


# --- 2. Calculate Statistics by City ---
print("\n--- Step 2: Calculating Statistics by City ---")


def _most_common_value(values):
    """Return the first mode of *values*, or None when there is no mode."""
    modes = values.mode()  # computed once per group
    return modes.iloc[0] if not modes.empty else None


city_stats = df.groupby('City').agg(
    Average_Rating=('Aggregate rating', 'mean'),
    Median_Rating=('Aggregate rating', 'median'),
    Average_Cost_for_Two=('Average Cost for two', 'mean'),
    Most_Common_Price_Range=('Price range', _most_common_value),
    Total_Votes=('Votes', 'sum'),
    Restaurant_Count=('Restaurant ID', 'count')
).reset_index()

# For most common cuisines, it's a bit more complex due to multi-label
def get_top_cuisines(cuisines_series, top_n=3):
    """Return the most frequent cuisine labels in a group as one string.

    Each entry of ``cuisines_series`` is a comma-separated list of cuisine
    labels. Entries are split on commas, surrounding whitespace is stripped
    from every label (so ``"Italian,Pizza"`` and ``"Italian, Pizza "`` count
    the same labels), and blank labels and missing entries are ignored.

    Args:
        cuisines_series: pandas Series of comma-separated cuisine strings.
        top_n: Maximum number of labels to include in the result.

    Returns:
        The ``top_n`` most frequent labels joined with ``', '`` in
        descending frequency order, or ``"N/A"`` when no label is present.
    """
    labels = (
        cuisines_series.dropna()  # drop missing first so .str never sees a non-string dtype
        .astype(str)
        .str.split(',')
        .explode()
        .str.strip()
    )
    # Blank labels come from empty strings or stray commas; they are not cuisines.
    labels = labels[labels != '']
    if labels.empty:
        return "N/A"
    top_cuisines = labels.value_counts().head(top_n).index.tolist()
    return ', '.join(top_cuisines)

# Top cuisines per city, held in a 'Top_Cuisines' column keyed by 'City'.
city_cuisine_stats = (
    df.groupby('City')['Cuisines']
    .apply(get_top_cuisines)
    .reset_index()
    .rename(columns={'Cuisines': 'Top_Cuisines'})
)

# Attach the cuisine summary to the numeric city stats, busiest cities first.
city_analysis_df = city_stats.merge(city_cuisine_stats, on='City').sort_values(
    by='Restaurant_Count', ascending=False
)

print("\nTop 10 Cities by Restaurant Count with Key Statistics:")
city_top10 = city_analysis_df.head(10)
print(city_top10.to_markdown(index=False, numalign="left", stralign="left"))


# --- 3. Group by Locality (within Cities) and Analyze Concentration ---
print("\n--- Step 3: Grouping by Locality and Analyzing Concentration ---")
# Prefix each locality with its city so the label identifies one area
df['City_Locality'] = df['City'] + ' - ' + df['Locality']
restaurants_by_locality = (
    df['City_Locality']
    .value_counts()
    .rename_axis('City_Locality')
    .reset_index(name='Restaurant Count')
)
print("\nTop 10 Localities (City - Locality) by Restaurant Count:")
locality_count_top10 = restaurants_by_locality.head(10)
print(locality_count_top10.to_markdown(index=False, numalign="left", stralign="left"))


# --- 4. Calculate Statistics by Locality ---
print("\n--- Step 4: Calculating Statistics by Locality ---")
locality_stats = df.groupby('City_Locality').agg(
    Average_Rating=pd.NamedAgg(column='Aggregate rating', aggfunc='mean'),
    Median_Rating=pd.NamedAgg(column='Aggregate rating', aggfunc='median'),
    Average_Cost_for_Two=pd.NamedAgg(column='Average Cost for two', aggfunc='mean'),
    # First mode of the group's price ranges, or None when there is none.
    Most_Common_Price_Range=pd.NamedAgg(
        column='Price range',
        aggfunc=lambda prices: None if prices.mode().empty else prices.mode()[0],
    ),
    Total_Votes=pd.NamedAgg(column='Votes', aggfunc='sum'),
    Restaurant_Count=pd.NamedAgg(column='Restaurant ID', aggfunc='count'),
).reset_index()

# Same cuisine summary as for cities, this time per 'City_Locality'.
locality_cuisine_stats = (
    df.groupby('City_Locality')['Cuisines']
    .apply(get_top_cuisines)
    .reset_index()
    .rename(columns={'Cuisines': 'Top_Cuisines'})
)

locality_analysis_df = locality_stats.merge(
    locality_cuisine_stats, on='City_Locality'
).sort_values(by='Restaurant_Count', ascending=False)

print("\nTop 10 Localities (City - Locality) by Restaurant Count with Key Statistics:")
locality_top10 = locality_analysis_df.head(10)
print(locality_top10.to_markdown(index=False, numalign="left", stralign="left"))

print("\nGeographical grouping and aggregation complete.")


--- CELL 2: Geographical Grouping and Aggregation ---
Data reloaded and preprocessed for geographical analysis.

--- Step 1: Grouping by City and Analyzing Concentration ---
Top 10 Cities by Restaurant Count:
| City         | Restaurant Count   |
|:-------------|:-------------------|
| New Delhi    | 5473               |
| Gurgaon      | 1118               |
| Noida        | 1080               |
| Faridabad    | 251                |
| Ghaziabad    | 25                 |
| Bhubaneshwar | 21                 |
| Ahmedabad    | 21                 |
| Lucknow      | 21                 |
| Guwahati     | 21                 |
| Amritsar     | 21                 |

--- Step 2: Calculating Statistics by City ---

Top 10 Cities by Restaurant Count with Key Statistics:
| City         | Average_Rating   | Median_Rating   | Average_Cost_for_Two   | Most_Common_Price_Range   | Total_Votes   | Restaurant_Count   | Top_Cuisines                       |
|:-------------|:-----------------|:--------------

In [None]:
import pandas as pd
import os
# Assuming df, city_analysis_df, locality_analysis_df are available from Cell 1 and 2

print("--- CELL 3: Identify Insights and Patterns Related to Locations ---")

# Nothing is recomputed here: city_analysis_df and locality_analysis_df must
# already exist in the session.
print("--- Dataframes for analysis are prepared (assuming Cell 1 and 2 were executed). ---")


# --- 1. Overall Restaurant Concentration ---
print("\n### 1. Overall Restaurant Concentration ###")
print("Cities with the highest number of restaurants tend to be major metropolitan areas, indicating a higher demand and supply of dining options in these urban centers.")

# Render the first five rows of each analysis frame as markdown tables.
busiest_cities_md = (
    city_analysis_df[['City', 'Restaurant_Count']]
    .head()
    .to_markdown(index=False, numalign='left', stralign='left')
)
print(f"Top 5 Cities by Restaurant Count:\n{busiest_cities_md}")

busiest_localities_md = (
    locality_analysis_df[['City_Locality', 'Restaurant_Count']]
    .head()
    .to_markdown(index=False, numalign='left', stralign='left')
)
print(f"\nTop 5 Localities by Restaurant Count:\n{busiest_localities_md}")
print("Within cities, certain localities stand out as restaurant hubs, likely due to factors like commercial activity, residential density, or tourist attractions.")


# --- 2. Rating Patterns by Location ---
# Ranks cities and localities by their mean 'Average_Rating' and prints the
# five best of each.
print("\n### 2. Rating Patterns by Location ###")
print("Analyzing average ratings by city/locality can reveal areas known for higher quality dining experiences.")
top_rated_cities = city_analysis_df.sort_values(by='Average_Rating', ascending=False).head(5)
print(f"\nTop 5 Cities by Average Rating:\n{top_rated_cities[['City', 'Average_Rating', 'Restaurant_Count']].to_markdown(index=False, numalign='left', stralign='left')}")

# Filter out localities with very few restaurants BEFORE ranking. Taking the
# top five first and filtering afterwards can leave fewer than five rows (or
# none) whenever the highest averages belong to tiny localities.
min_restaurants = 5  # Example threshold
top_rated_localities = (
    locality_analysis_df[locality_analysis_df['Restaurant_Count'] >= min_restaurants]
    .sort_values(by='Average_Rating', ascending=False)
    .head(5)
)
print(f"\nTop 5 Localities (with >= 5 restaurants) by Average Rating:\n{top_rated_localities[['City_Locality', 'Average_Rating', 'Restaurant_Count']].to_markdown(index=False, numalign='left', stralign='left')}")
print("Cities/Localities with higher average ratings might indicate a more competitive culinary scene or a higher standard of restaurants.")


# --- 3. Cuisine Distribution by Location ---
print("\n### 3. Cuisine Distribution by Location ###")
print("The most common cuisines can vary significantly by city or even locality, reflecting local tastes, demographics, and cultural influences.")
print("\nExample: Top Cuisines in Cities with High Restaurant Count:")
city_cuisine_view = city_analysis_df[['City', 'Top_Cuisines', 'Restaurant_Count']].head(5)
print(city_cuisine_view.to_markdown(index=False, numalign='left', stralign='left'))
print("\nExample: Top Cuisines in Specific Localities:")
# First five rows of the locality frame serve as the sample
sample_localities = locality_analysis_df.head(5)
locality_cuisine_view = sample_localities[['City_Locality', 'Top_Cuisines', 'Restaurant_Count']]
print(locality_cuisine_view.to_markdown(index=False, numalign='left', stralign='left'))
print("This analysis helps understand regional culinary trends and market saturation for certain cuisine types.")


# --- 4. Price Range and Cost by Location ---
print("\n### 4. Price Range and Cost by Location ###")
print("Average cost and most common price ranges can highlight the economic profile of dining in different areas.")
print("\nCities by Average Cost for Two and Most Common Price Range:")
city_cost_columns = ['City', 'Average_Cost_for_Two', 'Most_Common_Price_Range', 'Restaurant_Count']
priciest_cities = (
    city_analysis_df[city_cost_columns]
    .sort_values(by='Average_Cost_for_Two', ascending=False)
    .head(5)
)
print(priciest_cities.to_markdown(index=False, numalign='left', stralign='left'))
print("\nLocalities by Average Cost for Two and Most Common Price Range:")
locality_cost_columns = ['City_Locality', 'Average_Cost_for_Two', 'Most_Common_Price_Range', 'Restaurant_Count']
priciest_localities = (
    locality_analysis_df[locality_cost_columns]
    .sort_values(by='Average_Cost_for_Two', ascending=False)
    .head(5)
)
print(priciest_localities.to_markdown(index=False, numalign='left', stralign='left'))
print("High average costs and price ranges often correlate with affluent areas or fine-dining districts, while lower ranges might indicate areas with more casual or budget-friendly options.")


# --- 5. Insights from Votes/Popularity ---
print("\n### 5. Insights from Votes/Popularity ###")
print("Total votes can indicate the overall dining activity and popularity of restaurants in a given area.")
print("\nCities by Total Votes:")
most_voted_cities = (
    city_analysis_df[['City', 'Total_Votes', 'Restaurant_Count']]
    .sort_values(by='Total_Votes', ascending=False)
    .head(5)
)
print(most_voted_cities.to_markdown(index=False, numalign='left', stralign='left'))
print("\nLocalities by Total Votes:")
most_voted_localities = (
    locality_analysis_df[['City_Locality', 'Total_Votes', 'Restaurant_Count']]
    .sort_values(by='Total_Votes', ascending=False)
    .head(5)
)
print(most_voted_localities.to_markdown(index=False, numalign='left', stralign='left'))
print("Areas with high total votes suggest active dining scenes and popular establishments, which could be a good indicator for new restaurant ventures or marketing efforts.")


print("\nGeographical analysis complete. These insights can be valuable for business expansion, local marketing, or understanding market dynamics.")

--- CELL 3: Identify Insights and Patterns Related to Locations ---
--- Dataframes for analysis are prepared (assuming Cell 1 and 2 were executed). ---

### 1. Overall Restaurant Concentration ###
Cities with the highest number of restaurants tend to be major metropolitan areas, indicating a higher demand and supply of dining options in these urban centers.
Top 5 Cities by Restaurant Count:
| City      | Restaurant_Count   |
|:----------|:-------------------|
| New Delhi | 5473               |
| Gurgaon   | 1118               |
| Noida     | 1080               |
| Faridabad | 251                |
| Ghaziabad | 25                 |

Top 5 Localities by Restaurant Count:
| City_Locality               | Restaurant_Count   |
|:----------------------------|:-------------------|
| New Delhi - Connaught Place | 122                |
| New Delhi - Rajouri Garden  | 99                 |
| New Delhi - Shahdara        | 87                 |
| New Delhi - Defence Colony  | 86                 |
| Ne