In [None]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import boto3
import os
from dotenv import load_dotenv
import warnings 
warnings.filterwarnings('ignore')

In [None]:
# AWS credentials: load the local .env file into the process environment, then
# read both keys from it. A missing variable raises KeyError at this point.
load_dotenv()
Access_key, Secret_key = os.environ['access_key'], os.environ['secret_key']

In [None]:
# Configuration setup: create the S3 client with the explicit key pair read above
aws_credentials = {
    'aws_access_key_id': Access_key,
    'aws_secret_access_key': Secret_key,
}
s3_client = boto3.client('s3', **aws_credentials)

In [None]:
# Load the combined Zomato dataset; the path is relative to the notebook's working directory
df = pd.read_csv('Zomato_combined_data.csv')

****Exploratory Data Analysis****

In [None]:
# Summary of the DataFrame: column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

In [None]:
# Descriptive statistics for the object (string) columns: count, unique, top and freq
df.describe(include="object")

In [None]:
# Display the first five rows of the DataFrame as a quick sanity check of the load
df.head()

***Missing Values***

In [None]:
# Count the missing (NaN) entries in every column and display the counts
missing_values = df.isna().sum()
missing_values

In [None]:
# Replace NaN cuisines with a hand-picked cuisine for the affected restaurants.
# The mapping is applied through fillna, so only rows whose 'Cuisines' value is
# actually missing are changed; a row with one of these names that already has
# cuisines listed keeps its own data. Re-running the cell is a no-op.
# 'Cocktails' is written without a trailing space so it does not end up as a
# separate category when cuisines are split and counted further down.
cuisine_for_restaurant = {
    'HI Lite Bar & Lounge': 'Bar',
    'Hillstone': 'Thai',
    "Jimmie's Hot Dogs": 'Hot dogs',
    "Pearly's Famous Country Cookng": 'American',  # name spelled as it appears in the data
    'Cookie Shoppe': 'Cookies',
    'Corkscrew Cafe': 'Cafe',
    'Tybee Island Social Club': 'Coastal',
    'Dovetail': 'Cocktails',
    "Leonard's Bakery": 'Bakery',
}
# Restaurants that are not in the mapping map to NaN and are therefore left untouched.
df['Cuisines'] = df['Cuisines'].fillna(df['Restaurant_name'].map(cuisine_for_restaurant))

In [None]:
# Fill the remaining missing values with forward fill.
# DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in recent pandas releases. The result is re-bound instead of
# mutated in place.
# NOTE(review): forward fill copies the value from the previous row, which
# belongs to a different restaurant, and leaves NaNs in the first row unfilled.
# Confirm this is acceptable for every column that has gaps.
df = df.ffill()

In [None]:
# Display the DataFrame after the missing-value handling above
df

In [None]:
# Filtering Irrelevant Columns
# Drop columns that the analysis below does not use. The frame is re-bound
# rather than modified in place, and errors='ignore' skips columns that are
# already gone, so re-running this cell does not raise KeyError.
irrelevant_columns = ['Locality', 'Locality_Verbose', 'Rating_color', 'Rating_text']
df = df.drop(columns=irrelevant_columns, errors='ignore')

****Data Visualization****

In [None]:
# 1. Average Aggregate Ratings for Each Restaurant
# Compute the mean rating per restaurant name once, up front. Handing the raw
# rows to sns.barplot makes it bootstrap a confidence interval for every single
# restaurant name, which is very slow with this many categories; the bar heights
# are the same means either way. sort=False keeps the restaurants in order of
# first appearance, which is the order barplot itself would use. Because each
# bar now has a single value, no error bars are drawn.
avg_rating_by_restaurant = (
    df.groupby('Restaurant_name', sort=False, as_index=False)['Aggregate_rating'].mean()
)

plt.figure(figsize=(12, 6))
sns.barplot(x='Aggregate_rating', y='Restaurant_name', data=avg_rating_by_restaurant, palette='viridis')
plt.title('Average Aggregate Ratings for Each Restaurant')
plt.xlabel('Average Rating')
plt.ylabel('Restaurant Name')
plt.tight_layout()
plt.show()

In [None]:
# 2. Aggregate Ratings for the Top 20 Restaurants
# The 20 highest-rated rows; this frame is reused by a later cuisine chart.
top_20_restaurants = df.sort_values(by='Aggregate_rating', ascending=False).iloc[:20]

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top_20_restaurants,
    x='Aggregate_rating',
    y='Restaurant_name',
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 20 Restaurants Based on Aggregate Ratings')
ax.set_xlabel('Aggregate Rating')
ax.set_ylabel('Restaurant Name')
fig.tight_layout()
plt.show()

In [None]:
# 3. Scatter Plot: Individual Aggregate Ratings
# x: how many comma-separated cuisines each row lists; y: that row's rating
cuisines_per_row = df['Cuisines'].str.split(', ').str.len()

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=cuisines_per_row, y=df['Aggregate_rating'], alpha=0.5, ax=ax)
ax.set_title('Individual Aggregate Ratings vs. Number of Cuisines')
ax.set_xlabel('Number of Cuisines')
ax.set_ylabel('Aggregate Rating')
ax.grid()
fig.tight_layout()
plt.show()

In [None]:
# 4. Pie Chart: Distribution of Cuisines Across All Cities
# Split each comma-separated cuisine list, flatten to one cuisine per row, then count
cuisine_counts = df['Cuisines'].str.split(', ').explode().value_counts()

fig, ax = plt.subplots(figsize=(10, 10))
cuisine_counts.plot.pie(ax=ax, autopct='%1.1f%%', startangle=90)
ax.set_title('Distribution of Cuisines Across All Cities')
ax.set_ylabel('')  # hide the default y-axis label that pandas puts on pie charts
plt.show()

In [None]:
# Select top 20 cuisines
# Split the comma-separated lists and count individual cuisines, as the other
# cuisine charts in this notebook do. Counting the raw 'Cuisines' strings would
# rank whole combinations (every distinct comma-joined string) instead of
# single cuisines, which is not what the chart title promises.
top_cuisines = df['Cuisines'].str.split(', ').explode().value_counts().head(20)

# Create pie chart
# Percentages are shares within these 20 cuisines, not within the whole dataset.
plt.figure(figsize=(12, 8))
plt.pie(top_cuisines, labels=top_cuisines.index, autopct='%1.1f%%')
plt.title('Top 20 Cuisines Across All Cities')
plt.show()

In [None]:
# 5. Bar Chart: Distribution of Cuisines for the Top 20 Restaurants
# Count each individual cuisine listed by the 20 highest-rated rows
top_20_cuisines = top_20_restaurants['Cuisines'].str.split(', ').explode().value_counts()

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_20_cuisines.index, y=top_20_cuisines.values, palette='viridis', ax=ax)
ax.set_title('Distribution of Cuisines for the Top 20 Restaurants')
ax.set_xlabel('Cuisines')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Distinct values of the raw 'Cuisines' column (each value is a full comma-separated string)
df["Cuisines"].unique()

In [None]:
# How often each raw 'Cuisines' string occurs, most frequent first
df["Cuisines"].value_counts()

In [None]:
# Split the cuisines into individual categories:
# each 'Cuisines' string becomes a list of names, split on ', '
cuisines_series = df['Cuisines'].str.split(', ')
cuisines_flat = cuisines_series.explode()  # Flatten the lists: one cuisine per row, original index repeated

In [None]:
# Count the occurrences of each individual cuisine category, most frequent first
cuisine_counts = cuisines_flat.value_counts()

In [None]:
# Keep only the N most frequent cuisine categories (counts are already sorted descending)
top_n = 20
top_cuisines = cuisine_counts.iloc[:top_n]

In [None]:
# Horizontal bar plot of the most frequent cuisine categories
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=top_cuisines.values, y=top_cuisines.index, palette='viridis', ax=ax)
ax.set_title(f'Top {top_n} Cuisine Categories Count')
ax.set_xlabel('Count')
ax.set_ylabel('Cuisine')
fig.tight_layout()  # keep the labels inside the figure
plt.show()

In [None]:
# Rank rows by 'Aggregate_rating', highest first, and keep the first 20
rows_by_rating = df.sort_values(by='Aggregate_rating', ascending=False)
top_restaurants = rows_by_rating.iloc[:20]

In [None]:
# Bar plot of the top 20 restaurants, with bars coloured by city
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(
    data=top_restaurants,
    x='Aggregate_rating',
    y='Restaurant_name',
    hue='City',
    palette='viridis',
    ax=ax,
)
ax.set_title('Top 20 Restaurants Based on Aggregate Ratings')
ax.set_xlabel('Aggregate Rating')
ax.set_ylabel('Restaurant Name')
# Anchor the legend to the right of the axes so it does not cover the bars
ax.legend(title='City', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Clean the 'Average_Cost_for_two' column; the 'Currency' column is left as it is.
# Strip thousands separators, then convert to numbers; anything that still
# cannot be parsed becomes NaN (errors='coerce').
cost_without_commas = df['Average_Cost_for_two'].replace(',', '', regex=True)
df['Average_Cost_for_two'] = pd.to_numeric(cost_without_commas, errors='coerce')

In [None]:
# Mean cost for two per currency, returned as a flat two-column frame
average_cost_by_currency = (
    df.groupby('Currency', as_index=False)['Average_Cost_for_two'].mean()
)

In [None]:
# Display the per-currency averages computed above
average_cost_by_currency

In [None]:
# Bar plot of the average cost for two, one bar per currency.
# The values are in each currency's own units, so bar heights are not comparable across currencies.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(
    data=average_cost_by_currency,
    x='Currency',
    y='Average_Cost_for_two',
    palette='viridis',
    ax=ax,
)
ax.set_title('Average Cost for Two by Currency')
ax.set_xlabel('Currency')
ax.set_ylabel('Average Cost for Two (in respective currency)')
plt.xticks(rotation=45)  # slanted x labels are easier to read
fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the float64/int64 columns, drawn as an annotated heatmap
correlation_matrix = df.select_dtypes(include=['float64', 'int64']).corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,       # print the coefficient inside every cell
    fmt=".2f",
    cmap='coolwarm',
    ax=ax,
)
ax.set_title('Correlation Matrix of Numeric Features')
plt.show()

In [None]:
# Create the map: one point per row, placed by its latitude/longitude columns
fig = px.scatter_geo(
    data_frame=df,
    lat='Latitude',
    lon='Longitude',
    hover_name='Restaurant_name',
    title='Restaurant Locations',
    projection='natural earth',  # another projection type can be used here if needed
)

In [None]:
# Render the interactive map figure built in the previous cell
fig.show()

In [None]:
# Histograms to visualize the distribution of the DataFrame's columns
df.hist(figsize=(12,12))
plt.show()

In [None]:
# Save the cleaned DataFrame to a CSV file in the working directory;
# index=False leaves the row index out of the file
df.to_csv('Zomato_cleaned_data.csv', index=False)

In [None]:
# Upload the cleaned CSV written above to S3
file_path = "Zomato_cleaned_data.csv"
s3_client.upload_file(
    Filename=file_path,
    Bucket='chefmatebucket1',
    Key='datas/Zomato_cleaned_data.csv',  # folder/filename
)