<a href="https://colab.research.google.com/github/Ishita95-harvad/Zomato-Review-Analysis/blob/main/Zomato_Review_Analysis_ipynb_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Machine Learning Workflow for Zomato Review Data**

**Step 1: Load and Preprocess Data**

In [None]:
import base64
import io
import os
import zipfile

import dash
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from dash import dcc, html, Input, Output
from PIL import Image
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from wordcloud import WordCloud

In [None]:
# Extract the ZIP archive unless a previous run already produced files.
zip_file_path = "67a0615431763_resources.zip"
extract_folder = "extracted_zomato_reviews"

# Only a NON-EMPTY folder counts as "already extracted". An empty folder is
# what a failed earlier attempt leaves behind, and treating it as done would
# skip extraction forever.
if os.path.isdir(extract_folder) and os.listdir(extract_folder):
    print("Files already extracted.")
elif not os.path.isfile(zip_file_path):
    # Fail before creating the folder so a missing archive cannot leave an
    # empty directory behind.
    raise FileNotFoundError(
        f"Archive '{zip_file_path}' not found; upload it before running this cell."
    )
else:
    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    print("Extraction completed. Files:", os.listdir(extract_folder))

Files already extracted.


In [None]:
import os
import zipfile
import pandas as pd

extract_folder = "extracted_zomato_reviews"
zip_file_path = "67a0615431763_resources.zip"  # Assuming this is the correct zip file name

# Extract unless the folder already holds files. An existing but empty folder
# is treated as "not extracted": it is what a failed earlier attempt leaves
# behind, and skipping extraction then would leave nothing to load.
if os.path.isdir(extract_folder) and os.listdir(extract_folder):
    print("Files already extracted.")
elif not os.path.isfile(zip_file_path):
    raise FileNotFoundError(
        f"Archive '{zip_file_path}' not found; upload it before running this cell."
    )
else:
    os.makedirs(extract_folder, exist_ok=True)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_folder)
    print("Extraction completed. Files:", os.listdir(extract_folder))

# Print all files in the extracted folder for verification (sorted so the
# listing and the load order below are deterministic).
extracted_files = sorted(os.listdir(extract_folder))
print(extracted_files)

# Load the dataset safely: try each .xlsx workbook until one loads.
file_path = None
df = None  # Stays None only if no workbook could be loaded

for file in extracted_files:
    if not file.endswith(".xlsx"):  # Check only for .xlsx extension
        continue
    file_path = os.path.join(extract_folder, file)
    print(f"Found potential dataset file: {file_path}")
    try:
        # Update 'Zomato Review Kaggle' with the actual sheet name if different
        df = pd.read_excel(file_path, sheet_name="Zomato Review Kaggle")
    except (FileNotFoundError, ValueError) as e:
        print(f"Error loading file: {file_path}. Trying next file. Error: {e}")
        continue
    print(f"Successfully loaded dataset from: {file_path}")
    break  # Exit loop if successful

# Stop here with a clear message instead of letting later cells fail on
# `None.head()` with an unrelated-looking AttributeError.
if df is None:
    raise FileNotFoundError(
        f"No loadable .xlsx dataset found in '{extract_folder}'. "
        f"Files present: {extracted_files}"
    )



Files already extracted.
[]


In [None]:
# Show the first rows of the dataset, then its column/dtype summary
head_rows = df.head()
print("Dataset Head:\n", head_rows)
print("\nDataset Info:\n")
df.info()


In [None]:

# Count plot of the `Liked` label to show how the two classes are distributed
ax = sns.countplot(x=df['Liked'])
ax.set_title("Distribution of Positive and Negative Reviews")
plt.show()

**Step 2: Feature Engineering**

To predict review sentiment (the `Liked` label), extract meaningful features from the review text:

In [None]:
# WordCloud for Positive and Negative Reviews
# Concatenate the review text per sentiment class. Missing reviews are dropped
# and the rest coerced to str, because str.join raises TypeError on NaN or
# other non-string items.
positive_reviews = " ".join(df.loc[df['Liked'] == 1, 'Review'].dropna().astype(str))
negative_reviews = " ".join(df.loc[df['Liked'] == 0, 'Review'].dropna().astype(str))

def generate_wordcloud(text):
    """Render a word cloud for ``text`` and return it as a base64 PNG string.

    The cloud is generated at 800x400 on a white background, saved as PNG
    into an in-memory buffer, and the PNG bytes are base64-encoded and
    decoded to ``str``.
    """
    cloud = WordCloud(width=800, height=400, background_color='white').generate(text)
    png_buffer = io.BytesIO()
    cloud_image = Image.fromarray(cloud.to_array())
    cloud_image.save(png_buffer, format='PNG')
    png_bytes = png_buffer.getvalue()
    return base64.b64encode(png_bytes).decode()

# Encode one word-cloud image per sentiment class (positive first, then negative)
pos_wc_img, neg_wc_img = map(generate_wordcloud, (positive_reviews, negative_reviews))

In [None]:
# Text Processing using TF-IDF
# NOTE(review): the vectorizer is fitted on every review before the train/test
# split performed in a later cell, so test reviews contribute to the fitted
# vocabulary and weights.
y = df['Liked']
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['Review'])

**Step 3: Prepare Data for Training**

In [None]:

# Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Model Training with Hyperparameter Tuning (Naive Bayes)
# 5-fold cross-validated search over the candidate `alpha` values.
param_grid = dict(alpha=[0.1, 0.5, 1.0, 2.0, 5.0])
grid_search = GridSearchCV(estimator=MultinomialNB(), param_grid=param_grid, cv=5)
grid_search.fit(X_train, y_train)
best_nb_model = grid_search.best_estimator_

In [None]:
# Model Training (SVM for comparison)
# Linear-kernel SVC trained on the same TF-IDF training split.
svm_model = SVC(kernel='linear')
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Deep Learning Model (LSTM with dropout)
max_words = 5000  # vocabulary size cap for the tokenizer and embedding
max_len = 100     # every sequence is padded/truncated to this length

y_dl = np.array(df['Liked'])

# Split row positions first so the tokenizer vocabulary is learned from the
# training reviews only; fitting it on all reviews would let test-set text
# influence the preprocessing.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Review'].iloc[train_idx])

# Encode every review with the train-fitted tokenizer; words it has not seen
# map to the "<OOV>" token.
X_dl = tokenizer.texts_to_sequences(df['Review'])
X_dl = pad_sequences(X_dl, maxlen=max_len)

X_train_dl, X_test_dl = X_dl[train_idx], X_dl[test_idx]
y_train_dl, y_test_dl = y_dl[train_idx], y_dl[test_idx]

# Embedding -> LSTM -> dense head with a single sigmoid output for the
# binary `Liked` label.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train Deep Learning Model with Early Stopping
# Validation data is carved out of the training arrays (validation_split) so
# the test set is not used to decide when to stop. Training halts once
# val_loss has not improved for 2 epochs and the best weights are restored.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
model.fit(
    X_train_dl,
    y_train_dl,
    epochs=10,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping],
)

In [None]:
# Evaluate Deep Learning Model
# The model was compiled with one metric (accuracy), so evaluate() yields
# the loss and that metric.
test_metrics = model.evaluate(X_test_dl, y_test_dl)
loss, accuracy = test_metrics
print("Deep Learning Model Accuracy:", accuracy)

**Step 4: Initialize the Dashboard App**

In [None]:
# Dashboard Implementation
# Create the Dash application object that the callback and layout are attached to.
app = dash.Dash(name=__name__)

**Step 5: Build the Dashboard Layout and Run the App**

In [None]:
# Column Chart for Review Sentiment Distribution
# Count the rows per `Liked` value once and plot the counts as bars.
sentiment_counts = df['Liked'].value_counts()
fig = px.bar(
    df,
    x=sentiment_counts.index,
    y=sentiment_counts.values,
    labels={'x': 'Sentiment', 'y': 'Count'},
    title='Review Sentiment Distribution',
)

@app.callback(
    Output('filtered-graph', 'figure'),
    Input('rating-filter', 'value')
)
def update_graph(selected_rating):
    """Return a histogram of the `Review` column for the selected sentiment.

    `selected_rating` is the current value of the 'rating-filter' dropdown.
    When it is None every row of `df` is plotted; otherwise only the rows
    whose `Liked` value equals it.
    """
    subset = df if selected_rating is None else df[df['Liked'] == selected_rating]
    return px.histogram(subset, x='Review', title='Filtered Reviews')

# Page layout: title, sentiment dropdown, filtered histogram, overall
# distribution chart, then the positive and negative word-cloud images.
sentiment_options = [
    {'label': label, 'value': value}
    for label, value in (('Positive', 1), ('Negative', 0))
]
wordcloud_images = [
    html.Img(src='data:image/png;base64,' + encoded, style={'width': '50%'})
    for encoded in (pos_wc_img, neg_wc_img)
]

app.layout = html.Div([
    html.H1("Zomato Review Sentiment Analysis Dashboard"),
    dcc.Dropdown(
        id='rating-filter',
        options=sentiment_options,
        placeholder='Select Review Sentiment',
    ),
    dcc.Graph(id='filtered-graph'),
    dcc.Graph(figure=fig),
    *wordcloud_images,
])

# Start the Dash development server. `app.run` is used because `run_server`
# is no longer available in Dash 3, the version this notebook's install cell
# pulls in (dash-3.0.1).
if __name__ == '__main__':
    app.run(debug=True)


In [None]:
# Model Evaluation
# Predict on the held-out TF-IDF test split with both classical models.
y_pred_nb = best_nb_model.predict(X_test)
y_pred_svm = svm_model.predict(X_test)

# Compute each metric once, then report them grouped by metric type.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
svm_accuracy = accuracy_score(y_test, y_pred_svm)
nb_confusion = confusion_matrix(y_test, y_pred_nb)
svm_confusion = confusion_matrix(y_test, y_pred_svm)
nb_report = classification_report(y_test, y_pred_nb)
svm_report = classification_report(y_test, y_pred_svm)

print("Naive Bayes Best Params:", grid_search.best_params_)
print("Naive Bayes Accuracy:", nb_accuracy)
print("SVM Accuracy:", svm_accuracy)
print("Naive Bayes Confusion Matrix:\n", nb_confusion)
print("SVM Confusion Matrix:\n", svm_confusion)
print("Naive Bayes Classification Report:\n", nb_report)
print("SVM Classification Report:\n", svm_report)

# **2️⃣ Interactive Dashboard Workflow (Plotly & Dash)**

**Step 1: Install Required Libraries**

In [None]:
pip install dash plotly


Collecting dash
  Downloading dash-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting Flask<3.1,>=1.0.4 (from dash)
  Downloading flask-3.0.3-py3-none-any.whl.metadata (3.2 kB)
Collecting Werkzeug<3.1 (from dash)
  Downloading werkzeug-3.0.6-py3-none-any.whl.metadata (3.7 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Downloading dash-3.0.1-py3-none-any.whl (8.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.0/8.0 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading flask-3.0.3-py3-none-any.whl (101 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading werkzeug-3.0.6-py3-none-any.whl (227 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading retrying-1.3.4-py3-none-any.whl (11 kB)
Installing collected packages: Werkzeug, retryi

**Step 2: Create a Dashboard with Key Metrics**