In [None]:
import requests
import csv
import time
import random
import os

# Destination for the scraped reviews: a CSV in the current working directory.
csv_file = os.path.join(os.getcwd(), "talabat_15_restaurants_reviews.csv")

# Restaurant display name -> branch ID used in the reviews API URL.
# Built from ordered pairs so the scrape order is the order listed here.
restaurants = dict([
    ("99 Grill", 624149),
    ("Pizza Hut", 600352),
    ("Ninja Sushi", 734783),
    ("Wazzup Dog", 628339),
    ("Between Buns", 40177),
    ("Dominos Pizza", 9778),
    ("Shawerma Reem", 650859),
    ("Shawarmaati", 47347),
    ("Mr Hotdog", 661804),
    ("Xn Shawerma", 736547),
    ("Burger Maker", 683243),
    ("Ibra Sandwich", 760475),
    ("Crispy Chicken", 662822),
    ("Buffalo Wings Rings", 638830),
    ("Chicken Kingdom", 730696),
])

# Accumulates one dict per review across all restaurants.
all_reviews = []

# The headers and page size are the same for every request, so build them once.
# "*/*" is the wildcard media range; a bare "/" is not a valid Accept value.
headers = {
    "User-Agent": "Mozilla/5.0",
    "accept": "application/json, text/plain, */*"
}
page_size = 50  # number of reviews per request

for name, branch_id in restaurants.items():
    print(f"Scraping reviews for {name} (branch {branch_id})...")

    page = 1
    total_pages = 1  # will update from first request

    while page <= total_pages:
        url = f"https://www.talabat.com/nextFeedbackApi/branches/{branch_id}/reviews/{page}/{page_size}"
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            response = requests.get(url, headers=headers, timeout=30)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Network/HTTP failure or a non-JSON body: keep what was collected so far
            # and move on to the next restaurant instead of losing everything.
            print(f"  Stopping {name} at page {page}: {exc}")
            break

        if not isinstance(data, dict):
            print(f"  Stopping {name} at page {page}: unexpected response shape")
            break

        # Update total pages from API response (a missing or null value means one page)
        total_pages = data.get("totalPages") or 1

        # Extract reviews ("details" may be absent or null)
        for review in data.get("details") or []:
            all_reviews.append({
                "restaurant_name": name,
                "review_date": review.get("date"),
                "rating": review.get("rate"),
                "review_text": review.get("review")
            })

        page += 1
        time.sleep(random.uniform(0.5, 1.5))  # polite delay

# Save CSV
csv_file = os.path.join(os.getcwd(), "talabat_15_restaurants_reviews.csv")
# Column names are fixed (they are the keys of every review dict built by the scraper),
# so the header can be written even when no reviews were collected; indexing
# all_reviews[0] would raise IndexError on an empty list.
keys = ["restaurant_name", "review_date", "rating", "review_text"]
with open(csv_file, "w", newline="", encoding="utf-8") as f:
    dict_writer = csv.DictWriter(f, fieldnames=keys)
    dict_writer.writeheader()
    dict_writer.writerows(all_reviews)

if all_reviews:
    print(f"\n Successfully saved {len(all_reviews)} reviews to CSV!")
else:
    print("\n No reviews were scraped; wrote a header-only CSV.")
print(f"File location: {csv_file}")

In [None]:
import pandas as pd
import re
import torch
import emoji
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np

# ------------------------
# 1. Load data
# ------------------------
# Raw scraped reviews, one row per review.
# NOTE(review): the scraping cell writes this file to os.getcwd(); confirm the file
# actually exists at this hardcoded location before running.
df = pd.read_csv(r"C:\Users\Kat\Downloads\talabat_15_restaurants_reviews.csv", encoding='utf-8')

# ------------------------
# 2. Preprocessing
# ------------------------
def preprocess(text):
    """Normalise a single review before it is passed to the sentiment model.

    The value is coerced with ``str``, emojis are replaced by their text names
    (delimited by spaces), runs of whitespace become one space, and any character
    repeated three or more times in a row is shortened to two.
    """
    demojized = emoji.demojize(str(text), delimiters=(" ", " "))
    single_spaced = re.sub(r'\s+', ' ', demojized).strip()
    # "soooo" -> "soo": keeps a hint of emphasis while limiting elongation.
    return re.sub(r'(.)\1{2,}', r'\1\1', single_spaced)

# Cleaned copy of every review; the raw 'review_text' column is left untouched.
df['cleaned_text'] = df['review_text'].apply(preprocess)

# ------------------------
# 3. Load model
# ------------------------
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# output_attentions=True makes the forward pass expose `.attentions`, which
# get_top_attention_words reads.
model = AutoModelForSequenceClassification.from_pretrained(model_name, output_attentions=True)
# Switch to inference mode.
model.eval()

# Names for the five output classes; not referenced by the code visible in this notebook.
labels = ['1_star', '2_star', '3_star', '4_star', '5_star']

# ------------------------
# 4. Prediction function
# ------------------------
def predict_sentiment(text):
    """Run the star-rating classifier on one text.

    Returns a ``(stars, confidence)`` pair: the 1-based index of the most
    probable class and the softmax probability of that class.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        class_probs = torch.softmax(model(**encoded).logits, dim=1).cpu().numpy()[0]
    best = np.argmax(class_probs)
    # Class indices are 0-4; shift by one to get 1-5 stars.
    return best + 1, class_probs[best]

# predict_sentiment returns (stars, confidence); wrapping the pair in pd.Series lets
# apply produce two columns that are assigned in one step. One forward pass per row.
df[['model_star', 'model_confidence']] = df['cleaned_text'].apply(lambda x: pd.Series(predict_sentiment(x)))

# ------------------------
# 5. Combine rating with model
# ------------------------
# Map 1–5 stars to sentiment score (-1 to +1)
rating_map = {1: -1, 2: -0.5, 3: 0, 4: 0.5, 5: 1}
# NOTE(review): ratings that are not keys of rating_map (missing or non-integer) become
# NaN here and propagate into final_score — confirm the scraped 'rating' is always 1-5.
df['rating_score'] = df['rating'].map(rating_map)
# The model's star prediction on the same scale, scaled by its top-class probability.
df['model_score'] = df['model_star'].map(rating_map) * df['model_confidence']

# Weighted final score
# 70% model-derived score, 30% user-rating score.
df['final_score'] = 0.7 * df['model_score'] + 0.3 * df['rating_score']

def map_final_sentiment(score):
    """Bucket a numeric score into a sentiment label.

    Scores above 0.2 are "positive", scores below -0.2 are "negative", and
    everything else (including the boundaries and NaN) is "neutral".
    """
    if score > 0.2:
        return "positive"
    if score < -0.2:
        return "negative"
    return "neutral"

# Label every review from its combined score; a NaN score falls through both
# comparisons in map_final_sentiment and is labelled "neutral".
df['final_sentiment'] = df['final_score'].apply(map_final_sentiment)

# ------------------------
# 6. Extract top words using attention weights
# ------------------------
def get_top_attention_words(text):
    """Return the five tokens that receive the most attention from the first token.

    The text is tokenised and run through the model; the last attention layer is
    averaged over heads, and the row for position 0 is used to rank the remaining
    tokens. Special tokens are dropped. The result is a space-joined string of
    tokenizer tokens, highest weight first.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        layer_attentions = model(**encoded).attentions

    # Last layer, first item of the batch, mean over the head dimension.
    head_mean = layer_attentions[-1][0].mean(dim=0)
    first_token_row = head_mean[0].cpu().numpy()

    token_strings = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    specials = tokenizer.all_special_tokens
    ranked = sorted(
        [(tok, weight) for tok, weight in zip(token_strings, first_token_row) if tok not in specials],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return " ".join(tok for tok, _ in ranked[:5])

# Second model pass per review to pull out the most-attended tokens.
df['top_words'] = df['cleaned_text'].apply(get_top_attention_words)

# ------------------------
# 7. Save final CSV
# ------------------------
# Written to Downloads because the later analysis cells read
# talabat_restaurants_final.csv from that folder; saving under Documents left
# them without their input file.
df.to_csv(r"C:\Users\Kat\Downloads\talabat_restaurants_final.csv", index=False, encoding='utf-8')
print(" CSV saved with sentiment")

In [None]:
import pandas as pd
import re
import numpy as np
import nltk
from nltk.corpus import stopwords
from deep_translator import GoogleTranslator
from sentence_transformers import SentenceTransformer, util
from collections import defaultdict, Counter

# -----------------------------
# Setup
# -----------------------------
# Fetch the NLTK stop-word corpus used for EN_STOP below.
nltk.download("stopwords")

EN_STOP = set(stopwords.words("english"))
# Small hand-written set of Arabic filler words.
AR_STOP = {
    "مش", "مو", "ما", "في", "على", "من", "عن", "كان", "كانت", "جدا", "جداً", "مره", "مرة",
}

translator = GoogleTranslator(source="auto", target="en")
# Note: this rebinds the notebook-level name `model`, which the sentiment cell
# also uses for its classifier.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# -----------------------------
# Load data
# -----------------------------
df = pd.read_csv(r"C:\Users\Kat\Downloads\talabat_restaurants_final.csv", encoding="utf-8")

# -----------------------------
# Text cleaning
# -----------------------------
def clean_text(text):
    """Lower-case a review and strip punctuation, keeping sentence terminators.

    The value is coerced with ``str``. Every character that is not a word
    character, whitespace, or one of ``. ! ?`` is replaced by a space, then
    whitespace runs are collapsed. The terminators are kept on purpose:
    split_sentences splits on exactly those characters, so removing them here
    would turn every review into a single "sentence".
    """
    text = str(text).lower()
    text = re.sub(r"[^\w\s.!?]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Overwrites the 'cleaned_text' column loaded from the CSV with its re-cleaned version.
# clean_text coerces with str(), so missing values become the literal string "nan".
df["cleaned_text"] = df["cleaned_text"].apply(clean_text)

# -----------------------------
# Translate Arabic → English
# -----------------------------
def translate_if_needed(text):
    """Translate text containing Arabic-script characters; return other text as-is.

    Best-effort: if the check or the translation raises an ordinary exception,
    the original text is returned unchanged.
    """
    try:
        if re.search(r"[\u0600-\u06FF]", text):
            translated = translator.translate(text)
            # Guard against a falsy result (None / empty string) so later steps
            # always receive usable text — presumably rare; verify against the translator.
            return translated if translated else text
        return text
    except Exception:
        # `except Exception` rather than a bare `except:` so that KeyboardInterrupt
        # still stops a long-running apply over many reviews.
        return text

# English version of each review; rows without Arabic-script characters are copied as-is.
# NOTE(review): one translator call per Arabic-script row — presumably network-bound and slow.
df["en_text"] = df["cleaned_text"].apply(translate_if_needed)

# -----------------------------
# Split into sentences
# -----------------------------
def split_sentences(text):
    """Split text on ``.``, ``!`` and ``?`` and keep the meaningful pieces.

    Each piece is stripped of surrounding whitespace; pieces of three
    characters or fewer are discarded.
    """
    kept = []
    for fragment in re.split(r"[.!?]", text):
        fragment = fragment.strip()
        if len(fragment) > 3:
            kept.append(fragment)
    return kept

# List of sentence fragments for every review.
df["sentences"] = df["en_text"].apply(split_sentences)

# -----------------------------
# Define semantic categories
# -----------------------------
# Each category is described by a handful of seed phrases; sentences are later
# matched to the category whose seed phrases they resemble most.
CATEGORIES = {
    "Food Temperature": ["food was cold", "not hot", "arrived cold", "temperature bad"],
    "Taste & Flavor": ["bad taste", "not tasty", "delicious", "flavorless"],
    "Cooking Quality": ["undercooked", "raw", "burnt", "dry food", "not cooked well"],
    "Missing / Wrong Items": ["missing item", "order incomplete", "wrong order"],
    "Portion Size": ["small portion", "quantity too small"],
    "Packaging": ["bad packaging", "spilled", "leaking"],
    "Delivery Speed": ["late delivery", "fast delivery", "on time"],
    "Service": ["bad service", "good service", "rude staff", "polite staff"],
    "Price / Value": ["expensive", "not worth", "good price"],
    "Cleanliness": ["dirty", "not clean", "clean restaurant"],
}

# Encode category examples once, up front, so classification only has to embed the sentence.
category_embeddings = {
    cat: model.encode(examples, convert_to_tensor=True)
    for cat, examples in CATEGORIES.items()
}

# -----------------------------
# Assign sentence to category
# -----------------------------
def classify_sentence(sentence):
    """Return the best-matching category for a sentence, or None.

    The sentence is embedded and compared with every category's seed-phrase
    embeddings; a category's score is its highest cosine similarity. The
    top-scoring category is returned only if its score is at least 0.55.
    """
    sentence_vec = model.encode(sentence, convert_to_tensor=True)

    winner, winner_score = None, 0
    for label, seed_vecs in category_embeddings.items():
        similarity = util.cos_sim(sentence_vec, seed_vecs).max().item()
        if similarity > winner_score:
            winner, winner_score = label, similarity

    if winner_score >= 0.55:
        return winner
    return None

# -----------------------------
# Collect categories per restaurant
# -----------------------------
# restaurant name -> {"top_problems": str, "top_positive_features": str}
restaurant_results = {}

for restaurant, group in df.groupby("restaurant_name"):
    # One category counter per polarity that is reported.
    counts = {"negative": Counter(), "positive": Counter()}

    for _, row in group.iterrows():
        bucket = counts.get(row["final_sentiment"])
        if bucket is None:
            # Neutral (or unlabelled) reviews never contribute to either counter, so
            # skip them before paying for sentence embedding and classification.
            continue
        for sent in row["sentences"]:
            category = classify_sentence(sent)
            if category:
                bucket[category] += 1

    summary = {}
    for column, polarity in (("top_problems", "negative"), ("top_positive_features", "positive")):
        bucket = counts[polarity]
        total = sum(bucket.values())
        # Share of each category among the categorised sentences of this polarity.
        percent = {
            k: round((v / total) * 100, 1)
            for k, v in bucket.items()
        } if total > 0 else {}
        # Five largest shares, formatted as "Category (12.3%)".
        summary[column] = ", ".join(
            [f"{k} ({v}%)" for k, v in sorted(percent.items(), key=lambda x: -x[1])[:5]]
        )

    restaurant_results[restaurant] = summary

# -----------------------------
# Save final results
# -----------------------------
# One row per restaurant; the dict keys become a 'restaurant_name' column.
final_df = (
    pd.DataFrame.from_dict(restaurant_results, orient="index")
    .reset_index()
    .rename(columns={"index": "restaurant_name"})
)

final_df.to_csv(
    r"C:\Users\Kat\Downloads\talabat_restaurant_insights_percent.csv",
    index=False,
    encoding="utf-8",
)

print("Final NLP-based restaurant insights saved successfully")

In [None]:
import pandas as pd

# ===============================
# 1. Load raw reviews data
# ===============================
# 1. Load the per-review sentiment file
df = pd.read_csv(r"C:\Users\Kat\Downloads\talabat_restaurants_final.csv", encoding="utf-8")

# 2. Parse review dates; rows whose date cannot be parsed are dropped
df["review_date"] = pd.to_datetime(df["review_date"], errors="coerce")
df = df.dropna(subset=["review_date"])

# 3. Year-month key as a string
df["year_month"] = df["review_date"].dt.to_period("M").astype(str)

# 4. 0/1 indicator column per sentiment label
for label in ("positive", "negative", "neutral"):
    df[f"is_{label}"] = (df["final_sentiment"] == label).astype(int)

# 5. Monthly metrics per restaurant; the mean of a 0/1 flag is that label's share
monthly_metrics = {
    "review_count": ("final_sentiment", "count"),
    "avg_final_score": ("final_score", "mean"),
    "positive_ratio": ("is_positive", "mean"),
    "negative_ratio": ("is_negative", "mean"),
    "neutral_ratio": ("is_neutral", "mean"),
}
timeline_df = (
    df.groupby(["restaurant_name", "year_month"])
      .agg(**monthly_metrics)
      .reset_index()
)

# ===============================
# 6. Apply minimum review threshold
# ===============================
# Months with fewer reviews than this are dropped as too noisy.
MIN_REVIEWS = 10
# .copy(): the boolean filter yields a frame derived from the original; copying it
# makes the column assignments below unambiguous and avoids SettingWithCopyWarning.
timeline_df = timeline_df[timeline_df["review_count"] >= MIN_REVIEWS].copy()

# ===============================
# 7. Convert ratios to percentages
# ===============================
for ratio_col in ("positive_ratio", "negative_ratio", "neutral_ratio"):
    timeline_df[ratio_col] = (timeline_df[ratio_col] * 100).round(1)
timeline_df["avg_final_score"] = timeline_df["avg_final_score"].round(2)

# ===============================
# 8. Sort for proper timeline order
# ===============================
timeline_df = timeline_df.sort_values(
    by=["restaurant_name", "year_month"]
)

# ===============================
# 9. Calculate month-to-month sentiment change
# ===============================
# Differences are in percentage points and are taken between consecutive *retained*
# rows of a restaurant: months removed by the MIN_REVIEWS filter are skipped over.
# The first retained month of each restaurant has no predecessor and gets NaN.
timeline_df["positive_change"] = (
    timeline_df.groupby("restaurant_name")["positive_ratio"]
    .diff()
)

timeline_df["negative_change"] = (
    timeline_df.groupby("restaurant_name")["negative_ratio"]
    .diff()
)

# ===============================
# 10. Define spike/drop thresholds
# ===============================
# Thresholds read by classify_change, which is applied to 'positive_change'.
# That column is a difference of percentage values, so these are percentage points.
DROP_THRESHOLD = -15   # -15% or worse
SPIKE_THRESHOLD = 15   # +15% or better

def classify_change(x):
    """Label a change value as a drop, a spike, or stable.

    Missing values are "Stable". Values at or below DROP_THRESHOLD are
    "Sentiment Drop"; values at or above SPIKE_THRESHOLD are "Sentiment Spike";
    anything in between is "Stable".
    """
    if not pd.isna(x):
        if x <= DROP_THRESHOLD:
            return "Sentiment Drop"
        if x >= SPIKE_THRESHOLD:
            return "Sentiment Spike"
    return "Stable"

# ===============================
# 11. Classify sentiment events
# ===============================
# Events are derived from the change in positive share only; 'negative_change' is
# saved alongside but is not used for the label.
timeline_df["sentiment_event"] = timeline_df["positive_change"].apply(classify_change)

# ===============================
# 12. Save final output
# ===============================
timeline_df.to_csv(
    r"C:\Users\Kat\Downloads\restaurant_Timeline_analysis_drop.csv",
    index=False,
    encoding="utf-8"
)

print(" Timeline analysis + sentiment spike/drop detection completed")
# Report the file name that was actually written above.
print(" File saved: restaurant_Timeline_analysis_drop.csv")

In [None]:
import pandas as pd
from transformers import pipeline

# ===============================
# 1. Load original sentiment file
# ===============================
# The path literal was missing its opening quote (a syntax error). It now points
# at the same file the other analysis cells load.
df = pd.read_csv(
    r"C:\Users\Kat\Downloads\talabat_restaurants_final.csv",
    encoding="utf-8"
)

# ===============================
# 2. Prepare dates
# ===============================
# Rows whose date cannot be parsed are dropped.
df["review_date"] = pd.to_datetime(df["review_date"], errors="coerce")
df = df.dropna(subset=["review_date"])
df["year_month"] = df["review_date"].dt.to_period("M").astype(str)

# ===============================
# 3. Keep ONLY negative reviews
# ===============================
# .copy() so that adding columns to the filtered frame later does not trigger
# SettingWithCopyWarning.
df = df[df["final_sentiment"] == "negative"].copy()

# ===============================
# 4. Define candidate problem categories
# ===============================
# Candidate labels offered to the zero-shot classifier for every negative review.
problem_categories = [
    "Food Temperature", "Taste & Flavor", "Service", "Portion Size",
    "Missing / Wrong Items", "Cleanliness", "Delivery Speed", "Packaging",
    "Price / Value", "Incorrect Billing", "Other",
]

# ===============================
# 5. Load zero-shot classifier
# ===============================
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# ===============================
# 6. Detect problems for each review
# ===============================
def detect_problems(text, categories, threshold=0.2):
    """Return the candidate labels the classifier scores at or above ``threshold``.

    Non-string or blank input yields an empty list without calling the
    classifier. Labels are returned in the order the classifier reports them.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    prediction = classifier(text, candidate_labels=categories, multi_label=True)
    kept = []
    for label, score in zip(prediction['labels'], prediction['scores']):
        if score >= threshold:
            kept.append(label)
    return kept

# Apply detection
# Label list per review, then one row per (review, label).
df["detected_problem"] = df["review_text"].apply(detect_problems, args=(problem_categories,))
df = df.explode("detected_problem")
# Reviews with no label explode to a missing value; drop those and empty strings.
has_problem = df["detected_problem"].notna() & df["detected_problem"].ne("")
df = df.loc[has_problem]

# ===============================
# 7. Aggregate problems monthly
# ===============================
month_keys = ["restaurant_name", "year_month"]
problem_timeline = (
    df.groupby(month_keys + ["detected_problem"])
      .size()
      .reset_index(name="problem_count")
)

# ===============================
# 8. Convert to percentage
# ===============================
# Denominator: all problem labels counted for that restaurant in that month.
total_per_month = problem_timeline.groupby(month_keys)["problem_count"].transform("sum")
problem_timeline["problem_percentage"] = (
    problem_timeline["problem_count"] / total_per_month * 100
).round(1)

# ===============================
# 9. Sort cleanly
# ===============================
# Within each restaurant and month, the largest share comes first.
problem_timeline = problem_timeline.sort_values(
    by=month_keys + ["problem_percentage"],
    ascending=[True, True, False],
)

# ===============================
# 10. Save output
# ===============================
# The path literal was missing its opening quote (a syntax error). The file is
# written to the same folder the other cells use for their outputs.
problem_timeline.to_csv(
    r"C:\Users\Kat\Downloads\restaurant_problem_timeline_monthly.csv",
    index=False,
    encoding="utf-8"
)

print(" Problem timeline analysis completed")
print(" File saved: restaurant_problem_timeline_monthly.csv")