In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the reviews CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="labeled_tourism_reviews (1).csv")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Keep only the rows whose 'processed_review' value is present (not NaN/None).
df = df[df['processed_review'].notna()]

In [None]:
# Keep only the rows whose 'aspects' value has a length greater than zero.
# NOTE(review): no earlier visible cell creates 'aspects', so on a fresh kernel
# this presumably relies on the column already being present in the CSV — confirm.
# If it is loaded from CSV as text, len() counts characters rather than list
# items — verify.
df = df.loc[df['aspects'].map(len).gt(0)]

In [None]:
from keybert import KeyBERT

kw_model = KeyBERT()

# For every review, ask KeyBERT for its top 3 one- or two-word keyphrases
# (English stop words removed) and keep only the phrase text, i.e. the first
# element of each returned entry.
df['aspects'] = df['processed_review'].apply(
    lambda review: [
        entry[0]
        for entry in kw_model.extract_keywords(
            review,
            keyphrase_ngram_range=(1, 2),
            stop_words='english',
            top_n=3,
        )
    ]
)


In [None]:
import re

def clean_aspect(aspect):
    """Normalize an aspect phrase so equal phrases compare equal.

    The phrase is lower-cased, every run of non-word characters (anything
    other than letters, digits and underscore) is collapsed into a single
    space, and surrounding whitespace is removed.

    Args:
        aspect: The raw aspect phrase (a string).

    Returns:
        The normalized phrase; an empty string if nothing but non-word
        characters was supplied.
    """
    # Substitute first and strip last: stripping before the substitution would
    # leave a dangling space wherever the phrase starts or ends with
    # punctuation (e.g. "great!" -> "great ").
    return re.sub(r'\W+', ' ', aspect.lower()).strip()

# Normalize every phrase inside each review's aspect list with clean_aspect.
df['aspects'] = df['aspects'].apply(lambda phrases: list(map(clean_aspect, phrases)))


In [None]:
# Preview the aspect lists of the first 20 rows.
df['aspects'].iloc[:20]

In [None]:
from collections import Counter

# Flatten the aspect lists of every review labeled 'positive' into one list.
all_aspects = [
    aspect
    for aspects in df.loc[df['sentiment_ensemble'] == 'positive', 'aspects']
    for aspect in aspects
]
# Keep the 100 most frequent aspects as (aspect, count) pairs. most_common()
# already returns a list, so no extra comprehension is needed.
most_common_positive_aspects = Counter(all_aspects).most_common(100)


In [None]:
# Display the (aspect, count) pairs collected from the positive reviews.
most_common_positive_aspects

In [None]:
# Flatten the aspect lists of every review labeled 'negative' into one list.
all_aspects = [
    aspect
    for aspects in df.loc[df['sentiment_ensemble'] == 'negative', 'aspects']
    for aspect in aspects
]
# Keep the 100 most frequent aspects as (aspect, count) pairs. most_common()
# already returns a list, so no extra comprehension is needed.
most_common_negative_aspects = Counter(all_aspects).most_common(100)

In [None]:
# Display the (aspect, count) pairs collected from the negative reviews.
most_common_negative_aspects

In [None]:
# Aspect phrases only (the counts are dropped), in descending frequency order.
top_aspects = [aspect for aspect, _count in most_common_negative_aspects]

# The sentiment filter does not depend on the aspect being inspected, so it is
# applied once here instead of being recomputed over the whole frame on every
# loop iteration.
negative_reviews = df.loc[
    df['sentiment_ensemble'] == 'negative',
    ['aspects', 'processed_review'],
]

for aspect in top_aspects:
    print(f"\n🔹 Reviews mentioning aspect: **{aspect}**\n{'-'*50}")

    # Exact membership test against each review's aspect list; astype(bool)
    # keeps the mask boolean even when there are no negative reviews at all.
    mentions_aspect = negative_reviews['aspects'].apply(lambda x: aspect in x).astype(bool)
    matching_reviews = negative_reviews.loc[mentions_aspect, 'processed_review']

    for i, review in enumerate(matching_reviews.head(5), 1):  # limit to top 5 reviews per aspect
        print(f"{i}. {review}")


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F
from transformers import pipeline

# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint.
absa_tokenizer = AutoTokenizer.from_pretrained(
    "yangheng/deberta-v3-base-absa-v1.1"
)
absa_model = AutoModelForSequenceClassification.from_pretrained(
    "yangheng/deberta-v3-base-absa-v1.1"
)

In [None]:
# Build a sentiment-analysis pipeline whose model and tokenizer both come from
# the same checkpoint identifier.
sentiment_model_path = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
sentiment_model = pipeline(
    task="sentiment-analysis",
    model=sentiment_model_path,
    tokenizer=sentiment_model_path,
)

In [None]:

# Example sentence used to probe the aspect-level sentiment model.
sentence = "Is one of the worst countries (if not the worst) , for sexual harassment of women . And one of the tops ones for denying that the problem exist."
print(f"Sentence: {sentence}")
print()

# Score the sentiment toward a single aspect term: sentence and aspect are
# joined into one marker-delimited string before tokenization.
aspect = "people"
inputs = absa_tokenizer(
    f"[CLS] {sentence} [SEP] {aspect} [SEP]",
    return_tensors="pt",
)
outputs = absa_model(**inputs)
# Softmax over dim 1 of the logits, then take the first row as a NumPy vector.
probs = F.softmax(outputs.logits, dim=1).detach().numpy()[0]
print(f"Sentiment of aspect '{aspect}' is:")
# NOTE(review): label order is assumed to match the model's output indices —
# confirm against absa_model.config.id2label.
for label, prob in zip(["negative", "neutral", "positive"], probs):
    print(f"Label {label}: {prob}")
print()

In [None]:
# Whole-sentence sentiment: the pipeline gets a one-item batch, so exactly one
# prediction (a dict with 'label' and 'score') is unpacked from its result.
[sentiment] = sentiment_model([sentence])
print(f"Overall sentiment: {sentiment['label']} with score {sentiment['score']}")

In [None]:
# Second example sentence used to probe the aspect-level sentiment model.
sentence = "Egypt has many beautiful sites especially the places on the sea for those who love nature.museums for those who love history.In a nutshell, All types of tourism can exist.beautiful weather in the winter where you can wear summer clothes in the middle of the day.Locals are extremely friendly"

print(f"Sentence: {sentence}")
print()

# Score the sentiment toward a single aspect term: sentence and aspect are
# joined into one marker-delimited string before tokenization.
aspect = "Egypt"
inputs = absa_tokenizer(
    f"[CLS] {sentence} [SEP] {aspect} [SEP]",
    return_tensors="pt",
)
outputs = absa_model(**inputs)
# Softmax over dim 1 of the logits, then take the first row as a NumPy vector.
probs = F.softmax(outputs.logits, dim=1).detach().numpy()[0]
print(f"Sentiment of aspect '{aspect}' is:")
# NOTE(review): label order is assumed to match the model's output indices —
# confirm against absa_model.config.id2label.
for label, prob in zip(["negative", "neutral", "positive"], probs):
    print(f"Label {label}: {prob}")
print()

In [None]:
# Whole-sentence sentiment: the pipeline gets a one-item batch, so exactly one
# prediction (a dict with 'label' and 'score') is unpacked from its result.
[sentiment] = sentiment_model([sentence])
print(f"Overall sentiment: {sentiment['label']} with score {sentiment['score']}")