<a href="https://colab.research.google.com/github/Mosapmohamd/DEPI-Graduation-Project/blob/main/Text_Preprocessing_%26_Annotation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Data annotation**

# Import required dependencies

In [None]:
import pandas as pd
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
import torch
from collections import Counter

In [None]:
# Load Egypt_Tourism_Reviews.csv into `df`.
# NOTE(review): the following cell reassigns `df` from a different CSV, so
# this load appears to be unused by the rest of the notebook -- confirm
# before removing.
df = pd.read_csv('Egypt_Tourism_Reviews.csv')

In [None]:
# Load the preprocessed reviews that the labelling functions operate on.
df = pd.read_csv('preprocessed_tourism_reviewsv2.csv')
# Keep only reviews under 384 words (limit due to transformers 512 token limit).
# .copy() makes the filtered result an independent frame, so the sentiment
# columns added later are written to it directly instead of to a slice of the
# unfiltered data (which can trigger pandas' SettingWithCopyWarning).
df = df[df['word_count'] < 384].copy()

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Method 1: Roberta Transformer
def label_with_roberta(df, text_column, model_name="siebert/sentiment-roberta-large-english"):
    """Label every review with a Hugging Face sentiment-analysis pipeline.

    Args:
        df: DataFrame holding the reviews.
        text_column: Name of the column in ``df`` that contains the text.
        model_name: Model identifier passed to ``pipeline``.

    Returns:
        A ``(labels, scores)`` tuple of lists with one entry per row of
        ``df``. 'POSITIVE'/'NEGATIVE' pipeline labels are mapped to
        'positive'/'negative'; any other label is lower-cased. Scores are
        the ``score`` values reported by the pipeline.
    """
    # A torch.device never compares equal to the plain string 'cuda', so the
    # previous `device == 'cuda'` check always selected the CPU (-1). Compare
    # the device *type* instead; torch.device(...) also accepts a string.
    pipeline_device = 0 if torch.device(device).type == 'cuda' else -1
    sentiment_analyzer = pipeline("sentiment-analysis", model=model_name, device=pipeline_device)

    batch_size = 32
    results = []
    for i in tqdm(range(0, len(df), batch_size), desc="Roberta Transformer"):
        batch = df[text_column].iloc[i:i+batch_size].tolist()
        # Replace non-string cells (e.g. NaN) with an empty string, matching
        # how label_with_bert_binary sanitises its input.
        batch = [text if isinstance(text, str) else "" for text in batch]
        # truncation=True clips reviews longer than the model's max length.
        results.extend(sentiment_analyzer(batch, truncation=True))

    sentiment_map = {'POSITIVE': 'positive', 'NEGATIVE': 'negative'}
    final_labels = [
        sentiment_map.get(result['label'], result['label'].lower())
        for result in results
    ]
    scores = [result['score'] for result in results]
    return final_labels, scores

In [None]:
# Method 2: VADER Sentiment Analysis
def label_with_vader(df, text_column):
    """Label every review as positive or negative using VADER compound scores.

    Args:
        df: DataFrame holding the reviews.
        text_column: Name of the column in ``df`` that contains the text.

    Returns:
        A ``(labels, compound_scores)`` tuple of lists with one entry per
        row. A compound score >= 0 yields 'positive', anything below 0
        yields 'negative'. Non-string cells receive an all-zero score.
    """
    # Fetch the lexicon only when it cannot be found locally.
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        nltk.download('vader_lexicon')

    analyzer = SentimentIntensityAnalyzer()

    # One polarity dict per row; non-strings get a zeroed placeholder.
    polarity = [
        analyzer.polarity_scores(entry)
        if isinstance(entry, str)
        else {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
        for entry in tqdm(df[text_column], desc="VADER Binary")
    ]

    compound_scores = [item['compound'] for item in polarity]

    # Binary split on the compound score: >= 0 is positive, < 0 is negative.
    label_for = {True: 'positive', False: 'negative'}
    labels = [label_for[value >= 0] for value in compound_scores]

    return labels, compound_scores

In [None]:
def label_with_bert_binary(df, text_column, model_name="distilbert-base-uncased-finetuned-sst-2-english"):
    """Label every review with a sequence-classification model, in batches.

    Args:
        df: DataFrame holding the reviews.
        text_column: Name of the column in ``df`` that contains the text.
        model_name: Identifier used to load both the tokenizer and the model.

    Returns:
        A ``(labels, scores)`` tuple of lists with one entry per row.
        Predicted class index 0 becomes 'negative', any other index becomes
        'positive'; each score is the highest softmax probability for the row.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    model = model.to(device)
    print(f"Distilbert Model is on device: {next(model.parameters()).device}")

    chunk_size = 32
    all_labels, all_scores = [], []

    for start in tqdm(range(0, len(df), chunk_size), desc="Distilbert Transformer"):
        stop = start + chunk_size
        raw_texts = df[text_column].iloc[start:stop].tolist()
        # Anything that is not a string is replaced by an empty string.
        clean_texts = [str(item) if isinstance(item, str) else "" for item in raw_texts]

        # Pad to the longest item in the batch and clip at 512 tokens.
        encoded = tokenizer(
            clean_texts,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors='pt',
        )
        # Inputs must live on the same device as the model.
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**encoded).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)

        class_ids = probabilities.argmax(dim=1).tolist()
        confidences = probabilities.max(dim=1).values.tolist()

        # Class 0 -> negative, everything else -> positive.
        all_labels.extend('positive' if class_id != 0 else 'negative' for class_id in class_ids)
        all_scores.extend(confidences)

    return all_labels, all_scores

In [None]:
# Apply models
text_column = 'review'
# The labelling functions defined in this notebook are label_with_roberta,
# label_with_vader and label_with_bert_binary. The names called here before
# (label_with_binary_transformer / label_with_bert) are not defined in this
# notebook, so the cell failed with NameError.
labels_binary, scores_binary = label_with_roberta(df, text_column)
labels_vader, scores_vader = label_with_vader(df, text_column)
# NOTE(review): the "three_class" variable names are kept because later cells
# read them, but label_with_bert_binary returns binary labels -- confirm
# whether a real three-class model was intended here.
labels_three_class, scores_three_class = label_with_bert_binary(df, text_column)

In [None]:
# Attach each model's labels and scores to the frame, one column per list,
# in the same order as before.
for _column, _values in (
    ('sentiment_binary', labels_binary),
    ('sentiment_score_binary', scores_binary),
    ('sentiment_vader', labels_vader),
    ('sentiment_score_vader', scores_vader),
    ('sentiment_three_class', labels_three_class),
    ('sentiment_score_three_class', scores_three_class),
):
    df[_column] = _values

In [None]:
# Ensemble Method: Majority Voting - hard voting
def ensemble_sentiment(row):
    """Return the label that most of the three per-model columns agree on.

    Args:
        row: Mapping-like row exposing 'sentiment_binary',
            'sentiment_vader' and 'sentiment_three_class'.

    Returns:
        The most frequent label among the three votes.
    """
    vote_columns = ('sentiment_binary', 'sentiment_vader', 'sentiment_three_class')
    tally = Counter([row[name] for name in vote_columns])
    winner, _votes = tally.most_common(1)[0]
    return winner

# Apply the majority vote row by row (axis=1) and store the winning label
# in a new 'sentiment_ensemble' column.
df['sentiment_ensemble'] = df.apply(ensemble_sentiment, axis=1)

In [None]:
# Write the labelled frame to CSV; index=False leaves the row index out of the file.
df.to_csv('labeled_tourism_reviews.csv', index=False)