In [None]:
# Step 1: Disable TensorFlow loading
# The variable is written to the process environment before `transformers` is
# imported further down, so it is already set when that library is first loaded.
# NOTE(review): presumably transformers reads USE_TF at import time — keep these
# lines ahead of that import.
import os
os.environ["USE_TF"] = "0"

# Step 2: Import libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

# Step 3: Load your full dataset
# The merged, cleaned review CSV becomes the working DataFrame for every later step.
file_path = "../datasets/merged_data/merged_cleaned_ethiopian_airlines_reviews.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Step 4: Define category keywords
# Maps each review topic to the lowercase terms that flag a sentence as
# belonging to that topic. Insertion order is the order the topics are processed.
category_keywords = dict(
    cabin_crew=['crew', 'attendant', 'hostess'],
    flight_delay=['delay', 'late', 'waiting', 'reschedule', 'cancelled'],
    luggage_handling=['luggage', 'baggage', 'bag', 'lost', 'claim'],
    food_service=['food', 'meal', 'snack', 'drink', 'beverage'],
    seat_comfort=['seat', 'legroom', 'comfort', 'space', 'seating'],
    restroom_quality=['toilet', 'bathroom', 'restroom', 'clean', 'dirty'],
    airport_check=['check-in', 'boarding', 'gate', 'queue', 'counter'],
    customer_service=[
        'service', 'help', 'support', 'rude', 'kind', 'staff', 'personnel',
    ],
    value_for_money=['price', 'value', 'expensive', 'cheap', 'worth'],
    inflight_entertainment=['entertainment', 'movie', 'tv', 'screen', 'music'],
)

# Step 5: Load sentiment model and tokenizer
print("Loading sentiment model and tokenizer...")
model_name = "cardiffnlp/twitter-roberta-base-sentiment"

# Position i in this list is the label reported when class index i scores highest.
labels = ['Negative', 'Neutral', 'Positive']

# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.to(device)  # place the weights on the selected device
model.eval()      # switch the module to evaluation mode

# Step 6: Define sentiment function
def get_sentiment(text):
    """Classify ``text`` as 'Negative', 'Neutral' or 'Positive'.

    Args:
        text: Text to score. Any value that is not a non-blank string
            (``None``, NaN, numbers, ``""``, whitespace only) is treated as
            having no content.

    Returns:
        A capitalized label. Input without content yields 'Neutral' without
        calling the tokenizer or the model; otherwise the entry of the
        module-level ``labels`` list at the highest-scoring class index.
    """
    # Nothing to score. Use the same capitalized spelling as the model path so
    # callers never receive a mix of 'neutral' and 'Neutral'.
    if not isinstance(text, str) or not text.strip():
        return 'Neutral'

    # One text per call; anything beyond 512 tokens is truncated.
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    # Softmax is monotonic, so the arg-max of the raw logits is the same class
    # the probabilities would select. The batch holds a single row.
    return labels[torch.argmax(logits, dim=-1).item()]

# Step 7: Add overall sentiment
print("Running overall sentiment analysis...")
# Missing comments are replaced with empty strings, then every row is scored.
df["overall_sentiment"] = df["review_comment"].fillna("").map(get_sentiment)

# Step 8: Add category-based sentiment without saving extracted text
print("Running category-based sentiment analysis...")


def _extract_category_text(text, keywords):
    """Return the sentences of ``text`` that mention any of ``keywords``.

    Args:
        text: A review comment. Values that are not strings (NaN, ``None``,
            numbers) are treated as containing no sentences.
        keywords: Lowercase terms to look for.

    Returns:
        The matching sentences joined with '. ' and stripped, or '' when
        nothing matches.

    NOTE(review): sentences are split on '.' only and matching is a plain
    substring test, so e.g. 'late' also matches 'plate' — kept as in the
    original to leave results unchanged.
    """
    if not isinstance(text, str):
        return ''
    matching = []
    for sentence in text.split('.'):
        lowered = sentence.lower()  # lowercase once per sentence, not once per keyword
        if any(keyword in lowered for keyword in keywords):
            matching.append(sentence)
    return '. '.join(matching).strip()


for category, keywords in category_keywords.items():
    category_sentiment_col = f"{category}_sentiment"
    # Bind this iteration's keyword list as a default so the lambda does not
    # depend on the loop variable's later values.
    df[category_sentiment_col] = df["review_comment"].apply(
        lambda review, kws=keywords: get_sentiment(_extract_category_text(review, kws))
    )

# Step 9: Capitalize sentiment columns for consistency
# The overall column comes first, followed by one "<category>_sentiment" column
# per entry of category_keywords.
sentiment_columns = [
    'overall_sentiment',
    *(f"{c}_sentiment" for c in category_keywords),
]
for col in sentiment_columns:
    capitalized = df[col].str.capitalize()
    df[col] = capitalized

# Step 10: Save final dataset with all original + sentiment columns
output_path = "../datasets/sentiment_analysis/ethiopian_airlines_overall_and_topic_sentiment.csv"

# Create the destination folder first: to_csv does not create missing parent
# directories, and failing here would throw away the whole sentiment run.
output_dir = os.path.dirname(output_path)
if output_dir:  # a bare file name has no directory part to create
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame index out of the written file.
df.to_csv(output_path, index=False)

print(f"Done! All rows processed and saved to:\n{output_path}")


Loading sentiment model and tokenizer...
Running overall sentiment analysis...
Running category-based sentiment analysis...
Done! All rows processed and saved to:
C:\Users\abro27\OneDrive\Desktop\Mak\Education\3.Data_Analytics\Final Project\Capstone Projects\Datas\Sentiment_Analysis\ethiopian_airlines_overall_and_category_sentiment_updated.csv
