## Airline Review Sentiment Analysis

In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
from scipy.special import softmax
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned airline reviews into a DataFrame and preview the first rows.
reviews_path = 'cleaned_data.csv'
df = pd.read_csv(reviews_path)
df.head()

Unnamed: 0,rating,country,date,review,Type Of Traveller,Seat Type,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,...,Value For Money,Recommended,Airlines,author,CODE2,CODE3,Latitude,Longitude,Verified,review_length
0,1.0,Jordan,04-08-2024,Ryanair lost my luggage on a direct flight. I...,Business,Economy Class,April 2024,2.0,1.0,2.0,...,1.0,0,ryanair,Alan Robinson,JO,JOR,31.166705,36.941628,1,188
1,1.0,Switzerland,04-08-2024,Booked Basel to Dublin 11.10 6.4.24. Baggage...,Family Leisure,Economy Class,April 2024,1.0,1.0,2.0,...,1.0,0,ryanair,T Maysan,CH,CHE,46.798562,8.231974,1,441
2,6.0,Germany,04-05-2024,You get what you pay. Had overweight luggag...,Couple Leisure,Economy Class,April 2024,3.0,3.0,3.0,...,5.0,1,ryanair,55 reviews\n\n\n\nR Darnel,DE,DEU,51.163818,10.447831,1,94
3,3.0,Italy,04-01-2024,Very cheeky check-in system: this did not ha...,Couple Leisure,Economy Class,March 2024,1.0,2.0,2.0,...,2.0,0,ryanair,Y Chen,IT,ITA,42.638426,12.674297,0,108
4,1.0,Spain,03-28-2024,Terrible customer service. Handling in Marra...,Family Leisure,Economy Class,March 2024,2.0,4.0,2.0,...,3.0,0,ryanair,Diego Perez,ES,ESP,39.326068,-4.837979,0,594


### Sentiment Analysis Using RoBERTa

In [3]:
# Identifier handed to every from_pretrained call below, so the config,
# tokenizer and model all come from the same checkpoint.
model_path = "cardiffnlp/twitter-roberta-base-sentiment"

config = AutoConfig.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)
roberta_model = AutoModelForSequenceClassification.from_pretrained(model_path)

def sentiment_analyzer(text, model):
    """Return rounded sentiment probabilities for a single text.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 512 tokens), passed through ``model``, and the first row of the
    model's first output is converted to probabilities with softmax.

    Args:
        text: The text to score.
        model: A callable that accepts the tokenizer's output as keyword
            arguments and returns a sequence whose first element is a 2-D
            tensor of class scores (one row per input text).

    Returns:
        dict with keys ``"neg"``, ``"neu"`` and ``"pos"`` holding the softmax
        probabilities of classes 0, 1 and 2, rounded to 2 decimals.
        NOTE(review): this assumes the model orders its classes as
        negative, neutral, positive -- confirm against the model card.
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt", max_length=512)
    # Inference only: disable autograd so no computation graph is built and
    # kept alive for every review that gets scored.
    with torch.no_grad():
        output = model(**encoded_input)
    # output[0] is the score tensor; the trailing [0] picks the only text in the batch.
    scores = output[0][0].detach().numpy()
    scores = np.round(softmax(scores), 2)
    return {"neg": scores[0], "neu": scores[1], "pos": scores[2]}

In [4]:
# Score every review; each cell of the new column holds the dict returned by
# sentiment_analyzer for that review.
df["probabilities"] = df["review"].apply(
    lambda review_text: sentiment_analyzer(review_text, model=roberta_model)
)

In [5]:
# Expand the per-review score dicts into separate "neg" / "neu" / "pos"
# columns, then discard the dict column they came from.
probabilities = df["probabilities"].apply(pd.Series)
df = df.join(probabilities).drop("probabilities", axis=1)

# Polarity per review: weight the class probabilities by -1 / 0 / +1, sum them
# (i.e. pos - neg), and squash the result with tanh.
polarity_weights = torch.tensor([-1, 0, 1])
probs = torch.tensor(df[["neg", "neu", "pos"]].values)
polarity = (polarity_weights * probs).sum(dim=-1)
polarity_scaled = torch.tanh(polarity)

# Store the scaled polarity and bucket it into labels. With pd.cut's default
# right-closed bins: (-1, -0.25] Negative, (-0.25, 0.25] Neutral, (0.25, 1] Positive.
df["roberta_polarity"] = polarity_scaled.numpy()
df["roberta_sentiment"] = pd.cut(
    df["roberta_polarity"],
    bins=[-1.0, -0.25, 0.25, 1.0],
    labels=["Negative", "Neutral", "Positive"],
)

# The raw class probabilities are no longer needed once the label is assigned.
df = df.drop(columns=["neu", "neg", "pos"])

In [6]:
# Number of reviews assigned to each sentiment label.
df["roberta_sentiment"].value_counts()

Negative    8806
Positive    3573
Neutral      767
Name: roberta_sentiment, dtype: int64

### Saving the Progress

In [7]:
# Write the labelled reviews to disk; index=False leaves the row index out of the file.
output_path = 'Final_csv.csv'
df.to_csv(output_path, index=False)