In [1]:
# Importing necessary libraries

import pandas as pd
import numpy as np
import re

In [2]:
# Reading positive and negative review datasets into DataFrames

df_pos = pd.read_csv("BriAri_pos_review.csv")
df_neg = pd.read_csv("BriAri_neg_review.csv")

In [3]:
df_pos.head()

Unnamed: 0.1,Unnamed: 0,Id,label,score,negative_score,neutral_score,positive_score,text
0,0,0,1,0.997455,0.012583,0.219128,0.768289,✅ Trip Verified | Easy check in a T5. Gallerie...
1,3,3,1,0.99767,0.003761,0.031903,0.964336,✅ Trip Verified | Good domestic flight operate...
2,8,8,1,0.999531,0.004443,0.021572,0.973985,✅ Trip Verified | I had the most fantastic BA ...
3,10,10,1,0.99913,0.007512,0.137402,0.855086,✅ Trip Verified | London Heathrow to Mumbai in...
4,19,19,1,0.918055,0.070034,0.431331,0.498635,Not Verified | Fast and friendly check in (tot...


In [4]:
# Importing NLTK stopwords and string library

from nltk.corpus import stopwords
import string

In [38]:
# Creating a list of stopwords
stops = stopwords.words("english")
stops += list(string.punctuation)

# Additional custom stopwords related to the context
stops += ['flight','airline','flights', "trip", "verified", "✅"]

In [39]:
df_pos["text"][0]

'✅ Trip Verified | Easy check in a T5. Galleries south and North lounges packed, only just able to get a seat. Didn’t stay long as B gates lounge much quieter. Flight boarded smoothly and on time. A380 upper deck. Old club world seating which is beginning to age a little. However, like the large toilets on this aircraft type. Cabin crew were all fantastic. One family had tech issues with their seats and entertainment systems which crew worked hard on though sadly without success. Food and drink options were all very good. Entertainment was good. Overall an enjoyable flight.'

In [79]:
# Preprocessing positive review texts

texts_pos = []

for index, row in df_pos.iterrows():
    tweet = row.text
    tweet = re.sub('[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words_pos = []
    for word in tweet.split():
        if word.lower() not in stops:
            words_pos.append(word.lower())   
    tweet = " ".join(words_pos)
    row.text = tweet
    texts_pos.append(tweet)

In [80]:
# Preprocessing Negative review texts

texts_neg = []

for index, row in df_neg.iterrows():
    tweet = row.text
    tweet = re.sub('[\s]+', ' ', tweet) #remove additional whitespaces
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet) #replace #word with word
    tweet = tweet.strip('\'"') #trim tweet
    words_neg = []
    for word in tweet.split():
        if word.lower() not in stops:
            words_neg.append(word.lower())   
    tweet = " ".join(words_neg)
    row.text = tweet
    texts_neg.append(tweet)

###  Extracting and Analyzing Top Most Frequent Words using TF-IDF

In [81]:
# For positive text

from sklearn.feature_extraction.text import TfidfVectorizer

# Sample text data
# texts_pos = ["This is a positive document.", "Another positive example.", "Yet another positive text."]

v = TfidfVectorizer(analyzer='word', max_features=3150, max_df=0.7, ngram_range=(1,3))
train_features = v.fit_transform(texts_pos)
word_tfidf_sums = train_features.sum(axis=0)

# Convert the sums to a list and retrieve corresponding feature names
word_tfidf_sums = word_tfidf_sums.tolist()[0]
feature_names = v.get_feature_names_out()

# Create a list of tuples with word and corresponding sum of TF-IDF values
word_tfidf_pairs = [(word, score) for word, score in zip(feature_names, word_tfidf_sums)]

# Sort the list in descending order based on TF-IDF sums
word_tfidf_pairs.sort(key=lambda x: x[1], reverse=True)

# Get the top most frequent words
top_n = 100  
top_words = [word for word, score in word_tfidf_pairs[:top_n]]

print("Top", top_n, "most frequent words:", top_words)


Top 100 most frequent words: ['good', 'ba', 'crew', 'service', 'food', 'london', 'time', 'cabin', 'great', 'flight', 'heathrow', 'friendly', 'check', 'well', 'seats', 'comfortable', 'seat', 'lounge', 'excellent', 'staff', 'airways', 'british', 'economy', 'british airways', 'really', 'cabin crew', 'experience', 'class', 'nice', 'new', 'one', 'club', 'aircraft', 'first', 'drinks', 'would', 'boarding', 'best', 'business', 'us', 'back', 'thank', 'amazing', 'made', 'london heathrow', 'professional', 'fine', 'quick', 'much', 'like', 'overall', 'helpful', 'arrival', 'world', 'flying', 'also', 'even', 'served', 'departure', 'attentive', 'entertainment', 'in', 'meal', 'premium', 'get', 'long', 'efficient', 'quite', 'return', 'breakfast', 'pleasant', 'could', 'full', 'fly', 'board', 'plane', 'it', 'passengers', 'free', 'early', 'business class', 'arrived', 'hour', 'due', 'minutes', 'day', 'clean', 'short', 'product', 'airport', 'ok', 'took', 'better', 'luggage', 'old', 'although', 'club world', 

In [82]:
# For negative text

train_features = v.fit_transform(texts_neg)
word_tfidf_sums = train_features.sum(axis=0)

# Convert the sums to a list and retrieve corresponding feature names
word_tfidf_sums = word_tfidf_sums.tolist()[0]
feature_names = v.get_feature_names_out()

# Create a list of tuples with word and corresponding sum of TF-IDF values
word_tfidf_pairs = [(word, score) for word, score in zip(feature_names, word_tfidf_sums)]

# Sort the list in descending order based on TF-IDF sums
word_tfidf_pairs.sort(key=lambda x: x[1], reverse=True)

# Get the top most frequent words
top_n = 100  
top_words = [word for word, score in word_tfidf_pairs[:top_n]]

print("Top", top_n, "most frequent words:", top_words)


Top 100 most frequent words: ['ba', 'service', 'london', 'seat', 'class', 'staff', 'business', 'seats', 'time', 'british', 'food', 'airways', 'british airways', 'one', 'us', 'hours', 'check', 'flight', 'business class', 'get', 'customer', 'crew', 'would', 'cabin', 'heathrow', 'cancelled', 'told', 'plane', 'economy', 'back', 'luggage', 'hour', 'delayed', 'even', 'airport', 'boarding', 'customer service', 'good', 'passengers', 'could', 'first', 'experience', 'booked', 'never', 'return', 'bag', 'minutes', 'it', 'fly', 'got', 'in', 'meal', 'new', 'late', 'airlines', 'refund', 'call', 'club', 'two', 'aircraft', 'still', 'due', 'poor', 'another', 'gate', 'last', 'day', 'arrived', 'like', 'travel', 'voucher', 'paid', 'baggage', 'pay', 'people', 'premium', 'lounge', 'check in', 'again', 'way', 'full', 'left', 'offered', 'worst', 'take', 'asked', 'days', 'said', 'made', 'long', 'ticket', 'also', 'lhr', 'make', 'next', 'given', 'much', 'money', 'go', 'via']


**Conclusion: **

From the analysis of positive reviews, the top 100 most frequent words reveal a positive sentiment associated with British Airways. The words include descriptors like "good," "great," "friendly," "excellent," and references to services like "cabin crew," "lounge," "food," and "comfortable seats."


The analysis of negative reviews reveals the top 100 most frequent words associated with British Airways. The words highlight areas of concern such as "cancelled," "delayed," "poor," "worst," and references to issues like "luggage," "customer service," "refund," and "flight delays." These words indicate customers' negative experiences and frustrations with various aspects of the airline's services.