In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, AutoTokenizer, AutoModelForSequenceClassification
import torch
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import nbformat

  from .autonotebook import tqdm as notebook_tqdm


# Data Preprocessing

In [2]:
# Load the scraped British Airways reviews straight from the project repository.
review_df = pd.read_csv('https://raw.githubusercontent.com/MuskanRaisinghani23/british-airways-customer-feedback-analysis/main/data/BA_reviews.csv')

# Drop every row that has a missing value in any column.
review_df.dropna(inplace=True)

# Strip everything up to the first '|' from each review (presumably a
# verification-tag prefix -- confirm against the raw CSV). No rows are removed.
# maxsplit=1 keeps the whole review body even when the text itself contains
# further '|' characters; a plain split('|')[1] would silently drop that tail.
review_df['review'] = review_df['review'].apply(
    lambda text: text.split('|', 1)[1] if '|' in text else text
)

# Keep only the first word of seat_type (e.g. the trailing "Class" is dropped).
review_df['seat_type'] = review_df['seat_type'].str.split(' ').str[0]

# Encode is_verified as an integer flag. The result is assigned back to the
# column: calling replace(..., inplace=True) on review_df['is_verified'] is a
# chained in-place update, which is deprecated and has no effect on review_df
# once pandas copy-on-write is active.
# NOTE(review): True is encoded as 0 and False as 1, the reverse of the usual
# 1 == True convention -- confirm this is intended before relying on the flag.
review_df['is_verified'] = review_df['is_verified'].replace({True: 0, False: 1})

review_df


Unnamed: 0,seat_type,is_verified,rating,date_posted,review
0,Economy,0,1.0,2023-11-19,4 Hours before takeoff we received a Mail st...
1,Economy,0,3.0,2023-11-19,I recently had a delay on British Airways fr...
2,Business,1,8.0,2023-11-16,"Boarded on time, but it took ages to get to ..."
3,Economy,0,1.0,2023-11-16,"5 days before the flight, we were advised by..."
4,Economy,1,1.0,2023-11-14,\nWe traveled to Lisbon for our dream vacati...
...,...,...,...,...,...
3696,Economy,1,4.0,2012-08-29,LHR-JFK-LAX-LHR. Check in was ok apart from be...
3697,Business,1,9.0,2012-08-28,LHR to HAM. Purser addresses all club passenge...
3698,Economy,1,5.0,2011-10-12,My son who had worked for British Airways urge...
3699,Premium,1,4.0,2011-10-11,London City-New York JFK via Shannon on A318 b...


# Sentiment analysis using BERT (Bidirectional Encoder Representations from Transformers)

BERT is a neural-network-based technique for language processing pre-training. 

Sentiment analysis is used to understand how customers feel about products, movies, and similar things: whether their opinion is positive, negative, or neutral.

We will use `nlptown/bert-base-multilingual-uncased-sentiment`, a bert-base-multilingual-uncased model fine-tuned for sentiment analysis on product reviews in six languages: English, Dutch, German, French, Spanish, and Italian.


In [3]:
# Load the tokenizer and the sequence-classification model from the same
# pretrained checkpoint so their vocabularies match.
MODEL_NAME = 'nlptown/bert-base-multilingual-uncased-sentiment'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

In [4]:
# Function that encodes a review and computes its sentiment score
def sentiment_score(review, max_length=512):
    """Return the 1-based index of the highest-scoring sentiment class for a review.

    The text is encoded with the module-level ``tokenizer`` and classified by
    the module-level ``model``; the score is ``argmax(logits) + 1``.

    Parameters
    ----------
    review : str
        Review text to score.
    max_length : int, default 512
        Maximum number of tokens passed to the model. Longer inputs are
        truncated by the tokenizer instead of raising inside the model.

    Returns
    -------
    int
        Predicted class index shifted to start at 1.
    """
    # Truncate at the token level: slicing the string by characters does not
    # bound the token count, and over-long inputs crash the model.
    tokens = tokenizer.encode(
        review,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        result = model(tokens)
    return int(torch.argmax(result.logits)) + 1

In [5]:
# Score every review. Each text is first cut to its first 512 characters
# (a character slice, not a token count) and then passed to sentiment_score.
clipped_reviews = review_df['review'].map(lambda text: text[:512])
review_df['sentiment_score'] = clipped_reviews.map(sentiment_score)

In [6]:
# Sorting values based on sentiment_score
# sorted_review_df=review_df.sort_values('sentiment_score')

In [11]:
# Display the frame again to inspect the newly added sentiment_score column.
review_df

Unnamed: 0,seat_type,is_verified,rating,date_posted,review,sentiment_score
0,Economy,0,1.0,2023-11-19,4 Hours before takeoff we received a Mail st...,1
1,Economy,0,3.0,2023-11-19,I recently had a delay on British Airways fr...,2
2,Business,1,8.0,2023-11-16,"Boarded on time, but it took ages to get to ...",3
3,Economy,0,1.0,2023-11-16,"5 days before the flight, we were advised by...",1
4,Economy,1,1.0,2023-11-14,\nWe traveled to Lisbon for our dream vacati...,1
...,...,...,...,...,...,...
3696,Economy,1,4.0,2012-08-29,LHR-JFK-LAX-LHR. Check in was ok apart from be...,3
3697,Business,1,9.0,2012-08-28,LHR to HAM. Purser addresses all club passenge...,4
3698,Economy,1,5.0,2011-10-12,My son who had worked for British Airways urge...,2
3699,Premium,1,4.0,2011-10-11,London City-New York JFK via Shannon on A318 b...,4


In [12]:
# The full dataset is too large to plot legibly, so plot a random sample of
# 60 reviews instead.
SAMPLE_SIZE = 60

# random_state pins the draw so a fresh Restart & Run All reproduces the same
# sample; min() avoids a ValueError when the frame has fewer rows than
# SAMPLE_SIZE.
random_sample = review_df.sample(min(SAMPLE_SIZE, len(review_df)), random_state=42)

# reset_index() moves the original row labels into an 'index' column and gives
# the sample a fresh 0..n-1 index, which is used as the x-axis when plotting.
random_sample = random_sample.reset_index()
random_sample.head()

Unnamed: 0,index,seat_type,is_verified,rating,date_posted,review,sentiment_score
0,333,Economy,0,2.0,2022-07-27,Customer service is non-existent. For over a...,1
1,3424,Business,1,10.0,2014-09-17,I recently travelled with a friend to St Lucia...,5
2,2736,Economy,1,1.0,2015-09-08,I have flown regularly with British Airways be...,2
3,1855,Economy,1,7.0,2017-01-30,"Vancouver to London Heathrow, and I was real...",3
4,2004,Premium,1,3.0,2016-11-10,London Heathrow to Bangkok with British Airw...,2


In [13]:
# Compare the model's sentiment score with the reviewer's own rating on the
# sampled reviews. The sentiment score is doubled so both lines share a
# comparable vertical scale; the legend says so explicitly.
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=random_sample.index,
        y=random_sample['sentiment_score'] * 2,
        mode='lines+markers',
        name='Sentiment score (x2)',
    )
)
fig.add_trace(
    go.Scatter(
        x=random_sample.index,
        y=random_sample['rating'],
        mode='lines+markers',
        name='Overall rating',
    )
)
# A figure should stand alone when the notebook is skimmed: title and label it.
fig.update_layout(
    title_text='Sentiment score (x2) vs. overall rating for sampled reviews',
    xaxis_title='Sampled review',
    yaxis_title='Score',
)
fig.show()

In [10]:
def _seat_type_counts(score):
    """Return (rows with the given sentiment_score, their per-seat_type counts)."""
    matching = review_df[review_df['sentiment_score'] == score]
    return matching, matching.groupby('seat_type').count()


def _seat_type_pie_figure(panels):
    """Build a one-row figure with one seat-type pie per panel.

    ``panels`` is a list of ``(subplot_title, trace_name, grouped_df)`` tuples,
    where ``grouped_df`` is indexed by seat_type and has a 'sentiment_score'
    column holding the number of reviews for that seat type.
    """
    figure = make_subplots(
        rows=1,
        cols=len(panels),
        # One independent spec dict per column; pies need the 'domain' type.
        specs=[[{'type': 'domain'} for _ in panels]],
        subplot_titles=[title for title, _, _ in panels],
    )
    for col, (_, trace_name, grouped) in enumerate(panels, start=1):
        figure.add_trace(
            go.Pie(
                labels=grouped.index.to_list(),
                values=grouped['sentiment_score'].to_list(),
                name=trace_name,
            ),
            1,
            col,
        )
    return figure


# Split the reviews by predicted sentiment score and count them per seat type.
five_value_df, grouped_df_five = _seat_type_counts(5)
four_value_df, grouped_df_four = _seat_type_counts(4)
three_value_df, grouped_df_three = _seat_type_counts(3)
two_value_df, grouped_df_two = _seat_type_counts(2)
one_value_df, grouped_df_one = _seat_type_counts(1)

# Seat-type share among positive and neutral reviews (scores 5, 4, 3).
fig = _seat_type_pie_figure([
    ('Most Satisfied', "Most positive", grouped_df_five),
    ('Little satisfied', "Medium positive", grouped_df_four),
    ('Neutral', "Neutral", grouped_df_three),
])
fig.show()

# Seat-type share among negative reviews (scores 2, 1).
fig = _seat_type_pie_figure([
    ('Little disappointed', "Medium negative", grouped_df_two),
    ('Most disappointed', "Most negative", grouped_df_one),
])
fig.show()