**DATA EXTRACTION**

In [329]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [330]:
base_url="https://www.airlinequality.com/airline-reviews/british-airways"
pages=10
page_size=150
reviews=[]

for i in range(1,pages+1):
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    page=requests.get(url)
    soup=BeautifulSoup(page.text,'html.parser')
    for content in soup.find_all("div",{"class":"text_content"}):
        reviews.append(content.get_text())
len(reviews)

1500

In [331]:
data = {
    "Id": list(range(1, len(reviews) + 1)),
    "Reviews": reviews,
}


df_data=pd.DataFrame(data)
print(df_data.head())

   Id                                            Reviews
0   1  ✅ Trip Verified |   Flight mainly let down by ...
1   2  ✅ Trip Verified |   Another awful experience b...
2   3  ✅ Trip Verified |   The service was rude, full...
3   4  ✅ Trip Verified |   This flight was a joke. Th...
4   5  ✅ Trip Verified |   This time British Airways ...


In [333]:
df_data.to_csv(r'C:\Users\karth\Desktop\Data Analysis\British Airways\Data\BA_reviews.csv')

**DATA CLEANING**

In [334]:
df_data['Reviews'] = df_data['Reviews'].str.replace('Trip Verified | ', '')  
df_data['Reviews'] = df_data['Reviews'].str.replace('Trip Verified |  ', '')  
df_data['Reviews'] = df_data['Reviews'].str.replace('Trip Verified |', '')  
df_data['Reviews'] = df_data['Reviews'].str.replace('Not Verified |  ', '')  
print(df_data.head(100))

     Id                                            Reviews
0     1  ✅   Flight mainly let down by a disagreeable f...
1     2  ✅   Another awful experience by British Airway...
2     3  ✅   The service was rude, full of attitude to ...
3     4  ✅   This flight was a joke. There was four peo...
4     5  ✅    This time British Airways managed to get ...
..  ...                                                ...
95   96  ✅ Flight cancelled due to bad weather, BA cont...
96   97  ✅  British Airways oversold my LHR to LAX flig...
97   98  ✅  I travelled London to Doha on July 16th, I ...
98   99  ✅  When dropping off my luggage at the luggage...
99  100  ✅ If you can’t fly First Class don’t fly Briti...

[100 rows x 2 columns]


In [335]:
import re

def clean_row(row):
  return row.apply(lambda x: re.sub(r'[^a-zA-Z\s-]', '', str(x)))

def clean_dataframe(df_data):
  return df_data.apply(clean_row, axis=1)
cleaned_df =[]
cleaned_df = clean_dataframe(df_data)
print(cleaned_df)

     Id                                            Reviews
0           Flight mainly let down by a disagreeable fl...
1           Another awful experience by British Airways...
2           The service was rude full of attitude to me...
3           This flight was a joke There was four peopl...
4            This time British Airways managed to get e...
...  ..                                                ...
1495     Two regular an uneventful flights Curiously en...
1496      London to Belfast Another regular flight by B...
1497      Very full flight on G-BNLPB flying from Miami...
1498      Warsaw to London WAW is not a pleasant airpor...
1499       I booked my flight with Cathay Pacific  the ...

[1500 rows x 2 columns]


In [337]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')

import nltk
#nltk.download('vader_lexicon')

In [338]:
df=pd.DataFrame(cleaned_df)

In [339]:
df

Unnamed: 0,Id,Reviews
0,,Flight mainly let down by a disagreeable fl...
1,,Another awful experience by British Airways...
2,,The service was rude full of attitude to me...
3,,This flight was a joke There was four peopl...
4,,This time British Airways managed to get e...
...,...,...
1495,,Two regular an uneventful flights Curiously en...
1496,,London to Belfast Another regular flight by B...
1497,,Very full flight on G-BNLPB flying from Miami...
1498,,Warsaw to London WAW is not a pleasant airpor...


In [340]:
print(df['Id'].dtype)

object


In [341]:
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

sia = SentimentIntensityAnalyzer()
sia

<nltk.sentiment.vader.SentimentIntensityAnalyzer at 0x206be344d10>

In [342]:
res = {}
for i, row in tqdm(df.iterrows(), total=len(df)):
    Reviews = row['Reviews']
    Id = i
    res[Id]= sia.polarity_scores(Reviews)
len(res)

  0%|          | 0/1500 [00:00<?, ?it/s]

1500

In [368]:
res

{0: {'neg': 0.124, 'neu': 0.799, 'pos': 0.077, 'compound': -0.8765},
 1: {'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'compound': -0.8294},
 2: {'neg': 0.107, 'neu': 0.893, 'pos': 0.0, 'compound': -0.4588},
 3: {'neg': 0.061, 'neu': 0.857, 'pos': 0.082, 'compound': 0.0},
 4: {'neg': 0.0, 'neu': 0.767, 'pos': 0.233, 'compound': 0.9831},
 5: {'neg': 0.03, 'neu': 0.878, 'pos': 0.092, 'compound': 0.6682},
 6: {'neg': 0.071, 'neu': 0.749, 'pos': 0.18, 'compound': 0.9428},
 7: {'neg': 0.083, 'neu': 0.884, 'pos': 0.033, 'compound': -0.9026},
 8: {'neg': 0.049, 'neu': 0.927, 'pos': 0.024, 'compound': -0.8095},
 9: {'neg': 0.046, 'neu': 0.786, 'pos': 0.168, 'compound': 0.9437},
 10: {'neg': 0.058, 'neu': 0.583, 'pos': 0.358, 'compound': 0.995},
 11: {'neg': 0.23, 'neu': 0.649, 'pos': 0.121, 'compound': -0.5994},
 12: {'neg': 0.011, 'neu': 0.65, 'pos': 0.338, 'compound': 0.9892},
 13: {'neg': 0.0, 'neu': 0.729, 'pos': 0.271, 'compound': 0.8514},
 14: {'neg': 0.108, 'neu': 0.671, 'pos': 0.221, 'compo

In [369]:
sent_analysis=pd.DataFrame(res).T

In [370]:
sent_analysis

Unnamed: 0,neg,neu,pos,compound
0,0.124,0.799,0.077,-0.8765
1,0.289,0.711,0.000,-0.8294
2,0.107,0.893,0.000,-0.4588
3,0.061,0.857,0.082,0.0000
4,0.000,0.767,0.233,0.9831
...,...,...,...,...
1495,0.090,0.817,0.093,0.0736
1496,0.055,0.807,0.138,0.7715
1497,0.112,0.827,0.061,-0.7198
1498,0.073,0.800,0.127,0.8124


In [372]:
df['Id'] = pd.to_numeric(df['Id'], errors='coerce').astype('Int64')

In [367]:
#sent_analysis = sent_analysis.rename(columns={'index': 'Id'})

In [374]:
sent_analysis['Id'] = pd.to_numeric(sent_analysis[i], errors='coerce').astype('Int64')

KeyError: 1499

In [356]:
#sent_analysis = sent_analysis.merge(df, how='left')
#sent_analysis = sent_analysis.merge(df, how='left')

In [357]:
ids = list(range(0, len(sent_analysis)))
sent_analysis['Id'] = ids

print(sent_analysis)

        neg    neu    pos  compound    Id
0     0.124  0.799  0.077   -0.8765     0
1     0.289  0.711  0.000   -0.8294     1
2     0.107  0.893  0.000   -0.4588     2
3     0.061  0.857  0.082    0.0000     3
4     0.000  0.767  0.233    0.9831     4
...     ...    ...    ...       ...   ...
1495  0.090  0.817  0.093    0.0736  1495
1496  0.055  0.807  0.138    0.7715  1496
1497  0.112  0.827  0.061   -0.7198  1497
1498  0.073  0.800  0.127    0.8124  1498
1499  0.105  0.895  0.000   -0.5996  1499

[1500 rows x 5 columns]


In [358]:
print(sent_analysis['Id'].dtype)

int64


In [359]:
print(df['Id'].dtype)

Int64


In [360]:
sent_analysis

Unnamed: 0,neg,neu,pos,compound,Id
0,0.124,0.799,0.077,-0.8765,0
1,0.289,0.711,0.000,-0.8294,1
2,0.107,0.893,0.000,-0.4588,2
3,0.061,0.857,0.082,0.0000,3
4,0.000,0.767,0.233,0.9831,4
...,...,...,...,...,...
1495,0.090,0.817,0.093,0.0736,1495
1496,0.055,0.807,0.138,0.7715,1496
1497,0.112,0.827,0.061,-0.7198,1497
1498,0.073,0.800,0.127,0.8124,1498


In [364]:
sent_analysis = sent_analysis.merge(df)

In [365]:
sent_analysis

Unnamed: 0,neg,neu,pos,compound,Id,Reviews
0,,,,,,Flight mainly let down by a disagreeable fl...
1,,,,,,Another awful experience by British Airways...
2,,,,,,The service was rude full of attitude to me...
3,,,,,,This flight was a joke There was four peopl...
4,,,,,,This time British Airways managed to get e...
...,...,...,...,...,...,...
1495,,,,,,Two regular an uneventful flights Curiously en...
1496,,,,,,London to Belfast Another regular flight by B...
1497,,,,,,Very full flight on G-BNLPB flying from Miami...
1498,,,,,,Warsaw to London WAW is not a pleasant airpor...
