In [140]:
# Let's first see what technology the website uses using the 'builtwth' library
!pip install builtwith



In [141]:
import builtwith

In [142]:
website = "https://www.airlinequality.com"
result = builtwith.parse(website)
print(result)

{'cdn': ['CloudFlare'], 'advertising-networks': ['Google AdSense'], 'font-scripts': ['Google Font API'], 'photo-galleries': ['Lightbox'], 'javascript-frameworks': ['Lightbox', 'Modernizr', 'jQuery'], 'cms': ['WordPress'], 'programming-languages': ['PHP'], 'blogs': ['PHP', 'WordPress'], 'marketing-automation': ['Yoast SEO'], 'web-frameworks': ['ZURB Foundation']}


In [143]:
!pip install python-whois



In [144]:
import whois

In [145]:
print(whois.whois(website))

{
  "domain_name": "AIRLINEQUALITY.COM",
  "registrar": "TUCOWS, INC.",
  "registrar_url": [
    "http://www.tucows.com",
    "http://tucowsdomains.com"
  ],
  "reseller": "Namesco Limited",
  "whois_server": "whois.tucows.com",
  "referral_url": null,
  "updated_date": "2024-01-26 08:58:03",
  "creation_date": "2000-02-24 11:52:16",
  "expiration_date": "2025-02-24 11:52:14",
  "name_servers": [
    "AMIR.NS.CLOUDFLARE.COM",
    "CRUZ.NS.CLOUDFLARE.COM"
  ],
  "status": [
    "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
    "clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited"
  ],
  "emails": [
    "domainabuse@tucows.com",
    "transfers@names.co.uk"
  ],
  "dnssec": "unsigned",
  "name": "REDACTED FOR PRIVACY",
  "org": "REDACTED FOR PRIVACY",
  "address": "REDACTED FOR PRIVACY",
  "city": "REDACTED FOR PRIVACY",
  "state": "Greater London",
  "registrant_postal_code": "REDACTED FOR PRIVACY",
  "country": "GB"
}


In [146]:
import requests
from bs4 import BeautifulSoup

In [147]:
# https://www.airlinequality.com/airline-reviews/british-airways . This is the first page with all the reviews.

# what we will need is:
# Title
# Name
# Date
# Review
# Verification

# This means our table will have 5 columns

In [148]:
# Let's start with titles of page 1:
url = "https://www.airlinequality.com/airline-reviews/british-airways/page/1/"
response = requests.get(url) # request for page
response.raise_for_status() # Checks if the page gives successful status (200)
soup = BeautifulSoup(response.text, 'html')
title = [h2.text.strip() for h2 in soup.find_all("h2", class_="text_header")]

In [149]:
print(title)

['“food has really gone downhill”', '"thoroughly enjoyed this flight"', '“customer support was terrible”', '"a really enjoyable experience"', '"Very good flight"', '"relatively comfortable elderly plane"', '"70 days chasing BA’s complaints department"', '"BA refused to reimburse me"', '"the flight was delayed"', '"BA forced us to buy new tickets"']


In [150]:
# Great we now see that we can request, parse and display text details from our web page.
# let's see all the other details across our requirements.

In [151]:
name = [span.text.strip() for span in soup.find_all("span", itemprop="name")] # takes all the names for the reviews submitted

In [152]:
print(name)

['John Prescott', 'A Hashin', 'L Martin', 'Paul Lee', 'Guy Senior', 'Simon Channon', 'R Layne', 'Michael Chastain', 'S Herron', 'G Marton']


In [153]:
# Let's see if we can get the dates for page 1
date = [(time["datetime"], time.text.strip()) for time in soup.find_all("time", itemprop="datePublished")]

In [154]:
print(date)

[('2025-02-14', '14th February 2025'), ('2025-02-14', '14th February 2025'), ('2025-02-07', '7th February 2025'), ('2025-02-01', '1st February 2025'), ('2025-01-20', '20th January 2025'), ('2025-01-19', '19th January 2025'), ('2025-01-15', '15th January 2025'), ('2025-01-09', '9th January 2025'), ('2025-01-05', '5th January 2025'), ('2025-01-04', '4th January 2025')]


In [155]:
# This will display all user verification if the trip is verified or not
verification = [a.text.strip() for a in soup.find_all("a", href="https://www.airlinequality.com/verified-reviews/")]

In [156]:
print(verification)

['Trip Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified', 'Not Verified', 'Not Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified']


In [157]:
# Now we have seen that we can get the data we need. We can create a scrape-bot for the pages 1-70 as our sample size

# Web Scrapping
 We will loop through the pages on Skytrax.com and collect the necessary data.

 We may not use all the collected data however, for future manipulation it may be vital.

In [158]:
# Scraper

base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 40
titles = []
ratings = []
names = []
dates = []
reviews = []

# let's loop through our pages
for i in range(1, pages+1):
    page_url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize=100"
    response = requests.get(page_url, timeout=10) # Make request
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html') # Parsing

    # Getting title data from the page and adding it to the list of titles
    for h2 in soup.find_all("h2", class_="text_header"):
        titles.append(h2.get_text())

    # Getting name data from the page and adding it to the name list.
    for span in soup.find_all("span", itemprop="name"):
        names.append(span.get_text())

    # Getting date data from the page and adding it to the list of dates.
    for time in soup.find_all("time", itemprop="datePublished"):
        dates.append(time.get_text())

    # Getting review data from the page and adding it to the list of reviews.
    for div in soup.find_all("div", itemprop="reviewBody", ):
        reviews.append(div.get_text())

    print(f"<--- Reviews in page {i}: {len(reviews)}")

<--- Reviews in page 1: 100
<--- Reviews in page 2: 200
<--- Reviews in page 3: 300
<--- Reviews in page 4: 400
<--- Reviews in page 5: 500
<--- Reviews in page 6: 600
<--- Reviews in page 7: 700
<--- Reviews in page 8: 800
<--- Reviews in page 9: 900
<--- Reviews in page 10: 1000
<--- Reviews in page 11: 1100
<--- Reviews in page 12: 1200
<--- Reviews in page 13: 1300
<--- Reviews in page 14: 1400
<--- Reviews in page 15: 1500
<--- Reviews in page 16: 1600
<--- Reviews in page 17: 1700
<--- Reviews in page 18: 1800
<--- Reviews in page 19: 1900
<--- Reviews in page 20: 2000
<--- Reviews in page 21: 2100
<--- Reviews in page 22: 2200
<--- Reviews in page 23: 2300
<--- Reviews in page 24: 2400
<--- Reviews in page 25: 2500
<--- Reviews in page 26: 2600
<--- Reviews in page 27: 2700
<--- Reviews in page 28: 2800
<--- Reviews in page 29: 2900
<--- Reviews in page 30: 3000
<--- Reviews in page 31: 3100
<--- Reviews in page 32: 3200
<--- Reviews in page 33: 3300
<--- Reviews in page 34: 340

In [159]:
# Let's Update our dataframe with the values we scraped from the website pages

import pandas as pd
data = pd.DataFrame()
#data["Title"] = titles
# commented the above statement because in older reviews there are no review titles.
data["Reviews"] = reviews

In [160]:
data

Unnamed: 0,Reviews
0,"✅ Trip Verified | First the good news, the clu..."
1,✅ Trip Verified | I have never travelled wit...
2,"✅ Trip Verified | Terrible overall, medium ser..."
3,✅ Trip Verified | London Heathrow to Male In...
4,Not Verified | Very good flight following an ...
...,...
3910,Business LHR to BKK. 747-400. First try back w...
3911,LHR to HAM. Purser addresses all club passenge...
3912,My son who had worked for British Airways urge...
3913,London City-New York JFK via Shannon on A318 b...


# Data Cleaning

This is  the process of fixing or removing incorrect, corrupted, incorrectly formatted, duplicate, or incomplete data within a dataset.

When sourcing data, there are many opportunities for data to be in a format in which it is difficult to analyze or manipulate.

## Text Cleaning

### Removing inconsistent data
We remove the "✅ Trip Verified" and "Not Verified".
They do not appear in older reviews.

Additionaly, this text will bring errors in our analysis later on. Such as

In [161]:
# Let's split the reviews, removing the verification because older reviews lack this data

# creating a boolean mask for rows with "Verified" text
mask_verification = data["Reviews"].str.contains("Verified", na=False)

# We now remove the text that appears before the "|" in the rows with "Verified"
data.loc[mask_verification, "Reviews"] = (data.loc[mask_verification, "Reviews"].str.replace(r"^.*?\|", "", regex=True).str.strip())

In [162]:
data

Unnamed: 0,Reviews
0,"First the good news, the club suites are such ..."
1,I have never travelled with British airways be...
2,"Terrible overall, medium service and the fligh..."
3,London Heathrow to Male In new business class....
4,Very good flight following an equally good fli...
...,...
3910,Business LHR to BKK. 747-400. First try back w...
3911,LHR to HAM. Purser addresses all club passenge...
3912,My son who had worked for British Airways urge...
3913,London City-New York JFK via Shannon on A318 b...


## Converting Text To Lowercase

Converting text to lowercase is a common preprocessing step in data cleaning and natural language processing.  

It transforms all text characters to their lowercase equivalents, ensuring uniformity and consistency within the dataset.

This is crucial because it treats words with the same spelling but different capitalization as identical, preventing them from being interpreted as distinct entities by algorithms and analytical tools.  

This standardization simplifies tasks like text comparison, search, and analysis, leading to more accurate and efficient results.  

For example, "Flight", "flight", and "FLIGHT" would all be treated as "flight" after lowercasing.

In [163]:
data[["Reviews"]] = data[["Reviews"]].apply(lambda x: x.str.lower())

In [164]:
data

Unnamed: 0,Reviews
0,"first the good news, the club suites are such ..."
1,i have never travelled with british airways be...
2,"terrible overall, medium service and the fligh..."
3,london heathrow to male in new business class....
4,very good flight following an equally good fli...
...,...
3910,business lhr to bkk. 747-400. first try back w...
3911,lhr to ham. purser addresses all club passenge...
3912,my son who had worked for british airways urge...
3913,london city-new york jfk via shannon on a318 b...


### Removing All Special Characters/Punctuations

Removing special characters and punctuation from data is a preprocessing step that cleans and standardizes text.  These characters, while sometimes meaningful, can often hinder analysis and confuse algorithms, particularly in natural language processing and machine learning.

Removing them creates a more uniform dataset, improving consistency, simplifying analysis, and optimizing the data for tasks like text mining, sentiment analysis, and model training.  

This ensures more accurate and reliable results by focusing on the core textual content.


In [170]:
data["Reviews"] = data["Reviews"].str.replace('[^\w\s]', '')
data

Unnamed: 0,Reviews
0,"first the good news, the club suites are such ..."
1,i have never travelled with british airways be...
2,"terrible overall, medium service and the fligh..."
3,london heathrow to male in new business class....
4,very good flight following an equally good fli...
...,...
3910,business lhr to bkk. 747-400. first try back w...
3911,lhr to ham. purser addresses all club passenge...
3912,my son who had worked for british airways urge...
3913,london city-new york jfk via shannon on a318 b...


In [166]:
# imports

import nltk
import re
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag

# downloads

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [169]:
data

Unnamed: 0,Reviews
0,"first the good news, the club suites are such ..."
1,i have never travelled with british airways be...
2,"terrible overall, medium service and the fligh..."
3,london heathrow to male in new business class....
4,very good flight following an equally good fli...
...,...
3910,business lhr to bkk. 747-400. first try back w...
3911,lhr to ham. purser addresses all club passenge...
3912,my son who had worked for british airways urge...
3913,london city-new york jfk via shannon on a318 b...


In [168]:
def clean_text(text): # Applies several NLP cleaning steps to text data
  text = text.lower() # Converting to lowercase
  text = re.sub(r"[^a-zA-Z\s]","", regex=True) # Removing special characters, numbers and punctuations
  data["pos_tags"] = pos_tag(word_tokenize(text)) # assigns POS to each tokenixed word
  data["tokenized"] = [word for word in tokens if word not in set(stopwords.words("english"))] # stop word removal
  data["stemmed_tokens"] = [stemmer.stem(word) for word in tokens] # stemming(converting words to root form)
  data["lemma"] = [token.lemma_ for token in nlp("".join(tokens))]
