In [4]:
# Let's first see what technology the website uses using the 'builtwth' library
!pip install builtwith

Collecting builtwith
  Downloading builtwith-1.3.4.tar.gz (34 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: builtwith
  Building wheel for builtwith (setup.py) ... [?25l[?25hdone
  Created wheel for builtwith: filename=builtwith-1.3.4-py3-none-any.whl size=36079 sha256=32632cb89d2289a9ae3c9c2e8d47c45159b5b5044afe5525a9c2a1d128ca6e83
  Stored in directory: /root/.cache/pip/wheels/19/85/16/8396a3cc152be563ea21d302952ff7737ff23663d11c1ec864
Successfully built builtwith
Installing collected packages: builtwith
Successfully installed builtwith-1.3.4


In [5]:
import builtwith

In [6]:
website = "https://www.airlinequality.com"
result = builtwith.parse(website)
print(result)

{'cdn': ['CloudFlare'], 'advertising-networks': ['Google AdSense'], 'font-scripts': ['Google Font API'], 'photo-galleries': ['Lightbox'], 'javascript-frameworks': ['Lightbox', 'Modernizr', 'jQuery'], 'cms': ['WordPress'], 'programming-languages': ['PHP'], 'blogs': ['PHP', 'WordPress'], 'marketing-automation': ['Yoast SEO'], 'web-frameworks': ['ZURB Foundation']}


In [7]:
!pip install python-whois

Collecting python-whois
  Downloading python_whois-0.9.5-py3-none-any.whl.metadata (2.6 kB)
Downloading python_whois-0.9.5-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-whois
Successfully installed python-whois-0.9.5


In [8]:
import whois

In [9]:
print(whois.whois(website))

{
  "domain_name": "AIRLINEQUALITY.COM",
  "registrar": "TUCOWS, INC.",
  "registrar_url": [
    "http://www.tucows.com",
    "http://tucowsdomains.com"
  ],
  "reseller": "Namesco Limited",
  "whois_server": "whois.tucows.com",
  "referral_url": null,
  "updated_date": "2024-01-26 08:58:03",
  "creation_date": "2000-02-24 11:52:16",
  "expiration_date": "2025-02-24 11:52:14",
  "name_servers": [
    "AMIR.NS.CLOUDFLARE.COM",
    "CRUZ.NS.CLOUDFLARE.COM"
  ],
  "status": [
    "clientTransferProhibited https://icann.org/epp#clientTransferProhibited",
    "clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited"
  ],
  "emails": [
    "domainabuse@tucows.com",
    "transfers@names.co.uk"
  ],
  "dnssec": "unsigned",
  "name": "REDACTED FOR PRIVACY",
  "org": "REDACTED FOR PRIVACY",
  "address": "REDACTED FOR PRIVACY",
  "city": "REDACTED FOR PRIVACY",
  "state": "Greater London",
  "registrant_postal_code": "REDACTED FOR PRIVACY",
  "country": "GB"
}


In [10]:
import requests
from bs4 import BeautifulSoup

In [11]:
# https://www.airlinequality.com/airline-reviews/british-airways . This is the first page with all the reviews.

# what we will need is:
# Title
# Name
# Date
# Review
# Verification

# This means our table will have 5 columns

In [12]:
# Let's start with titles of page 1:
url = "https://www.airlinequality.com/airline-reviews/british-airways/page/1/"
response = requests.get(url) # request for page
response.raise_for_status() # Checks if the page gives successful status (200)
soup = BeautifulSoup(response.text, 'html')
title = [h2.text.strip() for h2 in soup.find_all("h2", class_="text_header")]

In [13]:
print(title)

['“customer support was terrible”', '"a really enjoyable experience"', '"Very good flight"', '"relatively comfortable elderly plane"', '"70 days chasing BA’s complaints department"', '"BA refused to reimburse me"', '"the flight was delayed"', '"BA forced us to buy new tickets"', '“staff had no sympathy or empathy”', '"the seat is outdated"']


In [14]:
# Great we now see that we can request, parse and display text details from our web page.
# let's see all the other details across our requirements.

In [15]:
name = [span.text.strip() for span in soup.find_all("span", itemprop="name")] # takes all the names for the reviews submitted

In [16]:
print(name)

['L Martin', 'Paul Lee', 'Guy Senior', 'Simon Channon', 'R Layne', 'Michael Chastain', 'S Herron', 'G Marton', 'Raeesa Carrim', 'J Meares']


In [17]:
# Let's see if we can get the dates for page 1
date = [(time["datetime"], time.text.strip()) for time in soup.find_all("time", itemprop="datePublished")]

In [18]:
print(date)

[('2025-02-07', '7th February 2025'), ('2025-02-01', '1st February 2025'), ('2025-01-20', '20th January 2025'), ('2025-01-19', '19th January 2025'), ('2025-01-15', '15th January 2025'), ('2025-01-09', '9th January 2025'), ('2025-01-05', '5th January 2025'), ('2025-01-04', '4th January 2025'), ('2025-01-03', '3rd January 2025'), ('2025-01-01', '1st January 2025')]


In [19]:
# This will display all user verification if the trip is verified or not
verification = [a.text.strip() for a in soup.find_all("a", href="https://www.airlinequality.com/verified-reviews/")]

In [20]:
print(verification)

['Trip Verified', 'Trip Verified', 'Not Verified', 'Not Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified', 'Trip Verified']


In [21]:
# Now we have seen that we can get the data we need. We can create a scrape-bot for the pages 1-70 as our sample size

In [22]:
# Scraper

base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 40
titles = []
ratings = []
names = []
dates = []
verification = []
reviews = []

# let's loop through our pages
for i in range(1, pages+1):
    page_url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize=100"
    response = requests.get(page_url, timeout=10) # Make request
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html') # Parsing

    # Getting title data from the page and adding it to the list of titles
    for h2 in soup.find_all("h2", class_="text_header"):
        titles.append(h2.get_text())

    # Getting name data from the page and adding it to the name list.
    for span in soup.find_all("span", itemprop="name"):
        names.append(span.get_text())

    # Getting date data from the page and adding it to the list of dates.
    for time in soup.find_all("time", itemprop="datePublished"):
        dates.append(time.get_text())

    # Getting verification data from the page and adding it to the verification list.
    for a in soup.find_all("a", href="https://www.airlinequality.com/verified-reviews/"):
        verification.append(a.get_text())

    # Getting review data from the page and adding it to the list of reviews.
    for div in soup.find_all("div", itemprop="reviewBody"):
        reviews.append(div.get_text())

    print(f"<--- Reviews in page {i}: {len(reviews)}")

<--- Reviews in page 1: 100
<--- Reviews in page 2: 200
<--- Reviews in page 3: 300
<--- Reviews in page 4: 400
<--- Reviews in page 5: 500
<--- Reviews in page 6: 600
<--- Reviews in page 7: 700
<--- Reviews in page 8: 800
<--- Reviews in page 9: 900
<--- Reviews in page 10: 1000
<--- Reviews in page 11: 1100
<--- Reviews in page 12: 1200
<--- Reviews in page 13: 1300
<--- Reviews in page 14: 1400
<--- Reviews in page 15: 1500
<--- Reviews in page 16: 1600
<--- Reviews in page 17: 1700
<--- Reviews in page 18: 1800
<--- Reviews in page 19: 1900
<--- Reviews in page 20: 2000
<--- Reviews in page 21: 2100
<--- Reviews in page 22: 2200
<--- Reviews in page 23: 2300
<--- Reviews in page 24: 2400
<--- Reviews in page 25: 2500
<--- Reviews in page 26: 2600
<--- Reviews in page 27: 2700
<--- Reviews in page 28: 2800
<--- Reviews in page 29: 2900
<--- Reviews in page 30: 3000
<--- Reviews in page 31: 3100
<--- Reviews in page 32: 3200
<--- Reviews in page 33: 3300
<--- Reviews in page 34: 340

In [23]:
# Let's Update our dataframe with the values we scraped from the website pages

import pandas as pd
data = pd.DataFrame()
data["Title"] = titles
data["Reviews"] = reviews

In [24]:
data.head()

Unnamed: 0,Title,Reviews
0,“customer support was terrible”,"✅ Trip Verified | Terrible overall, medium ser..."
1,"""a really enjoyable experience""",✅ Trip Verified | London Heathrow to Male In...
2,"""Very good flight""",Not Verified | Very good flight following an ...
3,"""relatively comfortable elderly plane""",Not Verified | An hour's delay due to late ar...
4,"""70 days chasing BA’s complaints department""",✅ Trip Verified | I booked through BA becaus...


In [25]:
# Let's split the reviews and draw out the verification data
data[["Reviews", "Verification"]] = data["Reviews"].str.split("|", expand=True)

In [26]:
data.head()

Unnamed: 0,Title,Reviews,Verification
0,“customer support was terrible”,✅ Trip Verified,"Terrible overall, medium service and the flig..."
1,"""a really enjoyable experience""",✅ Trip Verified,London Heathrow to Male In new business cla...
2,"""Very good flight""",Not Verified,Very good flight following an equally good f...
3,"""relatively comfortable elderly plane""",Not Verified,An hour's delay due to late arrival of the i...
4,"""70 days chasing BA’s complaints department""",✅ Trip Verified,I booked through BA because Loganair don’t ...


In [27]:
mask = data["Reviews"].str.contains("\u2705")
data.loc[mask, ["Reviews"]] = data.loc[mask, "Reviews"].str.split("\u2705", expand=True)[1]

In [28]:
data.head()

Unnamed: 0,Title,Reviews,Verification
0,“customer support was terrible”,Trip Verified,"Terrible overall, medium service and the flig..."
1,"""a really enjoyable experience""",Trip Verified,London Heathrow to Male In new business cla...
2,"""Very good flight""",Not Verified,Very good flight following an equally good f...
3,"""relatively comfortable elderly plane""",Not Verified,An hour's delay due to late arrival of the i...
4,"""70 days chasing BA’s complaints department""",Trip Verified,I booked through BA because Loganair don’t ...


# Data Cleaning
Here we will remove all special characters, leaving alphabetical characters

## Uniform-case Characters
We will make all letters lowercase


## Tokenization
We will  break the text into smaller pieces called Tokens. It can be performed at sentences(sentence tokenization) or word level(word tokenization).

## Enrichment using POS tagging
Parts of Speech (POS) tagging is a process of converting each token into a tuple having the form (word, tag). POS tagging essential to preserve the context of the word and is essential for Lemmatization

## Stopword Removal
Removes unimportant words

## Stemming
Reduces words to root form

## Lemmatization
Converts words to dictionary form

In [29]:
# imports

import nltk
import re
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tag import pos_tag

# downloads

nltk.download("punkt")
nltk.download("stopwords")
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [30]:
text = data["Reviews"]

In [31]:
def clean_text(text): # Applies several NLP cleaning steps to text data
  text = text.lower() # lowercasing
  text = re.sub(r"[]")