##   Scraping data from Skytrax - Task 1

In [5]:
# importing the libraries
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os

In [6]:
# looping the cols i need from the website
countries = []
dates = []
reviews = []
ratings = []

for i in range(1, 36):
    page = requests.get(f"https://www.airlinequality.com/airline-reviews/british-airways/page/{i}/?sortby=post_date%3ADesc&pagesize=100")
    
    soup = BeautifulSoup(page.content, "html.parser")

    # Country
    for item in soup.find_all("h3"):
        if item.span and item.span.next_sibling:
            countries.append(item.span.next_sibling.text.strip(" ()"))
        else:
            countries.append("Unknown")

    # Date
    for item in soup.find_all("time"):
        dates.append(item.text.strip())

    # Reviews
    for item in soup.find_all("div", class_="text_content"):
        reviews.append(item.text.strip())
    # Ratings
    for item in soup.find_all("div", class_="rating-10"):
        ratings.append(item.text.strip())

In [7]:
len(reviews)

3500

In [8]:
len(countries)


3500

In [9]:
len(dates)

3500

In [10]:
len(ratings)

3535

In [11]:
# trimming the ratings col to 3500 rows for column consistency
ratings = ratings[:3500] + ["No Rating"] * (3500 - len(ratings))

In [12]:
# structuring the cols and ing a pd df from the lists above
df = pd.DataFrame({
    "Review": reviews,
    "Date": dates,
    "Country": countries,
    "Rating": ratings
})


In [13]:
df

Unnamed: 0,Review,Date,Country,Rating
0,✅ Trip Verified | Check-in was slow with onl...,20th May 2025,Australia,5/10
1,Not Verified | Check in was smooth enough. Bo...,18th May 2025,United Kingdom,6/10
2,✅ Trip Verified | Although transferring to thi...,15th May 2025,United States,4/10
3,✅ Trip Verified | We are extremely grateful ...,8th May 2025,Switzerland,8/10
4,✅ Trip Verified | I had an appalling experie...,29th April 2025,South Africa,10/10
...,...,...,...,...
3495,LAX-LHR-LAX on the A380.The first thing I noti...,10th December 2014,United States,5/10
3496,Phoenix to London - outbound a wonderful and e...,10th December 2014,United States,3/10
3497,BA216 from Washington to London in World Trave...,4th December 2014,United States,7/10
3498,LHR-BGO on A319. Flight on time and only half ...,4th December 2014,United Kingdom,1/10


In [14]:
# saving the df to csv file
df["Rating"] = df["Rating"].apply(lambda x: f"{x} rating") # modifying the Ratings col
df.to_csv("data/Web_Scraping.csv", index=False)

In [15]:
df

Unnamed: 0,Review,Date,Country,Rating
0,✅ Trip Verified | Check-in was slow with onl...,20th May 2025,Australia,5/10 rating
1,Not Verified | Check in was smooth enough. Bo...,18th May 2025,United Kingdom,6/10 rating
2,✅ Trip Verified | Although transferring to thi...,15th May 2025,United States,4/10 rating
3,✅ Trip Verified | We are extremely grateful ...,8th May 2025,Switzerland,8/10 rating
4,✅ Trip Verified | I had an appalling experie...,29th April 2025,South Africa,10/10 rating
...,...,...,...,...
3495,LAX-LHR-LAX on the A380.The first thing I noti...,10th December 2014,United States,5/10 rating
3496,Phoenix to London - outbound a wonderful and e...,10th December 2014,United States,3/10 rating
3497,BA216 from Washington to London in World Trave...,4th December 2014,United States,7/10 rating
3498,LHR-BGO on A319. Flight on time and only half ...,4th December 2014,United Kingdom,1/10 rating
