# Web Scraping
---

## 0. Constants

In [1]:
REVIEWS_DATA_EXPORT = "../data/raw/reviews.csv"

---
## 1. Imports

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

---
## 2. Web Scraping

In [3]:
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 25
page_size = 100

reviews = []

for i in range(1, pages + 1):

    print(f"Scraping page {i}")

    # Create URL to collect links from paginated data
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"

    # Collect HTML data from this page
    response = requests.get(url)

    # Parse content
    content = response.content
    parsed_content = BeautifulSoup(content, 'html.parser')
    
    for element in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(element.get_text())
    
    print(f"   ---> {len(reviews)} total reviews")

Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews
Scraping page 11
   ---> 1100 total reviews
Scraping page 12
   ---> 1200 total reviews
Scraping page 13
   ---> 1300 total reviews
Scraping page 14
   ---> 1400 total reviews
Scraping page 15
   ---> 1500 total reviews
Scraping page 16
   ---> 1600 total reviews
Scraping page 17
   ---> 1700 total reviews
Scraping page 18
   ---> 1800 total reviews
Scraping page 19
   ---> 1900 total reviews
Scraping page 20
   ---> 2000 total reviews
Scraping page 21
   ---> 2100 total reviews
Scraping page 22
   ---> 2200 total reviews
Scraping page 23
   ---> 2300 total reviews
Scrapi

In [21]:
df = pd.DataFrame()
df["reviews"] = reviews
df.head()

Unnamed: 0,reviews
0,✅ Trip Verified | I have flown with BA for ma...
1,✅ Trip Verified | Boarding at Mumbai was chaot...
2,"Not Verified | Mexico City Airport is a zoo, b..."
3,"✅ Trip Verified | Very poor service, very fru..."
4,Not Verified | Generally poor. Sent to gate o...


In [24]:
df['reviews']=df['reviews'].str.lstrip('✅ Trip Verified |')
df['reviews']=df['reviews'].str.lstrip('Not Verified |')
df['reviews'] = df['reviews'].str.strip()
df['reviews']= df['reviews'].str.lower()

In [25]:
df.head()

Unnamed: 0,reviews
0,i have flown with ba for many years and there ...
1,boarding at mumbai was chaotic and badly organ...
2,"mexico city airport is a zoo, but taking the l..."
3,"y poor service, very frustrating. firstly my f..."
4,"generally poor. sent to gate on time, sat mayb..."


---
## 3. Save Data

In [27]:
df.to_csv(REVIEWS_DATA_EXPORT)