In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [4]:


# Define the base URL and parameters for scraping
base_url = "https://www.airlinequality.com/airline-reviews/british-airways"
pages = 10
page_size = 100

# Lists to store scraped data
reviews = []
df = pd.DataFrame()

# Iterate through each page
for i in range(1, pages + 1):
    print(f"Scraping page {i}")
    # Construct the URL for the current page
    url = f"{base_url}/page/{i}/?sortby=post_date%3ADesc&pagesize={page_size}"
    # Send a GET request to the URL
    response = requests.get(url)
    # Extract the content from the response
    content = response.content
    # Parse the HTML content using BeautifulSoup
    parsed_content = BeautifulSoup(content, 'html.parser')

    # Extract review text from each review block
    for para in parsed_content.find_all("div", {"class": "text_content"}):
        reviews.append(para.get_text())

    # Extract review ratings and other details
    for para2 in parsed_content.find_all("div", {"class": "review-stats"}):
        rating = []
        category = []
        recomend = ""
        
        # Extract numeric ratings and recommendation status
        for para3 in para2.find_all('td', {'class': 'review-value'}):
            rating.append(para3.get_text())
            if 'recommend' in para3.get_text().lower():
                recomend = para3.get_text()
                
        # Extract star ratings
        for para4 in para2.find_all('td', {'class': 'review-rating-stars stars'}):
            para5 = len(para4.find_all('span', {'class': 'star fill'}))
            rating.append(para5)

        # Extract rating categories
        for para6 in para2.find_all('td', {'class': 'review-rating-header'}):
            category.append(para6.get_text().strip())

        # Add recommendation status as a category if present
        if recomend:
            category.append("Recommended")
            rating.append(recomend)

        # Create a dictionary from the extracted data
        data_dict = dict(zip(category, rating))
        # Convert the dictionary to a DataFrame and append it to the main DataFrame
        df = pd.concat([df, pd.DataFrame([data_dict])], ignore_index=True)

    print(f"   ---> {len(reviews)} total reviews")

# Add the 'reviews' column to the dataframe
df["reviews"] = reviews





Scraping page 1
   ---> 100 total reviews
Scraping page 2
   ---> 200 total reviews
Scraping page 3
   ---> 300 total reviews
Scraping page 4
   ---> 400 total reviews
Scraping page 5
   ---> 500 total reviews
Scraping page 6
   ---> 600 total reviews
Scraping page 7
   ---> 700 total reviews
Scraping page 8
   ---> 800 total reviews
Scraping page 9
   ---> 900 total reviews
Scraping page 10
   ---> 1000 total reviews


In [5]:
df.to_csv("BritishAirways_reviews.csv", index=False)  
# Save the DataFrame to a CSV file without including the index

In [6]:
df

Unnamed: 0,Aircraft,Type Of Traveller,Seat Type,Route,Date Flown,Seat Comfort,Cabin Staff Service,Food & Beverages,Inflight Entertainment,Ground Service,Value For Money,Recommended,Wifi & Connectivity,reviews
0,Boeing 777,Business,Business Class,London to Tokyo,March 2024,yes,5.0,2.0,5.0,5.0,3,4,,✅ Trip Verified | A last minute business trip ...
1,A320,Business,Economy Class,Lisbon to London,March 2024,yes,4.0,,,3.0,1,3,,✅ Trip Verified | Overall I would say disapp...
2,Boeing 777,Couple Leisure,Business Class,London to Delhi,February 2024,yes,5.0,5.0,5.0,5.0,5,5,,Not Verified | LHR to Delhi in Business. Exce...
3,A320,Couple Leisure,Economy Class,London to Milan,March 2024,yes,5.0,5.0,,5.0,4,5,,Not Verified | Efficient and Smooth flight fr...
4,,Couple Leisure,Economy Class,Madrid to London Heathrow,March 2024,no,3.0,,,3.0,3,2,,✅ Trip Verified | Was told we can not take han...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,,Couple Leisure,Business Class,Bridgetown to Gatwick,March 2019,no,4.0,1.0,,2,4,1,,✅ Trip Verified | Bridgetown to London Gatwic...
996,777,Couple Leisure,Economy Class,St Lucia to Gatwick,April 2019,no,1.0,2.0,1.0,1,1,1,,✅ Trip Verified | St Lucia to Gatwick on whic...
997,A380,Solo Leisure,Economy Class,Chicago to London,April 2019,no,1.0,1.0,,1,1,1,,✅ Trip Verified | Chicago to London. Cancelle...
998,,Family Leisure,Economy Class,London to Bangalore,September 2018,no,3.0,1.0,,1,2,3,,✅ Trip Verified | London to Bangalore. This w...


In the below line of code we are going to perform the following processes:


In [8]:
df = pd.read_csv('BritishAirways_reviews.csv')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Aircraft                534 non-null    object 
 1   Type Of Traveller       998 non-null    object 
 2   Seat Type               1000 non-null   object 
 3   Route                   997 non-null    object 
 4   Date Flown              1000 non-null   object 
 5   Seat Comfort            914 non-null    object 
 6   Cabin Staff Service     907 non-null    float64
 7   Food & Beverages        786 non-null    float64
 8   Inflight Entertainment  520 non-null    float64
 9   Ground Service          943 non-null    object 
 10  Value For Money         1000 non-null   object 
 11  Recommended             1000 non-null   int64  
 12  Wifi & Connectivity     300 non-null    float64
 13  reviews                 1000 non-null   object 
dtypes: float64(4), int64(1), object(9)
memory

In [9]:
null_values = df.isnull().sum()  # Count the number of null values in each column

print("Null values in each column:")
print(null_values)


Null values in each column:
Aircraft                  466
Type Of Traveller           2
Seat Type                   0
Route                       3
Date Flown                  0
Seat Comfort               86
Cabin Staff Service        93
Food & Beverages          214
Inflight Entertainment    480
Ground Service             57
Value For Money             0
Recommended                 0
Wifi & Connectivity       700
reviews                     0
dtype: int64


In [17]:
# Specify the columns for which you want to replace NaN values with the mean
columns_to_fix = ['Ground Service', 'Inflight Entertainment', 'Food & Beverages', 'Cabin Staff Service', 'Seat Comfort', 'Ground Service', 'Wifi & Connectivity']

# Replace NaN values with the mean of specified columns
df[columns_to_fix] = df[columns_to_fix].fillna(df[columns_to_fix].mean())

KeyError: "None of [Index(['Ground Service', 'Inflight Entertainment', 'Food & Beverages',\n       'Cabin Staff Service', 'Seat Comfort', 'Ground Service',\n       'Wifi & Connectivity'],\n      dtype='object')] are in the [columns]"

In [11]:


# Assuming 'df' is your DataFrame containing the 'reviews' column

# Example DataFrame
data = {'reviews': ['Great hotel! ✅ Trip Verified', 'Average experience', 'Not Verified | Terrible service']}
df = pd.DataFrame(data)

# Text to remove
remove_text = ['✅ Trip Verified', 'Not Verified |']

# Iterate over each text element to remove
for text in remove_text:
    # Use replace with a regex pattern to remove the text
    df['reviews'] = df['reviews'].str.replace(text, '')

# Print the DataFrame after removing text
print(df)


              reviews
0       Great hotel! 
1  Average experience
2    Terrible service
