In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

from retrying import retry
import time
import traceback

In [4]:
def fetch_webpage(url, timeout=30):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` may wait indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise ``None``. A message describing the failure is printed in
        that case.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Connection errors and timeouts are reported the same way as a bad
        # status code, so callers only ever have to check for None.
        print(f"Failed to fetch webpage. Error: {error}")
        return None

    if response.status_code == 200:
        return response.text

    print(f"Failed to fetch webpage. Status code: {response.status_code}")
    return None

def parse_html(html_content):
    """Parse an HTML string with Python's built-in ``html.parser`` backend.

    Args:
        html_content: HTML markup as text.

    Returns:
        The ``BeautifulSoup`` object built from ``html_content``.
    """
    soup = BeautifulSoup(html_content, features='html.parser')
    return soup

In [6]:
# Number of listing pages to request; each URL asks for 100 reviews per page,
# sorted by post date (newest first).
MAX_PAGES = 23
list_url = list(map(
    'https://www.airlinequality.com/airline-reviews/ryanair/page/{}/?sortby=post_date%3ADesc&pagesize=100'.format,
    range(1, MAX_PAGES + 1),
))

In [7]:
# Display the generated page URLs.
list_url

['https://www.airlinequality.com/airline-reviews/ryanair/page/1/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/2/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/3/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/4/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/5/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/6/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/7/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/8/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/9/?sortby=post_date%3ADesc&pagesize=100',
 'https://www.airlinequality.com/airline-reviews/ryanair/page/10

In [8]:
# Initialize an empty DataFrame to store the comments data.
# Every column name needs its own comma: adjacent string literals are
# concatenated by Python, which would silently merge two names into one.
# NOTE: this frame is only a placeholder; `comments_data` is rebuilt from
# `comments_data_list` once scraping has finished.
comments_data = pd.DataFrame(columns=['Date Published', 'Overall Rating', 'Passenger Country', 'Trip_verified', 'Comment title', 'Comment',
                                       'Aircraft', 'Type Of Traveller', 'Seat Type', 'Origin', 'Destination', 'Date Flown',
                                       'Seat Comfort', 'Cabin Staff Service', 'Food & Beverages', 'Ground Service',
                                       'Value For Money', 'Recommended'])
# One dictionary per scraped review is appended here.
comments_data_list = []

# Maps the second CSS class of a ratings-table header cell to the column
# label used for that value in the scraped data.
class_to_label = dict([
    ('aircraft', 'Aircraft'),
    ('type_of_traveller', 'Type Of Traveller'),
    ('cabin_flown', 'Seat Type'),
    ('route', 'Route'),
    ('date_flown', 'Date Flown'),
    ('seat_comfort', 'Seat Comfort'),
    ('cabin_staff_service', 'Cabin Staff Service'),
    ('food_and_beverages', 'Food & Beverages'),
    ('inflight_entertainment', 'Inflight Entertainment'),
    ('ground_service', 'Ground Service'),
    ('wifi_and_connectivity', 'Wifi & Connectivity'),
    ('value_for_money', 'Value For Money'),
    ('recommended', 'Recommended'),
])

In [9]:
def _split_route(value):
    """Split a route string into an ``(origin, destination)`` pair.

    Two layouts are recognised: ``"<origin> to <destination>"`` and a
    dash-separated form whose first two parts are origin and destination.
    ``(None, None)`` is returned when neither separator is present, so the
    caller never reuses values left over from a previously parsed review.
    """
    if ' to ' in value:
        # Split on the first " to " only; whatever follows stays with the
        # destination instead of breaking the two-way unpacking.
        origin, _, destination = value.partition(' to ')
        return origin.strip(), destination.strip()

    parts = value.split('-')
    if len(parts) >= 2:
        return parts[0].strip(), parts[1].strip()

    return None, None


def _parse_ratings(comment):
    """Collect the labelled cells of a review's ratings table into a dict.

    Text cells are stored as stripped strings, star cells as the number of
    filled stars, and the route cell is split into 'Origin'/'Destination'.
    """
    ratings_table = comment.find('table', class_='review-ratings')
    # Prefer the rows of the ratings table; fall back to every row of the
    # review when that table is not found.
    scope = ratings_table if ratings_table is not None else comment

    table_data = {}
    for row in scope.find_all('tr'):
        header_cell = row.find('td', class_='review-rating-header')
        value_cell = row.find('td', class_='review-value')
        stars_cell = row.find('td', class_='review-rating-stars')

        # A usable row has a header plus either a text value or a star value.
        if header_cell is None or (value_cell is None and stars_cell is None):
            continue

        # The second CSS class of the header cell identifies the field.
        class_name = header_cell['class'][1]
        # Classes missing from class_to_label end up under the '' key.
        data_label = class_to_label.get(class_name, '')

        if value_cell is not None:
            value = value_cell.text.strip()
            if data_label == 'Route':
                origin, destination = _split_route(value)
                if origin is not None:
                    table_data['Origin'] = origin
                    table_data['Destination'] = destination
            else:
                table_data[data_label] = value
        else:
            filled_star_spans = stars_cell.find_all('span', class_='star fill')
            table_data[data_label] = len(filled_star_spans)

    return table_data


def _parse_review(comment):
    """Turn one review ``<article>`` element into a flat dictionary.

    Raises whatever lookup error occurs when a mandatory element (publication
    date, title, sub-header or review body) is missing from the markup.
    """
    date_published = comment.find('meta', itemprop='datePublished')['content']

    rating_tag = comment.find('span', itemprop='ratingValue')
    rating = rating_tag.text if rating_tag is not None else ''

    text_header = comment.find('h2', class_='text_header').text

    sub_header = comment.find('h3', class_='text_sub_header userStatusWrapper').get_text(strip=True)
    # Country is the text between the last '(' and the ')' that follows it.
    country = sub_header.split('(')[-1].split(')')[0]

    body = comment.find('div', class_='text_content', itemprop='reviewBody')

    # The first <strong> element holds 'Not Verified' or 'Trip Verified'.
    verification_tag = body.find('strong')
    verification = verification_tag.text.strip() if verification_tag is not None else ''

    text_content = body.text.strip()
    if '|' in text_content:
        # Drop everything up to the first '|'; maxsplit=1 keeps any later
        # '|' characters that belong to the review text itself.
        text_content = text_content.split('|', 1)[1].strip()

    return {'Date Published': date_published, 'Overall Rating': rating,
            'Passenger Country': country, 'Trip_verified': verification,
            'Comment title': text_header, 'Comment': text_content,
            **_parse_ratings(comment)}


# Scrape every listing page and append one dictionary per review to
# comments_data_list. Re-running this cell appends the same reviews again, so
# re-run the cell that initialises comments_data_list first.
for url in list_url:
    html_content = fetch_webpage(url)
    if not html_content:
        # The fetch failed and has already been reported; skip this page.
        continue

    soup = parse_html(html_content)

    # Every review on the page is an <article itemprop="review"> element.
    comments = soup.find_all('article', itemprop='review')

    for position, comment in enumerate(comments):
        try:
            comments_data_list.append(_parse_review(comment))
        except Exception:
            # Best effort: report the failing review and carry on with the
            # rest. The page number is read from the URL's /page/<n>/ part.
            page_label = url.split('/page/')[-1].split('/')[0]
            print(f'Error en el comentario: {page_label} -> {position}')
            traceback.print_exc()

In [10]:
# Build the final DataFrame in a single step from the per-review dictionaries.
comments_data = pd.DataFrame(data=comments_data_list)

In [11]:
# Write the scraped reviews to a UTF-8 encoded CSV file.
comments_data.to_csv(path_or_buf='ryanair_reviews.csv', encoding='utf-8')

In [12]:
# Summarise the columns, non-null counts and dtypes of the scraped reviews.
comments_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2254 entries, 0 to 2253
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Date Published          2254 non-null   object 
 1   Overall Rating          2254 non-null   object 
 2   Passenger Country       2254 non-null   object 
 3   Trip_verified           2254 non-null   object 
 4   Comment title           2254 non-null   object 
 5   Comment                 2254 non-null   object 
 6   Type Of Traveller       1640 non-null   object 
 7   Seat Type               2254 non-null   object 
 8   Origin                  1639 non-null   object 
 9   Destination             1639 non-null   object 
 10  Date Flown              1636 non-null   object 
 11  Seat Comfort            2142 non-null   float64
 12  Cabin Staff Service     2133 non-null   float64
 13  Ground Service          1583 non-null   float64
 14  Value For Money         2253 non-null   