## Môi trường code
Các file notebooks chạy với môi trường `min_ds-env`

## 1. Thêm thư viện

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

## 2. Xây dựng hàm xử lý thu thập

-   `extract_rating`: Hàm này duyệt qua từng đánh giá và tìm element HTML chứa thông tin của danh mục được chỉ định. Nếu ô dữ liệu kế bên là dạng xếp hạng sao (có class 'stars'), hàm sẽ đếm số lượng 'span' có class 'fill' và trả về số sao dưới dạng số. Nếu ô đó không phải dạng sao (ví dụ: loại hành khách, tuyến bay), hàm sẽ trả về nội dung văn bản của ô. Nếu không tìm thấy danh mục trong đánh giá, hàm sẽ trả về giá trị NaN.
- `get_data`: Hàm trích xuất thông tin từ cấu trúc HTML và lưu tất cả thông tin vào một dataframe.

In [2]:
def extract_rating(reviews, category):
    """Collect the value of one rating category from each review section.

    Args:
        reviews: Iterable of parsed review-stat elements; each must support
            ``find('td', class_=...)``.
        category: Category suffix appended to ``'review-rating-header '`` to
            form the class string of the header cell to look up.

    Returns:
        A list with one entry per element of ``reviews``:
        - the number of ``span`` elements with class ``fill`` when the cell
          following the header carries the ``stars`` class,
        - the stripped text of that cell when it is not a star cell,
        - ``np.nan`` when the header cell or its sibling cell is missing
          (the review gave no feedback for this category).
    """
    header_class = f'review-rating-header {category}'

    def rating_of(section):
        label_cell = section.find('td', class_=header_class)
        if not label_cell:
            return np.nan

        value_cell = label_cell.find_next_sibling('td')
        if not value_cell:
            return np.nan

        if 'stars' in value_cell.get('class', []):
            # Star ratings are encoded as one filled span per star.
            return len(value_cell.find_all('span', class_='fill'))
        return value_cell.get_text().strip()

    return [rating_of(section) for section in reviews]

def get_data(soup):
    """Turn one parsed reviews page into a DataFrame with one row per review.

    Args:
        soup: Parsed HTML document supporting ``find_all``.

    Returns:
        A ``pandas.DataFrame`` with the publication date, reviewer name,
        review header, review body, verification status, the per-category
        values produced by ``extract_rating``, the overall rating and the
        recommendation flag.

    NOTE(review): every column is collected independently from the page, so
    the lists are only aligned if each review exposes every scraped element;
    pandas rejects columns of differing lengths when building the frame.
    """
    published_dates = [tag['content'] for tag in soup.find_all('meta', itemprop='datePublished')]

    # The first ratingValue on the page is skipped; presumably it is the
    # page-level aggregate score rather than a review's own rating —
    # TODO confirm against the page markup.
    overall_ratings = [tag.get_text() for tag in soup.find_all('span', itemprop='ratingValue')][1:]

    titles = [tag.get_text() for tag in soup.find_all('h2', class_='text_header')]
    authors = [tag.get_text() for tag in soup.find_all('span', itemprop='name')]

    bodies = []
    statuses = []
    for body_tag in soup.find_all('div', itemprop='reviewBody'):
        raw_text = body_tag.get_text()

        # 'Trip Verified' takes precedence over 'Not Verified'; everything up
        # to and including the first occurrence of the marker is discarded.
        for marker in ('Trip Verified', 'Not Verified'):
            if marker in raw_text:
                status = marker
                remainder = raw_text.partition(marker)[2]
                break
        else:
            status = 'Unknown'
            remainder = raw_text

        # Drop anything up to the first '|' separator, when there is one.
        _, pipe, after_pipe = remainder.partition('|')
        bodies.append((after_pipe if pipe else remainder).strip())
        statuses.append(status)

    stats_blocks = soup.find_all('div', class_='review-stats')  # per-review rating tables

    # (output column, category class suffix), in output column order.
    stat_columns = (
        ('Type_of_traveller', 'type_of_traveller'),
        ('Seat_type', 'cabin_flown'),
        ('Route', 'route'),
        ('Date_flown', 'date_flown'),
        ('Aircraft', 'aircraft'),
        ('Seat_comfort', 'seat_comfort'),
        ('Cabin_staff_service', 'cabin_staff_service'),
        ('Food_and_beverages', 'food_and_beverages'),
        ('Inflight_entertainments', 'inflight_entertainment'),
        ('Ground_service', 'ground_service'),
        ('Value_for_money', 'value_for_money'),
        ('Wifi_and_connectivity', 'wifi_and_connectivity'),
    )

    data = {
        'Date published': published_dates,
        'Name': authors,
        'Review_header': titles,
        'Review_body': bodies,
        'Verified_review': statuses,
    }
    for column, category in stat_columns:
        data[column] = extract_rating(stats_blocks, category)
    data['Overall_rating'] = overall_ratings
    data['Recommended'] = extract_rating(stats_blocks, 'recommended')

    return pd.DataFrame(data)

## 3. Thu thập dữ liệu

-   Nhóm sẽ thu thập dữ liệu từ [trang web](https://www.airlinequality.com/airline-reviews/british-airways/page/1/?sortby=post_date%3ADesc&pagesize=100). Đây là một trang tổng hợp đánh giá về hãng hàng không British Airways trên trang Airline Quality.
-   Mỗi url chứa 100 feedback của khách hàng, thay đổi giá trị page_number trong url để lấy dữ liệu ở các trang tiếp theo.

In [3]:
base_url = 'https://www.airlinequality.com/airline-reviews/british-airways/page/'
url_parameters = '/?sortby=post_date%3ADesc&pagesize=100'

# Pages 1..30 of the review listing, 100 reviews requested per page.
urls = [f"{base_url}{page_number}{url_parameters}" for page_number in range(1, 31)]

# Per-page frames are gathered here and concatenated once after the loop,
# instead of re-copying the accumulated rows on every iteration.
page_frames = []
for url in urls:
    # Best-effort scraping: a failure on one page is reported and the loop
    # moves on to the next page.
    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, timeout=30)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            temp_df = get_data(soup)
            page_frames.append(temp_df)
        else:
            print(f"Failed to fetch data from {url} - Status code: {response.status_code}")
    except Exception as e:
        print(f"An error occurred while processing {url}: {e}")

# pd.concat raises on an empty list, so fall back to an empty frame when no
# page could be scraped.
main_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()

main_df.head(10)

Unnamed: 0,Date published,Name,Review_header,Review_body,Verified_review,Type_of_traveller,Seat_type,Route,Date_flown,Aircraft,Seat_comfort,Cabin_staff_service,Food_and_beverages,Inflight_entertainments,Ground_service,Value_for_money,Wifi_and_connectivity,Overall_rating,Recommended
0,2023-12-21,T Maddern,“I couldn’t fault them”,This was our first flight with British Airways...,Trip Verified,Family Leisure,Business Class,London Heathrow to Stockholm,December 2023,A329,4.0,5.0,5.0,,5.0,5,,10,yes
1,2023-12-21,K Pearson,“incompetence in customer service”,I recently encountered a highly disappointing ...,Trip Verified,Solo Leisure,Economy Class,Indianapolis to Mumbai via London Heathrow,December 2023,,1.0,1.0,,,1.0,1,,1,no
2,2023-12-20,C Shaw,“The most ridiculous thing I've heard”,Beware! BA don't provide any refund due to a v...,Not Verified,Family Leisure,Premium Economy,Cyprus to New York,December 2023,,,,,,1.0,1,,1,no
3,2023-12-20,C Horner,"""one of the most disgusting onboard meals""",Check in was chaotic and badly organised. Even...,Trip Verified,Solo Leisure,Business Class,Gran Canaria to London Gatwick,December 2023,A321,3.0,4.0,1.0,,3.0,5,,6,yes
4,2023-12-18,C Lowe,“Does BA pay experts to design these breakfasts”,All 4 of our flights were fine in terms of com...,Not Verified,Couple Leisure,Business Class,Belfast City to Atlanta via London Heathrow,November 2023,A320/Boeing 777,4.0,5.0,1.0,3.0,5.0,5,,6,yes
5,2023-12-17,Greg Zarelli,"""business class seats were abysmal""",I had hoped this would be a good review but un...,Not Verified,Business,Business Class,London to Portland,December 2023,Boeing 787-8,3.0,3.0,4.0,3.0,3.0,2,,4,no
6,2023-12-15,D Kears,"""Worst service/airline ever""",I was going to fly to Frankfurt from Glasgow o...,Trip Verified,Solo Leisure,Economy Class,London to Frankfurt,June 2023,,1.0,,,,1.0,1,,1,no
7,2023-12-14,B Jackson,“Much better experience this time around”,Much better experience this time around. Flew ...,Trip Verified,Solo Leisure,Business Class,Cape Town to London Heathrow,December 2023,A350,5.0,4.0,5.0,5.0,5.0,4,1.0,8,yes
8,2023-12-12,H Miller,"“our compensation rights""",Once again a terrible business class experienc...,Trip Verified,Couple Leisure,Business Class,Miami to Manchester via Heathrow,December 2023,,1.0,3.0,2.0,3.0,1.0,2,1.0,3,no
9,2023-12-12,Allan Gittens,"""A380's are showing their age""","BA A380's are showing their age, hopefully the...",Trip Verified,Solo Leisure,Economy Class,Johannesburg to London,December 2023,A380,2.0,4.0,2.0,2.0,4.0,3,,4,no


## 4. Lưu dữ liệu 

In [4]:
# Export the collected reviews as a CSV file and as an Excel workbook;
# index=False leaves the DataFrame's row index out of both files.
export_options = {'index': False}
main_df.to_csv('../data/customer_feedback.csv', **export_options)
main_df.to_excel('../data/customer_feedback.xlsx', **export_options)