# African Airlines Sentiment Analysis (2015-Present)
**Data Extraction** | Extracting Customer Reviews from 2015 on the Top African Airlines as of 2019)

Compiling results from [Routes Online](https://www.routesonline.com/news/29/breaking-news/287576/these-are-the-top-ten-largest-african-carriers-/) and [Skytrax](https://www.worldairlineawards.com/best-airlines-2019-by-region/), these are the top African airlines in no particular order:
- Ethiopian Airlines
- South African Airways
- Air Mauritius
- EgyptAir
- Royal Air Maroc
- Air Algerie
- Comair
- Kenya Airways
- Tunisair
- Fastjet
- Air Seychelles
- FlySafair

### 1. Import Relevant Packages

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from datetime import datetime as dt
import time
from glob import glob

### 2. Scrape HTML content from page


In [10]:
def reviews_scraper(webpage):
    
    #Get HTML content from page
    URL = webpage
    page = requests.get(URL)
    
    #Create html parser object with beautiful soup
    soup = BeautifulSoup(page.content, "html.parser")
    
    
    #Get all the reviews on the page
    all_reviews = soup.findAll('article')
    reviews = all_reviews[0].findAll('article')

    #List to store the data of each review
    reviews_list = []
    
    #Iterate through all the reviews
    for review in reviews:
    
        #Dictionary to store the data for each review
        review_data = {}

        """
        Extracting the review date
        """

        #Get the date the review was published
        date_published = dt.strptime(review.find('time',{'itemprop': 'datePublished'})['datetime'], '%Y-%M-%d')
        
        #If the review is published before 2015, terminate scraping
        if date_published < dt.strptime('2015','%Y'):
            
            #Return True (indicating scraping should be terminated) and the current df
            return True, pd.DataFrame(reviews_list)
        
        else:
            review_data['date_published'] = date_published

        """
        Extracting the review content and verification status
        """

        #Get the review content
        review_content = review.find('div',{'class': 'text_content'}).text.split('|')

        if len(review_content) > 1: #i.e. there is a verification tag
            
            #Indicate if the review is verified or not, and note in dictionary
            review_data['verified'] = False if 'Not' in review_content[0] else True
            
            #Get the actual review text, removing the trip verification tag
            review_data['review_text'] = review_content[1].strip()
            
        else:
            
            #No verification tag, get review text
            review_data['verified'] = np.nan
            
            #Get the actual review text, removing the trip verification tag
            review_data['review_text'] = review_content[0].strip()


        """
        Extracting the review ratings
        """

        #Get the ratings table 
        ratings_table = review.find('table',{'class': 'review-ratings'})

        #Get all the attributes being rated on
        ratings = ratings_table.find_all('tr')

        #For all the attributes
        for rating in ratings:

            #Get the attribute name
            rating_name = rating.find('td').text.lower().replace(' ','_')

            #Get the characteristic or the rating where relevant
            try:
                review_data[rating_name.lower()] = rating.find('td',{'class':'review-value'}).text
            except:
                review_data[rating_name.lower()] = len(rating.find_all('span',{'class': 'star fill'}))
        
        #Store the record in a list – this list will ultimately be transformed to a dataframe
        reviews_list.append(review_data)
        
    return False, pd.DataFrame(reviews_list)     
    

In [11]:
def multi_page_scraper(airline):
    
    
    """
    Function to scrape review data from multiple pages
    
    Inputs:
        - airline (int): Indicator for the airline to scrape data on
        - num_pages (int): The number of pages to scrape from, where each page yields 100 reviews (if available)
        
    Output:
        - merged_df (pd DataFrame): A DataFrame of the reviews from all the pages
    """
    
    #Empty dataframe to store the page reviews
    merged_df = pd.DataFrame()
    
    """
    Multi-page scraping:
    """
    
    #Variable to track if the page iterations should be terminated
    terminate = False
    
    #Start search from page 1
    pg = 1
    
    #Iterate through pages until a review from later than 2015 is encountered
    while not terminate:
        
        #Try to scrape the page
        try:
            
            webpage = f'https://www.airlinequality.com/airline-reviews/{airline}/page/{str(pg)}/?sortby=post_date%3ADesc&pagesize=100'
            
            terminate, df = reviews_scraper(webpage)

            #Note the airline
            df.insert(0,'Subject',airline)
        
            #Add the scraped reviews to the merged_df
            merged_df = merged_df.append(df)
            
            #Go to the next page
            pg += 1
        
        #If any error (most likely page does not exist i.e. reviews finished), terminate scraping for the airline
        except:
            
            terminate = True
        
    #Reformat airline name
    airline_name = airline.replace('-','_')
        
    #Returns the airline name and the dataframe
    return airline_name, merged_df

### 3. Get and write scraped data to CSV file

In [14]:
airlines = ['ethiopian-airlines','south-african-airways','air-mauritius',
            'egyptair','royal-air-maroc','air-algerie','comair','kenya-airways',
            'tunisair','fastjet','air-seychelles','flysafair']

for airline in airlines:
    airline_name, df = multi_page_scraper(airline)
    time.sleep(0.5)

    #Write dataframe to csv
    df.to_csv(f'../data/raw/{airline_name}.csv')

### A brief showcase of what the dataframe looks like:

In [52]:
df.head()

Unnamed: 0,Subject,date_published,verified,review_text,type_of_traveller,seat_type,route,date_flown,seat_comfort,cabin_staff_service,ground_service,value_for_money,recommended,aircraft,food_&_beverages,inflight_entertainment,wifi_&_connectivity
0,flysafair,2021-01-27 00:09:00,True,Terrible.....they state they are the most on t...,Family Leisure,Economy Class,Johannesburg to Durban,September 2021,1.0,1.0,3.0,1.0,no,,,,
1,flysafair,2021-01-27 00:09:00,False,Cape Town to Lanseria and Lanseria to Cape Tow...,Couple Leisure,Economy Class,Cape Town to Lanseria,September 2021,4.0,5.0,5.0,4.0,yes,Boeing 737-800,,,
2,flysafair,2021-01-23 00:09:00,False,My parents booked a flight for the 22nd of Sep...,Couple Leisure,Economy Class,Johannesburg to George,September 2021,,,1.0,1.0,no,,,,
3,flysafair,2021-01-17 00:09:00,True,Everything went smoothly. I didn’t expect much...,Business,Economy Class,East London to Johannesburg,September 2021,4.0,5.0,5.0,5.0,yes,Boeing 737,,,
4,flysafair,2021-01-14 00:08:00,True,We were already on the aircraft and it had sta...,Family Leisure,Economy Class,Cape Town to Johannesburg,August 2021,2.0,1.0,1.0,1.0,no,,,,


### 4. Merge the dataframe of all the airlines together

In [8]:
filepath = r'../data/raw' #File path with all data for the respective airlines
all_files = glob(filepath + "/*.csv") #Reference all the files

#List to store the files as dataframes
files_list = []

#Iterating through all the files
for filename in all_files:
    
    #Read the csv and append to the files_list
    df = pd.read_csv(filename,index_col=0)
    files_list.append(df)

#Form a merged dataframe with data on all airlines
airlines_df = pd.concat(files_list, axis=0, ignore_index=True)

In [9]:
airlines_df.to_csv(f'../data/raw/african_airlines.csv')

In [10]:
airlines_df.shape

(1256, 17)