In [276]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np

# Webscraping passenger seats per aircraft manufacturer and model

[Source](https://blog.thetravelinsider.info/airplane-types)

In [2]:
#!pip install html5lib

In [3]:
# Source page listing aircraft types together with their passenger capacities
url = "https://blog.thetravelinsider.info/airplane-types"

# Fetch the page. The timeout keeps a stalled connection from hanging the
# notebook, and raise_for_status() stops on an HTTP error instead of parsing
# an error page that would only fail later when the table is looked up.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse: the html5lib parser has to be used to correctly parse the table
soup = BeautifulSoup(response.text, "html5lib")

In [9]:
# find table
# find_all returns a list of matches (not a single tag), which is why the
# following cells access the table as table[0].
table = soup.find_all("table", id="tablepress-17")

In [22]:
# Collect every <tr> of the table as a plain list.
# `table` is a list of matches, so its first element is the actual table tag.
rows = list(table[0].find_all("tr"))

In [80]:
# Row numbers of the separator rows ("RETURN TO TOP") between two manufacturers.
# A separator is a cell spanning 8 columns that contains a link to "#Top"; its
# row number is the last "-"-separated piece of the first CSS class of the
# cell's parent element. The numbers are kept as strings here.
idxs = []
for cell in table[0].find_all("td", {"colspan": "8"}):
    if cell.find_all("a", href="#Top"):
        first_class = cell.parent.attrs["class"][0]
        idxs.append(first_class.split("-")[-1])

In [634]:
# Split the complete table into one list of rows per manufacturer.
# Each sub-table ends just before its separator row; the first one starts at
# row 1 and every following one starts at the previous separator's row number.
stops = [int(idx) for idx in idxs]
starts = [1] + stops[:-1]
man_tables = [rows[start:stop - 1] for start, stop in zip(starts, stops)]

In [625]:
# List of manufacturer names: the text of each sub-table's first row up to the
# first "(", stripped of surrounding whitespace.
# The pattern is a raw string so that "\(" is not an invalid string escape.
man_names = [re.findall(r"^[^\(]+", man_rows[0].text)[0].strip() for man_rows in man_tables]

In [891]:
# Patterns used by seats_p_model, compiled once. They are written as raw strings
# so the backslashes are not treated as (invalid) string escapes.
_MANUFACTURER_RE = re.compile(r"^[^\(]+")
_MODEL_RE = re.compile(r"^\w+\s?\d{2,3}-?\w+[\/]?\w+|^\w{1,8}[\s-]?\w+[\-\w]*")
# Numbers in the seating text that are not somehow related to an aircraft model
_SEATS_RE = re.compile(r"(?<![\s][-(\/\w])\d{3}(?![\n\)])|(?<!\/)^\d{2,3}")
# Model suffix mentioned inside the seating text (e.g. the "200" of " -200")
_MODEL_HINT_RE = re.compile(r"(?<=\s-)\d{2,3}")


def _rounded_mean(values):
    """Return the mean of ``values`` rounded to the nearest integer."""
    return np.round(np.mean(values), 0).astype(int)


def _seats_per_model_in_row(row):
    """Map every model named in one table row to an average seat count.

    Column 1 of the row holds the model names (one per text fragment) and
    column 4 the seating text. Every model first receives the mean of all seat
    numbers found in the row, or -1 when the row contains none. If each seating
    line names a model suffix, a model whose last three characters occur in
    that suffix gets the mean of that specific line instead.
    """
    columns = row.find_all("td")
    models = [",".join(_MODEL_RE.findall(line.strip())).strip() for line in columns[0].strings]
    pax_lines = list(columns[3].strings)

    seats_per_line = [list(map(int, _SEATS_RE.findall(text))) for text in pax_lines]
    all_seats = [seats for line_seats in seats_per_line for seats in line_seats]
    row_mean = _rounded_mean(all_seats) if all_seats else -1
    seats_by_model = {model: row_mean for model in models}

    # The per-line mapping is only used when *every* seating line names a
    # model; a single line without one (IndexError on [0]) disables it.
    try:
        hints = [_MODEL_HINT_RE.findall(text)[0] for text in pax_lines]
    except IndexError:
        hints = []

    for model in models:
        for line_no, hint in enumerate(hints):
            # Skip lines without seat numbers: the mean of an empty list is
            # NaN, which would turn into a meaningless integer when cast.
            if model[-3:] in hint and seats_per_line[line_no]:
                seats_by_model[model] = _rounded_mean(seats_per_line[line_no])

    return seats_by_model


def seats_p_model(man_tables):
    """Build a table with the average number of seats per aircraft model.

    Parameters
    ----------
    man_tables : list of list
        One list of table rows per manufacturer. Every row has to provide
        ``.text`` and ``find_all("td")``; the cells returned by the latter have
        to provide ``.strings``. The text of the first row of each list starts
        with the manufacturer name (everything before the first "("), and the
        first two rows of each list are not treated as data rows.

    Returns
    -------
    pandas.DataFrame
        Indexed by model name, with the columns ``avg_no_seats`` and
        ``manufacturer``. Labels listed in ``drop_idx`` and models left without
        a seat value (-1) are removed, a few values are set manually, and the
        Airbus / Embraer model names are normalised.

    Raises
    ------
    KeyError
        If the scraped rows contain no model labelled "A 320", whose value is
        copied to "A 320neo".
    """
    man_names = [_MANUFACTURER_RE.findall(man_rows[0].text)[0].strip() for man_rows in man_tables]

    dfs = []
    for name, man_rows in zip(man_names, man_tables):
        # Starting from row 2: one {model: seats} dict per table row
        models_final = [_seats_per_model_in_row(row) for row in man_rows[2:]]

        # One column per model and one line per table row; summing per model
        # collapses the NaN gaps. No integer dtype is forced on the frame
        # itself because those gaps cannot be stored in an integer column
        # (newer pandas versions are expected to refuse that cast).
        per_model = pd.DataFrame(models_final).T.sum(axis=1)
        df = per_model.to_frame("avg_no_seats").astype("int16")
        df["manufacturer"] = name
        dfs.append(df)

    # Combine all DataFrames
    df_final = pd.concat(dfs, axis=0)

    # Drop some rows which have not been covered by REGEX
    drop_idx = ["", "Twodecks", "Note", "fourpage", "four page", "formerly MD95", "new engines","Two decks", "To be", "all 747s", "to as", "AirForce One", "freight and", "SP", "Performance", "first version", "vertical wingtips", "renamed Avro", "repeated crashes", "now renamed", "now the", "generic series", "first was", "also known"]
    # .copy() so the assignments below write to an independent frame
    df_final = df_final[~df_final.index.isin(drop_idx)].copy()

    # Set some seat values manually which have not been covered by REGEX or where no values exist
    df_final.loc["A 320neo", "avg_no_seats"] = df_final.loc["A 320", "avg_no_seats"]
    df_final.loc["Q-300", "avg_no_seats"] = 54
    df_final.loc[df_final.index.str.startswith("IL-114"), "avg_no_seats"] = 64
    df_final.loc[df_final.index.str.startswith("CRJ70"), "avg_no_seats"] = 75

    # Remove models that still carry the "no seat value" marker
    df_final = df_final[df_final.avg_no_seats != -1].copy()

    # Remove whitespace for Airbus models
    airbus_labels = df_final.loc[df_final.manufacturer == "Airbus"].index
    df_final = df_final.rename(index={old: old.replace(" ", "") for old in airbus_labels})

    # Fix naming of Embraer models: prefix labels starting with "1" with "ERJ-" ...
    numbered_labels = df_final.loc[
        (df_final.manufacturer == "Embraer") & (df_final.index.str.startswith("1"))
    ].index
    df_final = df_final.rename(index={old: "ERJ-" + str(old) for old in numbered_labels})

    # ... and turn the blanks of "ERJ ..." labels into dashes
    spaced_labels = df_final.loc[
        (df_final.manufacturer == "Embraer") & (df_final.index.str.startswith("ERJ "))
    ].index
    df_final = df_final.rename(index={old: old.replace(" ", "-") for old in spaced_labels})

    return df_final




In [892]:
# Build the seats-per-model table from the per-manufacturer row lists
df_final = seats_p_model(man_tables)

In [895]:
# Show the resulting table (model name as index, seats and manufacturer as columns)
df_final

Unnamed: 0,avg_no_seats,manufacturer
ATR 42,46,Aerospatiale & Avions de Transport Régional
ATR 42-500,46,Aerospatiale & Avions de Transport Régional
ATR 72,66,Aerospatiale & Avions de Transport Régional
A220,132,Airbus
A220-100,120,Airbus
A220-300,144,Airbus
A300,281,Airbus
A300-B2/B4,281,Airbus
A300-600,281,Airbus
A310,212,Airbus


In [896]:
# Persist the scraped table. The model names form the index and are written as
# the first CSV column without a header name.
df_final.to_csv("../raw_data/seats_p_aircraft.csv")

### Prepare for matching with pflights data

In [29]:
# Load seat data (webscraped); the unnamed first column holds the model names
seat_data = pd.read_csv("../raw_data/seats_p_aircraft.csv").rename(columns={"Unnamed: 0": "model_no"})

# Use regex on seat_data for matching: keep only the leading part of each model
# name. Patterns are raw strings so the backslashes are not invalid string escapes.
seat_data["model_no_stripped"] = seat_data.model_no.str.findall(r"\w{1,4}[-\s][\w]?|^\w{4}?|^\w{3,4}\Z").transform("".join)

# The neo variants keep their full name
seat_data.loc[seat_data.model_no.str.contains("A-320neo|A320neo", na=False), "model_no_stripped"] = "A320neo"
seat_data.loc[seat_data.model_no.str.contains("A-321neo|A321neo", na=False), "model_no_stripped"] = "A321neo"

# ERJ models are matched on their complete model name
is_erj = seat_data.model_no.str.contains("ERJ", na=False)
seat_data.loc[is_erj, "model_no_stripped"] = seat_data.loc[is_erj, "model_no"]

# CRJ / Q models are reduced to their type code. Rows are selected
# case-insensitively, so the extraction ignores case as well; otherwise a
# selected row with different casing would end up with an empty string.
crj_q_pattern = r"CRJ\d{3}|Q-\d+"
indexer = seat_data[seat_data.model_no.str.contains(crj_q_pattern, na=False, regex=True, case=False)].index
seat_data.loc[indexer, "model_no_stripped"] = (
    seat_data.loc[indexer, "model_no"].str.findall(crj_q_pattern, flags=re.IGNORECASE).transform("".join)
)


In [30]:
# Two Airbus models that are missing from the list, inserted at positions 22 and 23
missing_airbus = pd.DataFrame(
    [
        {"model_no": "A330-9", "avg_no_seats": 287, "manufacturer": "Airbus", "model_no_stripped": "A330-9"},
        {"model_no": "A330-8", "avg_no_seats": 257, "manufacturer": "Airbus", "model_no_stripped": "A330-8"},
    ]
)
rows_before, rows_after = seat_data.iloc[:22], seat_data.iloc[22:]
seat_data = pd.concat([rows_before, missing_airbus, rows_after], ignore_index=True)

In [31]:
# Append the abbreviation "ATR" to the manufacturer name
# "Aerospatiale & Avions de Transport Régional"
is_atr = seat_data.manufacturer.str.contains("Aerospatiale & Avions de Transport Régional", na=False)
seat_data.loc[is_atr, "manufacturer"] = "Aerospatiale & Avions de Transport Régional/ATR"

In [32]:
# One Saab model that is missing from the list, inserted at position 208
saab_340b = pd.DataFrame(
    [{"model_no": "340B", "avg_no_seats": 30, "manufacturer": "Saab", "model_no_stripped": "340 B"}]
)
rows_before, rows_after = seat_data.iloc[:208], seat_data.iloc[208:]
seat_data = pd.concat([rows_before, saab_340b, rows_after], ignore_index=True)

In [37]:
# Inspect the Embraer rows of the prepared table
seat_data[seat_data.manufacturer=="Embraer"]

Unnamed: 0,model_no,avg_no_seats,manufacturer,model_no_stripped
136,ERJ-120 Brasilia,30,Embraer,ERJ-120 Brasilia
137,ERJ-135,37,Embraer,ERJ-135
138,ERJ-145,50,Embraer,ERJ-145
139,ERJ-170,66,Embraer,ERJ-170
140,ERJ-175,78,Embraer,ERJ-175
141,ERJ-190,102,Embraer,ERJ-190
142,ERJ-195,111,Embraer,ERJ-195


In [None]:
#Change Order of abbreviation to manufacturer name "Aerospatiale & Avions de Transport Régional"
#seat_data.loc[seat_data.manufacturer.str.contains("Aerospatiale & Avions de Transport Régional", na=False),"manufacturer"]="Aerospatiale & Avions de Transport Régional/ATR"

In [35]:
# Save the prepared table; with the default to_csv settings the row index is
# written as the first column as well.
seat_data.to_csv("../raw_data/seats_p_aircraft_final.csv")