In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
from requests import get,codes
from bs4 import BeautifulSoup
import time
import json
import os
import pickle
import lzma

In [2]:
# Load the Wikipedia page, preferring a compressed local cache over the network.
url = 'https://en.wikipedia.org/wiki/Fuel_economy_in_aircraft'
# NOTE(review): the ".zx" suffix looks like a typo for ".xz"; it is kept as-is so
# that a cache file written by an earlier run is still found.
path = os.path.join(".", "data", "wikipedia", "wiki_page.data.zx")
if os.path.isfile(path):
    # SECURITY: pickle.load can execute arbitrary code. Only load a cache file
    # that this notebook wrote itself, never one obtained from someone else.
    with lzma.open(path, "rb") as wiki_file:
        print("loading wikipedia data from hard drive")
        r = pickle.load(wiki_file)
else:
    print("fetching online wikipedia page")
    # Only the network call is guarded, so that HTTP-status and cache-write
    # problems below surface with their own messages instead of being
    # re-labelled as connectivity failures.
    try:
        r = get(url, timeout=30)  # seconds; without a timeout the cell can hang forever
    except Exception as err:
        raise Exception("Cannot access the internet") from err
    if r.status_code != codes.ok:
        raise Exception("Cannot find local version of wikipedia data and website not responding with 200.")
    # The cache directory does not exist on a fresh checkout.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Write to a temporary file first so an interrupted dump never leaves a
    # truncated cache behind that would break the next run.
    tmp_path = path + ".tmp"
    with lzma.open(tmp_path, "wb") as wiki_file:
        pickle.dump(r, wiki_file)
    os.replace(tmp_path, path)
soup = BeautifulSoup(r.text, 'html.parser')

loading wikipedia data from hard drive


In [3]:
# Turn every <table> of the page into a {header text: [cell texts]} dict.
# Cells are matched to headers by their position within the row, and newlines
# are removed from both header and cell text.
dataframes = []
for wiki_table in soup.find_all("table"):
    column_names = [th.text.replace("\n", "") for th in wiki_table.find_all("th")]
    columns = {name: [] for name in column_names}
    for row in wiki_table.find_all("tr"):
        cells = row.find_all('td')
        for position, cell in enumerate(cells):
            columns[column_names[position]].append(cell.text.replace("\n", ""))
    dataframes.append(columns)

# Drop the last table: we don't want the private jets.
dataframes = dataframes[:-1]
print(f"There are {len(dataframes)} dataframes")

There are 5 dataframes


In [4]:
# One DataFrame per scraped table; the trailing comment on each line gives the
# sector length(s) of that category.
# commuter and short_haul get the missing "Sector" column added as a constant
# string, and regional / short_haul have their columns renamed so that all five
# frames share the same column names.
commuter = pd.DataFrame.from_dict(dataframes[0]).assign(Sector="560")  # 560km
regional = pd.DataFrame.from_dict(dataframes[1]).rename(  # 926km - 1267km
    columns={"Fuel efficiency per seat": "Fuel per seat"}
)
short_haul = (  # 1900km
    pd.DataFrame.from_dict(dataframes[2])
    .assign(Sector="1900")
    .rename(columns={"Fuel efficiency per seat": "Fuel per seat", "Fuel Burn": "Fuel burn"})
)
medium_haul = pd.DataFrame.from_dict(dataframes[3])  # 3240km - 6300km
long_haul = pd.DataFrame.from_dict(dataframes[4])  # 8610km - 13330km

In [5]:
def reorder_df(df):
    """Return ``df`` restricted to the six shared columns, with "Sector" last.

    Any other column is dropped; a missing column raises ``KeyError``.
    """
    column_order = [
        "Model",
        "First flight",
        "Seats",
        "Fuel burn",
        "Fuel per seat",
        "Sector",
    ]
    return df[column_order]

In [6]:
# commuter and short_haul are left alone: their "Sector" column was appended
# last, so it already sits at the end.
regional, medium_haul, long_haul = (
    reorder_df(frame) for frame in (regional, medium_haul, long_haul)
)

# CO2 emissions

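For every 1 kg of fuel burned, 3.304 kg of CO2 is emitted. (Note: the commented-out `CO2_ratio` in the next cell uses 3.086 instead; the two figures should be reconciled.)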

In [25]:
# NOTE(review): disabled constant, so this cell currently does nothing.
# Its value (3.086) also disagrees with the 3.304 kg of CO2 per kg of fuel
# quoted in the markdown just above — confirm which figure is intended before
# re-enabling it.
#CO2_ratio = 3.086

## Keep only useful fields

In [8]:
def clean_sector(x):
    """Convert a raw sector value to a distance as a float.

    Two textual forms are handled:

    * a value with a parenthesised part, e.g. ``"500 nmi (926 km)"``: the
      number between the first ``"("`` and the following ``"k"`` is used
      (presumably the kilometre figure; verify against the scraped tables);
    * a plain value, e.g. ``"560"``, ``"1,900"`` or ``"1,900 km"``.

    In both forms thousands separators (``","``) and anything from the first
    ``"k"`` onwards are stripped before conversion. Values that are not
    strings (already numeric) are passed straight to ``float``.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    # Already numeric (e.g. the frame was cleaned before): nothing to parse.
    if not isinstance(x, str):
        return float(x)
    parts = x.split("(")
    # Without "(" the whole string is the distance; otherwise it is the text
    # that follows the first "(".
    distance = parts[1] if len(parts) > 1 else x
    return float(distance.split("k")[0].replace(",", ""))

In [9]:
def clean_df(df):
    """Return a typed copy of ``df`` limited to the six analysis columns.

    The input frame is not modified. Each cell is converted as follows:

    * "First flight" and "Seats": ``int``;
    * "Fuel burn": ``float`` of the text before the first ``"k"``;
    * "Fuel per seat": ``float`` of the text before the first ``"L"``;
    * "Sector": ``clean_sector``.

    The result renames "Fuel burn", "Fuel per seat" and "Sector" to
    "Fuel burn kg/km", "Fuel per seat L/100km" and "Sector km".
    """
    kept_columns = ["Model", "Seats", "First flight", "Fuel burn", "Fuel per seat", "Sector"]
    cleaned = df[kept_columns].copy()

    # (column, per-cell converter) pairs, applied in this order.
    conversions = (
        ("First flight", int),
        ("Fuel burn", lambda cell: float(cell.split("k")[0])),
        ("Fuel per seat", lambda cell: float(cell.split("L")[0])),
        ("Seats", int),
        ("Sector", clean_sector),
    )
    for column, convert in conversions:
        cleaned[column] = cleaned[column].apply(convert)

    # Disabled: a "CO2 kg/km" column computed as "Fuel burn" * CO2_ratio.

    unit_names = {
        "Fuel burn": "Fuel burn kg/km",
        "Fuel per seat": "Fuel per seat L/100km",
        "Sector": "Sector km",
    }
    return cleaned.rename(columns=unit_names)

In [10]:
# Apply the same cleaning to every category frame.
(
    commuter_clean,
    regional_clean,
    short_haul_clean,
    medium_haul_clean,
    long_haul_clean,
) = (
    clean_df(frame)
    for frame in (commuter, regional, short_haul, medium_haul, long_haul)
)

In [11]:
# Display the cleaned regional table.
regional_clean

Unnamed: 0,Model,Seats,First flight,Fuel burn kg/km,Fuel per seat L/100km,Sector km
0,Airbus A319neo,144,2015,3.37,2.92,1100.0
1,Airbus A319neo,124,2015,2.82,2.82,1220.0
2,Airbus A320neo,154,2015,2.79,2.25,1220.0
3,Airbus A321neo,192,2015,3.3,2.19,1220.0
4,Antonov An-148,89,2004,2.89,4.06,1267.0
5,Antonov An-158,99,2010,3.0,3.79,1267.0
6,Boeing 737-300,126,1984,3.49,3.46,939.0
7,Boeing 737-600,110,1998,3.16,3.59,930.0
8,Boeing 737-700,126,1997,3.21,3.19,930.0
9,Boeing 737 MAX 7,128,2017,2.85,2.77,1220.0


In [12]:
# Column means of the regional table. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
regional_clean.mean(numeric_only=True)

Seats                     110.976190
First flight             2004.833333
Fuel burn kg/km             2.717619
Fuel per seat L/100km       3.351667
Sector km                1070.166667
dtype: float64

In [13]:
# Display the cleaned commuter table.
commuter_clean

Unnamed: 0,Model,Seats,First flight,Fuel burn kg/km,Fuel per seat L/100km,Sector km
0,Antonov An-148 (241 nmi),89,2004,4.23,5.95,560.0
1,Antonov An-158 (241 nmi),99,2010,4.34,5.47,560.0
2,ATR 42-500,48,1995,1.26,3.15,560.0
3,ATR 72-500,70,1997,1.42,2.53,560.0
4,Beechcraft 1900D (226 nm),19,1982,1.0,6.57,560.0
5,Bombardier CRJ100,50,1991,2.21,5.5,560.0
6,Bombardier CRJ200,50,1995,2.18,5.43,560.0
7,Bombardier CRJ700,70,1999,2.95,5.25,560.0
8,Bombardier CRJ900,88,2001,3.47,4.91,560.0
9,Bombardier Dash 8 Q400,78,1998,2.16,3.46,560.0


In [14]:
# Column means of the commuter table. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
commuter_clean.mean(numeric_only=True)

Seats                      54.944444
First flight             1995.222222
Fuel burn kg/km             2.013333
Fuel per seat L/100km       4.683333
Sector km                 560.000000
dtype: float64

In [15]:
# Display the cleaned short-haul table.
short_haul_clean

Unnamed: 0,Model,Seats,First flight,Fuel burn kg/km,Fuel per seat L/100km,Sector km
0,Airbus A319,124,1995,2.93,2.95,1900.0
1,Airbus A319Neo,136,2015,2.4,1.93,1900.0
2,Airbus A320,150,1987,3.13,2.61,1900.0
3,Airbus A321-200,180,1996,3.61,2.5,1900.0
4,Airbus A330-200,293,1997,5.6,2.37,1900.0
5,Antonov An-148 (1190 nmi),89,2004,2.75,3.86,1900.0
6,Antonov An-158 (1190 nmi),99,2010,2.83,3.57,1900.0
7,Boeing 737-600,110,1998,2.77,3.15,1900.0
8,Boeing 737-700,126,1997,2.82,2.79,1900.0
9,Boeing 737-700,128,1997,2.8,2.71,1900.0


In [16]:
# Column means of the short-haul table. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
short_haul_clean.mean(numeric_only=True)

Seats                     152.9200
First flight             2002.3600
Fuel burn kg/km             3.1080
Fuel per seat L/100km       2.6872
Sector km                1900.0000
dtype: float64

In [17]:
# Display the cleaned medium-haul table.
medium_haul_clean

Unnamed: 0,Model,Seats,First flight,Fuel burn kg/km,Fuel per seat L/100km,Sector km
0,Airbus A320,150,1987,2.91,2.43,3984.0
1,Airbus A321NeoLR,154,2016,2.99,2.43,6300.0
2,Airbus A330-200,241,1997,6.0,3.11,5600.0
3,Airbus A330-300,262,1992,6.25,2.98,5600.0
4,Airbus A330neo-900,310,2016,6.0,2.42,6200.0
5,Airbus A340-300,262,1992,6.81,3.25,5600.0
6,Boeing 737 MAX-8,168,2017,2.86,2.13,6300.0
7,Boeing 737 MAX-9,144,2017,2.91,2.53,6300.0
8,Boeing 747-400,416,1988,10.77,3.24,3984.0
9,Boeing 747-8,467,2011,9.9,2.65,5600.0


In [18]:
# Column means of the medium-haul table. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
medium_haul_clean.mean(numeric_only=True)

Seats                     254.000000
First flight             1998.592593
Fuel burn kg/km             5.582222
Fuel per seat L/100km       2.743704
Sector km                5575.518519
dtype: float64

In [19]:
# Display the cleaned long-haul table.
long_haul_clean

Unnamed: 0,Model,Seats,First flight,Fuel burn kg/km,Fuel per seat L/100km,Sector km
0,Airbus A330-200,241,1997,6.4,3.32,11000.0
1,Airbus A330neo-800,248,2017,5.45,2.75,8610.0
2,Airbus A330neo-900,300,2017,5.94,2.48,8610.0
3,Airbus A340-300,262,1992,7.32,3.49,11000.0
4,Airbus A350-900,315,2013,6.03,2.39,9208.0
5,Airbus A350-900,315,2013,7.07,2.81,12116.0
6,Airbus A380,525,2005,13.78,3.27,13300.0
7,Airbus A380,544,2005,13.78,3.16,11000.0
8,Boeing 747-400,416,1988,11.11,3.34,11000.0
9,Boeing 747-8,467,2011,10.54,2.82,11000.0


In [20]:
# Column means of the long-haul table. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
long_haul_clean.mean(numeric_only=True)

Seats                      341.285714
First flight              2006.761905
Fuel burn kg/km              8.138095
Fuel per seat L/100km        2.958095
Sector km                10833.142857
dtype: float64

In [23]:
# Stack the five cleaned category tables into one frame.
# ignore_index=True gives the result a fresh 0..n-1 index; otherwise every
# category restarts at 0 and label-based lookups would match several rows.
all_flights = pd.concat(
    [commuter_clean, regional_clean, short_haul_clean, medium_haul_clean, long_haul_clean],
    ignore_index=True,
)

In [24]:
# Column means over all categories. numeric_only=True skips the text "Model"
# column explicitly; recent pandas versions raise instead of dropping it silently.
all_flights.mean(numeric_only=True)

Seats                     176.676692
First flight             2002.105263
Fuel burn kg/km             4.133083
Fuel per seat L/100km       3.221429
Sector km                3613.248120
dtype: float64