In [1]:
# Mithil Patel
# DSC 540 - Milestone 3 Assignment
# 11/02/2022

# Import Libraries
from io import StringIO

import pandas as pd
import requests as r
from bs4 import BeautifulSoup

# Wikipedia URL
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_carbon_dioxide_emissions'

# Requesting data from website.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops the notebook on an HTTP error instead of letting
# an error page be parsed as if it were the article.
page = r.get(url, timeout=30)
page.raise_for_status()

# Parsing HTML
soup = BeautifulSoup(page.text, 'html.parser')

# Finding every table with the "wikitable" class
table = soup.find_all('table', {'class': "wikitable"})
if not table:
    # Fail here with a clear message rather than inside read_html.
    raise ValueError(f"No 'wikitable' tables found at {url}")

# Reading html format.
# read_html expects a path, URL or file-like object; passing the markup as a
# bare string is deprecated, so the text is wrapped in StringIO.
df = pd.read_html(StringIO(str(table)))

# Take the first parsed table as the working dataframe. The copy keeps the
# parsed list in `df` untouched when web_df is modified in later cells.
web_df = df[0].copy()
web_df

Unnamed: 0_level_0,Country[20],Fossil CO2 emissions(Mt CO2),Fossil CO2 emissions(Mt CO2),Fossil CO2 emissions(Mt CO2),Fossil CO2 emissions,Fossil CO2 emissions,2017 – Fossil CO2 emissions,2017 – Fossil CO2 emissions,2018 CO2 emissions[21],2018 CO2 emissions[21]
Unnamed: 0_level_1,Country[20],1990,2005,2017,2017(% of world),2017 vs 1990: change (%),Per land area(t CO2/km2/yr),Per capita(t CO2/cap/yr),Total including LUCF[22],Total excluding LUCF[23]
0,World,22674.116,30049.809,37077.404,100.00%,63.5%,73.0,4.9,36431.11,35247.21
1,World – International Aviation,258.941,422.777,543.381,1.47%,109.8%,,,,
2,World – International Shipping,371.804,572.169,677.248,1.83%,82.2%,,,,
3,Afghanistan,2.546,1.063,11.422,0.03%,348.6%,18.0,0.3,7.59,7.44
4,Albania,6.583,4.196,5.026,0.01%,-23.7%,175.0,1.7,5.32,5.56
...,...,...,...,...,...,...,...,...,...,...
207,Vietnam,20.182,99.231,218.729,0.59%,983.8%,660.0,2.3,245.36,257.86
208,Western Sahara,0.144,0.227,0.276,0.00%,91.7%,1.0,0.5,,
209,Yemen,6.887,21.768,12.503,0.03%,81.5%,24.0,0.4,9.31,9.31
210,Zambia,2.955,2.457,4.967,0.01%,68.1%,7.0,0.3,40.47,7.74


In [2]:
# Transformation 1: removing a layer of header
# Reason: The dataframe has two header levels showing more or less the same information, and
#         the double header makes it difficult to select columns by name.
single_level_header = web_df.columns.droplevel(0)  # keep only the inner (second) level
web_df.columns = single_level_header
web_df.head()

Unnamed: 0,Country[20],1990,2005,2017,2017(% of world),2017 vs 1990: change (%),Per land area(t CO2/km2/yr),Per capita(t CO2/cap/yr),Total including LUCF[22],Total excluding LUCF[23]
0,World,22674.116,30049.809,37077.404,100.00%,63.5%,73.0,4.9,36431.11,35247.21
1,World – International Aviation,258.941,422.777,543.381,1.47%,109.8%,,,,
2,World – International Shipping,371.804,572.169,677.248,1.83%,82.2%,,,,
3,Afghanistan,2.546,1.063,11.422,0.03%,348.6%,18.0,0.3,7.59,7.44
4,Albania,6.583,4.196,5.026,0.01%,-23.7%,175.0,1.7,5.32,5.56


In [3]:
# Transformation 2: removing irrelevant columns
# Reason: We would like to see whether emissions have increased over approximately three decades;
#         therefore, we keep only the country name and the 2017-vs-1990 percentage change.
# .copy() makes the result an independent dataframe, so later modifications are
# not applied to a slice of the full table (the cause of SettingWithCopyWarning).
web_df = web_df[['Country[20]', '2017 vs 1990: change (%)']].copy()
web_df.head()

Unnamed: 0,Country[20],2017 vs 1990: change (%)
0,World,63.5%
1,World – International Aviation,109.8%
2,World – International Shipping,82.2%
3,Afghanistan,348.6%
4,Albania,-23.7%


In [4]:
# Transformation 3: rename column titles
# Reason: Renaming headers for readability as well as to have a common header to merge with other dataframes.
import warnings
# NOTE(review): this silences every warning for the rest of the session, not
# only pandas' copy warnings; consider narrowing or removing it.
warnings.filterwarnings("ignore")

# rename() returns a new dataframe; reassigning avoids mutating a derived frame in place.
web_df = web_df.rename(
    columns={
        'Country[20]': 'country',
        '2017 vs 1990: change (%)': '2017_vs_1990:_change(%)',
    }
)
web_df.head()

Unnamed: 0,country,2017_vs_1990:_change(%)
0,World,63.5%
1,World – International Aviation,109.8%
2,World – International Shipping,82.2%
3,Afghanistan,348.6%
4,Albania,-23.7%


In [5]:
# Transformation 4: Removing the world-aggregate rows
# Reason: For this study, we are only concerned with country data; therefore, world data is
#         irrelevant
# The aggregate rows ("World", "World – International Aviation",
# "World – International Shipping") are matched by name rather than by position,
# so re-running this cell does not drop three more rows each time.
is_world_row = web_df['country'].str.startswith('World', na=False)

# .copy() gives an independent dataframe, so later column assignments do not
# write into a slice of the previous frame.
web_df = web_df.loc[~is_world_row].copy()
web_df

Unnamed: 0,country,2017_vs_1990:_change(%)
3,Afghanistan,348.6%
4,Albania,-23.7%
5,Algeria,143.5%
6,Angola,427.7%
7,Anguilla,366.7%
...,...,...
207,Vietnam,983.8%
208,Western Sahara,91.7%
209,Yemen,81.5%
210,Zambia,68.1%


In [6]:
# Transformation 5: converting percentage column from object to float type
# Reason: Converting to float type in order to create visualizations for analysis
change_col = "2017_vs_1990:_change(%)"

# Clean the text form of the values:
#   - astype(str) lets the cell be re-run after the column is already float
#   - rstrip('%') removes only a trailing percent sign, whereas slicing off the
#     last character would also eat a digit from a value that has no '%'
#   - thousands separators are removed as a literal comma, not a regex
change_text = (
    web_df[change_col]
    .astype(str)
    .str.strip()
    .str.rstrip('%')
    .str.replace(',', '', regex=False)
)

# Convert from object to float; assign() returns a new dataframe instead of
# writing into the existing one.
web_df = web_df.assign(**{change_col: change_text.astype(float)})
web_df

Unnamed: 0,country,2017_vs_1990:_change(%)
3,Afghanistan,348.6
4,Albania,-23.7
5,Algeria,143.5
6,Angola,427.7
7,Anguilla,366.7
...,...,...
207,Vietnam,983.8
208,Western Sahara,91.7
209,Yemen,81.5
210,Zambia,68.1


In [7]:
# Checking for duplicates: display any row that repeats an earlier row
duplicate_mask = web_df.duplicated()
web_df.loc[duplicate_mask]

Unnamed: 0,country,2017_vs_1990:_change(%)


In [8]:
# Counting missing values in each column
web_df.isna().sum()

country                    0
2017_vs_1990:_change(%)    0
dtype: int64

In [9]:
# Persist web_df with IPython's %store magic so that a different notebook can
# load it (with `%store -r web_df`) instead of re-running the scrape.
%store web_df

Stored 'web_df' (DataFrame)
