# Web Scraping using Selenium

In [1]:
# Install Selenium once per environment if it is missing (left commented out):
#pip install selenium

# Load the necessary libraries

In [2]:
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

# Create an instance of ChromeOptions

In [3]:
# Container for the browser flags; it is passed to webdriver.Chrome below.
options = webdriver.ChromeOptions()

# Run Browser in headless mode

In [4]:
# Headless flag: run the browser without opening a visible window.
options.add_argument("--headless=new")

# Initialize the Chrome driver with the specified options

In [5]:
# Start the Chrome session with the options configured above. The session
# stays open until driver.quit() is called in the last cell.
driver = webdriver.Chrome(options=options)

# URL to be scraped

In [6]:
# Wikipedia article whose first content table is scraped in the cells below.
url = "https://en.wikipedia.org/wiki/List_of_countries_by_carbon_dioxide_emissions"

# Navigate to web page

In [7]:
# Load the page, then block until the article body container is in the DOM.
# An explicit wait returns as soon as the element exists and raises
# TimeoutException after 10 s, instead of always pausing for a fixed 3 s
# that may be too long on a fast connection or too short on a slow one.
driver.get(url)
WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "mw-content-text"))
);  # trailing ';' keeps the returned WebElement out of the cell output

# Get a single row element from the web page

In [8]:
# Absolute XPath to one <tr>: row 212 of the first table inside the element
# with id "mw-content-text". find_elements returns a list, which the next
# cell iterates over.
rows = driver.find_elements(By.XPATH,'//*[@id="mw-content-text"]/div[1]/table[1]/tbody/tr[212]')

# Save the result into the list

In [9]:
# For each matched <tr>, gather the text of its <td> cells and print them
# as one list per row.
for tr in rows:
    cell_texts = []
    for td in tr.find_elements(By.TAG_NAME, "td"):
        cell_texts.append(td.text)
    print(cell_texts)

[' Faroe Islands', '0.0000%', '0.00', '0.00', ' +20%']


# Get the element at a bigger scope (the whole table)

In [10]:
# Same XPath prefix as the single-row lookup, but stopping at the <table>
# itself; the result is a list of matching table elements.
country = driver.find_elements(By.XPATH,'//*[@id="mw-content-text"]/div[1]/table[1]')

# Save the result into the list

In [11]:
# Collect the visible text of every <tr> in the matched table(s), one string
# per row. Building the list in a single expression means `datas` is always
# defined (the old loop left it undefined when `country` was empty) and the
# global `rows` from the single-row cell above is no longer overwritten.
datas = [
    tr.text
    for table in country
    for tr in table.find_elements(By.TAG_NAME, "tr")
]

# Fail here with a clear message rather than with an IndexError several
# cells later if the page layout changed and nothing was matched.
assert datas, "no table rows found - check the XPath used for `country`"

# Checking the result

In [12]:
# Show two raw row strings (indices 3 and 4) to inspect the text format.
print(datas[3:5])

[' China 34.0% 13,259.64 3,666.95  +262%', ' United States 12.0% 4,682.04 5,928.97  −21%']


# Checking the length of the data

In [13]:
# Number of <tr> rows collected, header rows included.
print(len(datas))

214


# Checking the content of the last item in the list

In [14]:
# Inspect the last collected row. A negative index keeps pointing at the
# final row even if the table gains or loses rows, whereas the literal 213
# was only "last" for a 214-row table.
datas[-1]

' Faroe Islands 0.0000% 0.00 0.00  +20%'

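# Change list dimension from 214 x 1 to 214 x 5 by splitting each row string on whitespace (multi-word names yield more than 5 items and are corrected below)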

In [15]:
# Tokenise every row string on runs of whitespace; each row becomes a list
# of tokens.
new_data = list(map(str.split, datas))

# Checking the result of the dimension change on the first data row

In [16]:
# Tokenised form of row 3 (the China row in the recorded run).
new_data[3]

['China', '34.0%', '13,259.64', '3,666.95', '+262%']

# Checking the result of the dimension change on the last row

In [17]:
# Tokenised form of the last row. Indexing with -1 instead of the literal
# 213 keeps this pointing at the final row if the table length changes.
new_data[-1]

['Faroe', 'Islands', '0.0000%', '0.00', '0.00', '+20%']

# Create a copy of the data rows for manipulation

In [18]:
# Keep everything from index 3 onward; the first three tokenised rows are
# not used as country data here. An open-ended slice
# replaces the hard-coded end of 214 so rows are not silently dropped if the
# table grows. The slice is a new outer list, so replacing entries of
# `countries` later does not alter `new_data`.
countries = new_data[3:]

# Checking copy data result

In [19]:
# Confirm the last entry of the copy. -1 replaces the literal 210, which was
# only the final index for a 211-row list.
countries[-1]

['Faroe', 'Islands', '0.0000%', '0.00', '0.00', '+20%']

# Check which rows do not have the correct dimension 1 x 5

In [20]:
# List every row that split into more than five tokens, together with its
# position, so the multi-word names that need re-joining are visible.
for idx, tokens in enumerate(countries):
    if len(tokens) <= 5:
        continue
    print(idx, tokens)

1 ['United', 'States', '12.0%', '4,682.04', '5,928.97', '−21%']
3 ['European', 'Union', '6.4%', '2,512.07', '3,563.26', '−30%']
7 ['International', 'Shipping', '1.8%', '706.32', '503.29', '+40%']
9 ['Saudi', 'Arabia', '1.6%', '622.91', '265.24', '+135%']
12 ['South', 'Korea', '1.5%', '573.54', '474.16', '+21%']
13 ['International', 'Aviation', '1.3%', '491.63', '355.32', '+38%']
17 ['South', 'Africa', '1.0%', '397.37', '347.30', '+14%']
20 ['Italy,', 'San', 'Marino', 'and', 'Vatican', 'City', '0.8%', '305.49', '454.72', '−33%']
21 ['United', 'Kingdom', '0.8%', '302.10', '551.68', '−45%']
24 ['France', 'and', 'Monaco', '0.7%', '282.43', '401.21', '−30%']
29 ['Spain', 'and', 'Andorra', '0.6%', '217.26', '313.24', '−31%']
30 ['United', 'Arab', 'Emirates', '0.5%', '205.99', '88.46', '+133%']
45 ['Czech', 'Republic', '0.2%', '90.51', '132.31', '−32%']
52 ['North', 'Korea', '0.2%', '64.27', '73.81', '−13%']
54 ['Israel', 'and', 'Palestine', '0.2%', '61.25', '59.30', '+3%']
58 ['Serbia', 'and

# Data correction for a single list with more than 5 items

In [21]:
# Re-join the name tokens of row 209 into a single string. The last four
# tokens are the value columns, so everything before them is the name; this
# replaces the hard-coded split point of 7. The length guard makes the cell
# safe to re-run: without it, a second execution would join the already
# corrected five fields into one string.
row_idx = 209
tokens = countries[row_idx]
if len(tokens) > 5:
    name_len = len(tokens) - 4
    countries[row_idx] = [' '.join(tokens[:name_len])] + tokens[name_len:]

# Checking data correction result

In [22]:
# Show row 209 after the correction.
countries[209]

['Saint Helena, Ascension and Tristan da Cunha',
 '0.0000%',
 '0.02',
 '0.01',
 '+58%']

# Data correction for all data

In [23]:
# Apply the same correction to every row: when a row has more than five
# tokens, all tokens before the last four are merged into one name string.
# Rows with five or fewer tokens are left untouched.
for idx, tokens in enumerate(countries):
    if len(tokens) <= 5:
        continue
    split_at = len(tokens) - 4
    name = " ".join(tokens[:split_at])
    countries[idx] = [name] + tokens[split_at:]

# Check that all rows have the correct dimension

In [24]:
# Verify that EVERY row has exactly five fields.
# The previous loop hit `break` in all three branches, so it stopped after
# inspecting only the first row and could report "Data already OK" while
# later rows were still malformed.
bad_rows = [(idx, row) for idx, row in enumerate(countries) if len(row) != 5]

if bad_rows:
    print("Data correction needed")
    # List the offending rows so they can be located and fixed.
    for idx, row in bad_rows:
        print(idx, row)
else:
    print("Data already OK")

Data already OK


# List manipulation for column names

In [25]:
# Tokens of the first header row of the table.
new_data[0]

['Location',
 '%',
 'of',
 'global',
 'total',
 'Fossil',
 'emissions',
 '(1,000,000',
 'tons',
 'per',
 'year)',
 '%',
 'change',
 'from',
 '2000']

In [26]:
# First token of the second header row (a year label in the recorded run).
new_data[1][0]

'2023'

In [27]:
# Concatenate the tokens of both header rows into one flat list, then show
# the list and its length on separate lines.
des = [*new_data[0], *new_data[1]]
print(des, len(des), sep="\n")

['Location', '%', 'of', 'global', 'total', 'Fossil', 'emissions', '(1,000,000', 'tons', 'per', 'year)', '%', 'change', 'from', '2000', '2023', '2000']
17


In [28]:
# Assemble the five column labels from slices of the header tokens in `des`.
location_label = des[0]
share_label = " ".join(des[1:5])
fossil_label = " ".join(des[5:7])
change_label = " ".join(des[11:15])

# des[15] and des[16] are the two year tokens taken from the second header
# row; each is appended to the shared "Fossil emissions" prefix.
description = [
    location_label,
    share_label,
    f"{fossil_label} {des[15]}",
    f"{fossil_label} {des[16]}",
    change_label,
]

# Column names result

In [29]:
# Display the assembled labels.
description

['Location',
 '% of global total',
 'Fossil emissions 2023',
 'Fossil emissions 2000',
 '% change from 2000']

# Build a DataFrame to save in a document format

In [30]:
# One DataFrame row per entry of `countries`, labelled with `description`.
# Every value is a string produced by str.split/join, so the numeric-looking
# columns hold text, not numbers.
df = pd.DataFrame(countries, columns=description)

# Data Frame result

In [31]:
# Bare expression so the notebook renders the frame as its cell output.
df

Unnamed: 0,Location,% of global total,Fossil emissions 2023,Fossil emissions 2000,% change from 2000
0,China,34.0%,13259.64,3666.95,+262%
1,United States,12.0%,4682.04,5928.97,−21%
2,India,7.6%,2955.18,995.65,+197%
3,European Union,6.4%,2512.07,3563.26,−30%
4,Russia,5.3%,2069.50,1681.14,+23%
...,...,...,...,...,...
206,Saint Pierre and Miquelon,0.0001%,0.04,0.02,+129%
207,Anguilla,0.0001%,0.02,0.02,+48%
208,Falkland Islands,0.0000%,0.02,0.01,+170%
209,"Saint Helena, Ascension and Tristan da Cunha",0.0000%,0.02,0.01,+58%


In [32]:
# Rows at positions 1 through 5, selected explicitly by position.
df.iloc[1:6]

Unnamed: 0,Location,% of global total,Fossil emissions 2023,Fossil emissions 2000,% change from 2000
1,United States,12.0%,4682.04,5928.97,−21%
2,India,7.6%,2955.18,995.65,+197%
3,European Union,6.4%,2512.07,3563.26,−30%
4,Russia,5.3%,2069.5,1681.14,+23%
5,Japan,2.4%,944.76,1248.81,−24%


# Save to csv format

In [33]:
# Write the table to a CSV file in the current working directory;
# index=False leaves the DataFrame's row index out of the file.
df.to_csv("CO2 Emission Country.csv", index=False)

# Close the driver connection

In [34]:
# End the WebDriver session created by webdriver.Chrome(...) earlier in the
# notebook.
driver.quit()