# web scraping using BeautifulSoup

Scrap data from this link: https://en.wikipedia.org/wiki/List_of_African_countries_by_area
        
focus on the first table:
- take rank , county and area.
- using pandas split are into two columns: km**2 and square mi

In [1]:
# importing relevant libraries
from bs4 import BeautifulSoup
import requests

In [3]:
# Assign link to variable
url = 'https://en.wikipedia.org/wiki/List_of_African_countries_by_area'

# requests component is for getting raw html content from the website URL provided
rawdata = requests.get(url)

In [4]:
#check for errors
rawdata

<Response [200]>

In [5]:
#extract the text from the html script(rawdata)

rawtext = rawdata.text
rawtext

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>List of African countries by area - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d88ebad7-bd1f-4a7f-a7b5-102eaf4b538c","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_African_countries_by_area","wgTitle":"List of African countries by area","wgCurRevisionId":974427570,"wgRevisionId":974427570,"wgArticleId":50165241,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Lists of countries in

In [6]:
#using beautifulsoup to translate the html script(rawtext)
myscript = BeautifulSoup(rawtext, "html.parser")

#use prettify module to show readble htmlscript
print(myscript.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of African countries by area - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d88ebad7-bd1f-4a7f-a7b5-102eaf4b538c","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_African_countries_by_area","wgTitle":"List of African countries by area","wgCurRevisionId":974427570,"wgRevisionId":974427570,"wgArticleId":50165241,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidat

Get the TITLE

In [7]:
print(myscript.title.text)

List of African countries by area - Wikipedia


Get the TABLE we need

In [9]:
table = myscript.find('table', {'class' : 'wikitable sortable'})

In [10]:
#rows = table.find_all('tr')
rows = table.find_all('tr')

In [11]:
rank = [row.find('td').text for row in rows[1:]]
print(rank)

['1\n', '2\n', '3\n', '4\n', '5\n', '6\n', '7\n', '8\n', '9\n', '10\n', '11\n', '12\n', '13\n', '14\n', '15\n', '16\n', '17\n', '18\n', '19\n', '20\n', '21\n', '22\n', '23\n', '24\n', '25\n', '26\n', '27\n', '28\n', '29\n', '30\n', '31\n', '32\n', '33\n', '34\n', '35\n', '36\n', '37\n', '38\n', '39\n', '40\n', '41\n', '42\n', '43\n', '44\n', '45\n', '46\n', '47\n', '48\n', '49\n', '50\n', '51\n', '52\n', '53\n', '54\n']


In [12]:
country = [row.find('a').text for row in rows[1:]]
print(country)

['Algeria', 'Democratic Republic of the Congo', 'Sudan', 'Libya', 'Chad', 'Niger', 'Angola', 'Mali', 'South Africa', 'Ethiopia', 'Mauritania', 'Egypt', 'Tanzania', 'Nigeria', 'Namibia', 'Mozambique', 'Zambia', 'Morocco', 'South Sudan', 'Somalia', 'Central African Republic', 'Madagascar', 'Botswana', 'Kenya', 'Cameroon', 'Zimbabwe', 'Republic of the Congo', "Ivory Coast (Côte d'Ivoire)", 'Burkina Faso', 'Gabon', 'Guinea', 'Ghana', 'Uganda', 'Senegal', 'Tunisia', 'Malawi', 'Eritrea', 'Benin', 'Liberia', 'Sierra Leone', 'Togo', 'Guinea-Bissau', 'Lesotho', 'Equatorial Guinea', 'Burundi', 'Rwanda', 'Djibouti', 'Eswatini (Swaziland)', 'Gambia, The', 'Cape Verde', 'Comoros', 'Mauritius', 'São Tomé and Príncipe', 'Seychelles']


In [13]:
area = [row.find_all('td')[2].text for row in rows[1:]]
print(area)

['2,381,741\xa0km2 (919,595\xa0sq\xa0mi)\n', '2,344,858\xa0km2 (905,355\xa0sq\xa0mi)\n', '1,861,484\xa0km2 (718,723\xa0sq\xa0mi)\n', '1,759,540\xa0km2 (679,362\xa0sq\xa0mi)\n', '1,284,000\xa0km2 (495,755\xa0sq\xa0mi)\n', '1,267,000\xa0km2 (489,191\xa0sq\xa0mi)\n', '1,246,700\xa0km2 (481,354\xa0sq\xa0mi)\n', '1,240,192\xa0km2 (478,841\xa0sq\xa0mi)\n', '1,221,037\xa0km2 (471,445\xa0sq\xa0mi)\n', '1,104,300\xa0km2 (426,373\xa0sq\xa0mi)\n', '1,030,700\xa0km2 (397,955\xa0sq\xa0mi)\n', '1,001,449\xa0km2 (386,662\xa0sq\xa0mi)\n', '945,203\xa0km2 (364,945\xa0sq\xa0mi)\n', '923,768\xa0km2 (356,669\xa0sq\xa0mi)\n', '825,418\xa0km2 (318,696\xa0sq\xa0mi)\n', '801,590\xa0km2 (309,496\xa0sq\xa0mi)\n', '752,614\xa0km2 (290,586\xa0sq\xa0mi)\n', '710,850\xa0km2 (274,461\xa0sq\xa0mi)\n', '644,329\xa0km2 (248,777\xa0sq\xa0mi)\n', '637,657\xa0km2 (246,201\xa0sq\xa0mi)\n', '622,984\xa0km2 (240,535\xa0sq\xa0mi)\n', '587,041\xa0km2 (226,658\xa0sq\xa0mi)\n', '581,726\xa0km2 (224,606\xa0sq\xa0mi)\n', '580,367\

export the data to a file

In [14]:
import pandas as pd
df = pd.DataFrame()
df['rank'] = rank
df['country'] = country
df['area'] = area

df.head()

Unnamed: 0,rank,country,area
0,1\n,Algeria,"2,381,741 km2 (919,595 sq mi)\n"
1,2\n,Democratic Republic of the Congo,"2,344,858 km2 (905,355 sq mi)\n"
2,3\n,Sudan,"1,861,484 km2 (718,723 sq mi)\n"
3,4\n,Libya,"1,759,540 km2 (679,362 sq mi)\n"
4,5\n,Chad,"1,284,000 km2 (495,755 sq mi)\n"


In [15]:
df.to_csv("wikiarea scrap.csv", index = False)

Split the area column

In [16]:
#read the file
areascrap = pd.read_csv('wikiarea scrap.csv')
areascrap.head()

Unnamed: 0,rank,country,area
0,1,Algeria,"2,381,741 km2 (919,595 sq mi)\n"
1,2,Democratic Republic of the Congo,"2,344,858 km2 (905,355 sq mi)\n"
2,3,Sudan,"1,861,484 km2 (718,723 sq mi)\n"
3,4,Libya,"1,759,540 km2 (679,362 sq mi)\n"
4,5,Chad,"1,284,000 km2 (495,755 sq mi)\n"


In [17]:
#split the column area
areascrap[['Area_sq_km','Area_sq_mi']] = areascrap.area.str.split(' ',2, expand=True)

In [18]:
#remove )\n from the area column

areascrap.Area_sq_mi = areascrap.Area_sq_mi.str.replace( '\n','')
areascrap.head()

Unnamed: 0,rank,country,area,Area_sq_km,Area_sq_mi
0,1,Algeria,"2,381,741 km2 (919,595 sq mi)\n","2,381,741 km2","(919,595 sq mi)"
1,2,Democratic Republic of the Congo,"2,344,858 km2 (905,355 sq mi)\n","2,344,858 km2","(905,355 sq mi)"
2,3,Sudan,"1,861,484 km2 (718,723 sq mi)\n","1,861,484 km2","(718,723 sq mi)"
3,4,Libya,"1,759,540 km2 (679,362 sq mi)\n","1,759,540 km2","(679,362 sq mi)"
4,5,Chad,"1,284,000 km2 (495,755 sq mi)\n","1,284,000 km2","(495,755 sq mi)"


In [19]:
#drop area column
areascrap.drop(columns ="area",axis=3, inplace=True)
areascrap.head()

Unnamed: 0,rank,country,Area_sq_km,Area_sq_mi
0,1,Algeria,"2,381,741 km2","(919,595 sq mi)"
1,2,Democratic Republic of the Congo,"2,344,858 km2","(905,355 sq mi)"
2,3,Sudan,"1,861,484 km2","(718,723 sq mi)"
3,4,Libya,"1,759,540 km2","(679,362 sq mi)"
4,5,Chad,"1,284,000 km2","(495,755 sq mi)"


In [20]:
areascrap.to_csv('new wiki arescrap.csv', index=False)