# web scraping using BeautifulSoup

Scrap data from this link: https://en.wikipedia.org/wiki/List_of_African_countries_by_area
        
focus on the first table:
- take rank , county and area.
- using pandas split are into two columns: km**2 and square mi

In [94]:
#importing BeautifulSoup and libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [95]:
#storing the link in the 'link' variable
link = " https://en.wikipedia.org/wiki/List_of_African_countries_by_area"
#make a get request for the link(web address)
HTTP_script = requests.get(link)

In [96]:
#checking the status of the request
HTTP_script

<Response [200]>

In [97]:
#Storing the html as text
text_script = HTTP_script.text

In [98]:
#parsing the page through BeautifulSoup and using the prettify method to make the text readable
# and also show the html layout of the page
relevant_text = BeautifulSoup(text_script, "lxml") 
print(relevant_text.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of African countries by area - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"8e9e3a02-9e9a-4022-ad77-c1a8e35ca1a4","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_African_countries_by_area","wgTitle":"List of African countries by area","wgCurRevisionId":987823386,"wgRevisionId":987823386,"wgArticleId":50165241,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Articles with short description","Short description is different from Wikidata","Lists of 

In [99]:
#storing the all elements of the table to be scraped with the class it has attached to it
table = relevant_text.find('table',{'class':'wikitable sortable jquery-tablesorter'})

In [100]:
# storing finding and storing the elements of the table with the 'tr' tag 
table_rows = relevant_text.tbody.find_all('tr')

In [101]:
# Getting the Rank
# Creating an empty list to store the rank
Rank = []
#looping through all the rows in the table, finding and appending the elments with the 'td' tag to the list
# with the .strip() ensuring that it is returned as the proper data type
for row in table_rows[1:]:
    Rank.append(row.find('td').text.strip())

In [102]:
#Gettiing the countries
#Create an empty list to contain the countries
Countries =[]
#Iterating through the rows to find the content inside the 'a' tag, having the countries in them
#appending them to the list as well
for row in table_rows[1:]:
    Countries.append(row.find('a').text)

In [103]:
#Getting the area off the table
#Creating an empty list to hold the area
Area = []
#Iterating through the rows...
for row in table_rows[1:]:
    #Finding all the 'td' tags in the table
    cells=row.find_all('td')
    #Stating the conditions of the iteration, since we know from the table layout that the number of columns we have is 4
    if len(cells)==4:
        #Appending the content of the specific cell we want to the empty list and ensuring it is returned as the proper data type
        Area.append(cells[2].text.strip())

In [104]:
#Creating  a Pandas Dataframe and entering the lists we have gotten in it
df = pd.DataFrame()
df['Rank'] = Rank
df["Countries"] = Countries
df["Area"] = Area

In [105]:
#Splitting the two values of the area, with the split at the beginning of the parenthesis
split_data= df["Area"].str.split("(", n = 1, expand = True)
#Storing the split into different columns
df["Area(km**2)"]= split_data[0] 
df["Area(sq mi)"] = split_data[1]
#Dropping the original 'Area' column
df.drop(columns =["Area"], inplace = True) 

In [93]:
#The final dataframe
df

Unnamed: 0,Rank,Countries,Area(km**2),Area(sq mi)
0,1,Algeria,"2,381,741 km2","919,595 sq mi)"
1,2,Democratic Republic of the Congo,"2,344,858 km2","905,355 sq mi)"
2,3,Sudan,"1,861,484 km2","718,723 sq mi)"
3,4,Libya,"1,759,540 km2","679,362 sq mi)"
4,5,Chad,"1,284,000 km2","495,755 sq mi)"
5,6,Niger,"1,267,000 km2","489,191 sq mi)"
6,7,Angola,"1,246,700 km2","481,354 sq mi)"
7,8,Mali,"1,240,192 km2","478,841 sq mi)"
8,9,South Africa,"1,221,037 km2","471,445 sq mi)"
9,10,Ethiopia,"1,104,300 km2","426,373 sq mi)"
