# web scraping using BeautifulSoup

Scrap data from this link: https://en.wikipedia.org/wiki/List_of_African_countries_by_area
        
focus on the first table:
- take rank , country and area.
- using pandas split are into two columns: km**2 and square mi

In [1]:
#import neccessary libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import lxml.html as lh

In [2]:
#assingling link to the variable
link = 'https://en.wikipedia.org/wiki/List_of_African_countries_by_area'
page = requests.get(link)
print(page)

<Response [200]>


In [3]:
#Store the contents of the website under doc
doc = lh.fromstring(page.content)

In [4]:
#Parse data that are stored between <tr>..</tr> of HTML
tr_elements = doc.xpath('//tr')

In [5]:
#Check the length of the first 12 rows
[len(T) for T in tr_elements[:12]]

[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]

In [6]:
#Create empty list
col=[]
i=0
#For each row, store each first element (header) and an empty list
for t in tr_elements[0]:
    i+=1
    name=t.text_content()
    print ('%d:"%s"'%(i,name))
    col.append((name,[]))

1:"Rank
"
2:"Country[1][2][3]
"
3:"Area[a][b][4]
"
4:"Notes
"


In [7]:
# Since our first row is the header, data is stored on the second row onwards
for j in range(1,len(tr_elements)):
    #T is our j'th row
    T=tr_elements[j]
    
    #If row is not of size 4, the //tr data is not from our table 
    if len(T)!=4:
        break
    
    #i is the index of our column
    i=0
    
    #Iterate through each element of the row
    for t in T.iterchildren():
        data=t.text_content() 
        #Check if row is empty
        if i>0:
        #Convert any numerical value to integers
            try:
                data=int(data)
            except:
                pass
        #Append the data to the empty list of the i'th column
        col[i][1].append(data[:-1])
        #Increment i for the next column
        i+=1

In [8]:
[len(C) for (title,C) in col]

[54, 54, 54, 54]

In [9]:
Dict={title:column for (title,column) in col}
df=pd.DataFrame(Dict)

In [10]:
# new data frame with split value columns 
new = df["Area[a][b][4]\n"].str.split("(", n = 1, expand = True) 
  
# making separate first name column from new data frame 
df["Areakm**2"]= new[0] 
  
# making separate last name column from new data frame 
df["Areasquare mi"]= new[1] 
  
# Dropping old Name columns 
df.drop(columns =["Area[a][b][4]\n"], inplace = True)

In [11]:
df

Unnamed: 0,Rank,Country[1][2][3],Notes,Areakm**2,Areasquare mi
0,1,Algeria,,"2,381,741 km2","919,595 sq mi)"
1,2,Democratic Republic of the Congo,,"2,344,858 km2","905,355 sq mi)"
2,3,Sudan,,"1,861,484 km2","718,723 sq mi)"
3,4,Libya,,"1,759,540 km2","679,362 sq mi)"
4,5,Chad,,"1,284,000 km2","495,755 sq mi)"
5,6,Niger,,"1,267,000 km2","489,191 sq mi)"
6,7,Angola,,"1,246,700 km2","481,354 sq mi)"
7,8,Mali,,"1,240,192 km2","478,841 sq mi)"
8,9,South Africa,,"1,221,037 km2","471,445 sq mi)"
9,10,Ethiopia,,"1,104,300 km2","426,373 sq mi)"
