# Data Scraping using Beautiful Soup
* Import Beautiful Soup
* Make a GET request to fetch Page Data
* Parse HTML
* Filter relevant parts

In [109]:
import bs4
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
import numpy as np
import pandas as pd

In [110]:
# Wikipedia page whose first "wikitable" lists the Android releases.
android_url = "https://en.wikipedia.org/wiki/Android_version_history"

# The context manager closes the HTTP response even if read() raises,
# which the manual data.close() call could not guarantee.
with urlopen(android_url) as data:
    # Raw page body as bytes; it is parsed in a later cell.
    android_html = data.read()

# 2. Parsing Data

In [111]:
# Build the parse tree from the downloaded bytes using Python's built-in
# HTML parser.
android_soup = soup(android_html, features='html.parser')

In [112]:
# NOTE: find() returns only the FIRST <table class="wikitable"> as a single
# element (not a list), despite the plural variable name. Later cells rely
# on this by calling .find_all() directly on it.
android_tables = android_soup.find('table',{'class':'wikitable'})
# NOTE(review): len() of a single element counts its direct children, not the
# number of matching tables on the page -- confirm this is the intended check.
print(len(android_tables))

2


# 3. Extracting Useful Information
* Remove undesired tags
* Extract table headers and data

In [113]:
# Gather every <th> element of the table; these are still tag objects here
# and are reduced to plain strings in the next cell.
headers = android_tables.find_all(name='th')
print(headers)

[<th>Name
</th>, <th>Version number(s)
</th>, <th>Initial release date
</th>, <th>Latest release
</th>, <th>API level
</th>, <th>References
</th>]


In [114]:
# Rebuild the header labels straight from the table instead of mutating the
# previous `headers` list in place: the cell can then be re-run safely
# (the in-place version failed on a second run because the list already held
# plain strings, which have no `.text`).
# rstrip("\n") removes only trailing newlines, so a label that does not end
# in a newline no longer loses its last character.
headers = [th.text.rstrip("\n") for th in android_tables.find_all('th')]
headers

['Name',
 'Version number(s)',
 'Initial release date',
 'Latest release',
 'API level',
 'References']

In [115]:
# Every <tr> except the first one, which holds the <th> header cells.
rows_table = android_tables.find_all('tr')[1:]

# One inner list of cell strings per table row.
list_rows = []
for row in rows_table:
    list_ele = []
    for cell in row.find_all('td'):
        cell_text = cell.text
        # Drop a single trailing newline. endswith() is safe on an empty
        # cell, whereas indexing cell_text[-1] raised IndexError there.
        if cell_text.endswith("\n"):
            cell_text = cell_text[:-1]
        list_ele.append(cell_text)
    list_rows.append(list_ele)

In [116]:
print(list_rows)
# The second row ('1.1') has one cell fewer than the first -- presumably its
# name cell is shared with the row above -- so it would not line up with the
# header columns. Remove it only while it is actually the short row: an
# unconditional slice deleted a further, valid row each time the cell was
# re-run.
if len(list_rows) > 1 and len(list_rows[1]) < len(list_rows[0]):
    list_rows = list_rows[:1] + list_rows[2:]

[['No codename', '1.0', 'September 23, 2008', '', '1', '[9]'], ['1.1', 'February 9, 2009', '', '2', '[9][11]'], ['Cupcake', '1.5', 'April 27, 2009', '', '3', ''], ['Donut', '1.6', 'September 15, 2009', '', '4', '[12]'], ['Eclair', '2.0 – 2.1', 'October 26, 2009', 'January 12, 2010', '5 – 7', '[13]'], ['Froyo', '2.2 – 2.2.3', 'May 20, 2010', 'November 21, 2011', '8', '[14]'], ['Gingerbread', '2.3 – 2.3.7', 'December 6, 2010', 'September 21, 2011', '9 – 10', '[15]'], ['Honeycomb', '3.0 – 3.2.6', 'February 22, 2011', 'February 15, 2014', '11 – 13', '[16]'], ['Ice Cream Sandwich', '4.0 – 4.0.4', 'October 18, 2011', 'June 6, 2012', '14 – 15', '[17]'], ['Jelly Bean', '4.1 – 4.3.1', 'July 9, 2012', 'October 7, 2013', '16 – 18', '[18]'], ['KitKat', '4.4 – 4.4.4', 'October 31, 2013', 'July 7, 2014', '19 – 20', '[19]'], ['Lollipop', '5.0 – 5.1.1', 'November 12, 2014', 'April 21, 2015', '21 – 22', '[20]'], ['Marshmallow', '6.0 – 6.0.1', 'October 5, 2015', 'October 3, 2017', '23', '[21]'], ['Nouga

In [117]:
# Assemble the scraped rows into a table, labelling columns with the header
# strings extracted earlier.
df = pd.DataFrame(data=list_rows, columns=headers)

In [118]:
# Bare last expression: the notebook renders the DataFrame as the cell output.
df

Unnamed: 0,Name,Version number(s),Initial release date,Latest release,API level,References
0,No codename,1.0,"September 23, 2008",,1,[9]
1,Cupcake,1.5,"April 27, 2009",,3,
2,Donut,1.6,"September 15, 2009",,4,[12]
3,Eclair,2.0 – 2.1,"October 26, 2009","January 12, 2010",5 – 7,[13]
4,Froyo,2.2 – 2.2.3,"May 20, 2010","November 21, 2011",8,[14]
5,Gingerbread,2.3 – 2.3.7,"December 6, 2010","September 21, 2011",9 – 10,[15]
6,Honeycomb,3.0 – 3.2.6,"February 22, 2011","February 15, 2014",11 – 13,[16]
7,Ice Cream Sandwich,4.0 – 4.0.4,"October 18, 2011","June 6, 2012",14 – 15,[17]
8,Jelly Bean,4.1 – 4.3.1,"July 9, 2012","October 7, 2013",16 – 18,[18]
9,KitKat,4.4 – 4.4.4,"October 31, 2013","July 7, 2014",19 – 20,[19]
