# Data Scrapping via Python

This notebook is taken from:
https://medium.com/analytics-vidhya/how-to-scrape-data-from-a-website-using-python-for-beginner-5c770a1fbe2d
    

In [4]:
!pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable


In [5]:
from bs4 import BeautifulSoup
import requests

In [6]:
# Define URL
url = 'https://webscraper.io/test-sites/e-commerce/allinone/computers/laptops'
# Ask hosting server to fetch url
requests.get(url)

<Response [200]>

In [7]:
pages = requests.get(url)
pages.text

'<!DOCTYPE html>\n<html lang="en">\n<head>\n\n\t\t\t<!-- Anti-flicker snippet (recommended)  -->\n<style>.async-hide {\n\t\topacity: 0 !important\n\t} </style>\n<script>(function (a, s, y, n, c, h, i, d, e) {\n\t\ts.className += \' \' + y;\n\t\th.start = 1 * new Date;\n\t\th.end = i = function () {\n\t\t\ts.className = s.className.replace(RegExp(\' ?\' + y), \'\')\n\t\t};\n\t\t(a[n] = a[n] || []).hide = h;\n\t\tsetTimeout(function () {\n\t\t\ti();\n\t\t\th.end = null\n\t\t}, c);\n\t\th.timeout = c;\n\t})(window, document.documentElement, \'async-hide\', \'dataLayer\', 4000,\n\t\t{\'GTM-NVFPDWB\': true});</script>\n\t\n\t<!-- Google Tag Manager -->\n<script>(function (w, d, s, l, i) {\n\t\tw[l] = w[l] || [];\n\t\tw[l].push({\n\t\t\t\'gtm.start\':\n\t\t\t\tnew Date().getTime(), event: \'gtm.js\'\n\t\t});\n\t\tvar f = d.getElementsByTagName(s)[0],\n\t\t\tj = d.createElement(s), dl = l != \'dataLayer\' ? \'&l=\' + l : \'\';\n\t\tj.async = true;\n\t\tj.src =\n\t\t\t\'https://www.googletagma

In [8]:
# parser-lxml = Change html to Python friendly format
soup = BeautifulSoup(pages.text, 'lxml')
soup

<!DOCTYPE html>
<html lang="en">
<head>
<!-- Anti-flicker snippet (recommended)  -->
<style>.async-hide {
		opacity: 0 !important
	} </style>
<script>(function (a, s, y, n, c, h, i, d, e) {
		s.className += ' ' + y;
		h.start = 1 * new Date;
		h.end = i = function () {
			s.className = s.className.replace(RegExp(' ?' + y), '')
		};
		(a[n] = a[n] || []).hide = h;
		setTimeout(function () {
			i();
			h.end = null
		}, c);
		h.timeout = c;
	})(window, document.documentElement, 'async-hide', 'dataLayer', 4000,
		{'GTM-NVFPDWB': true});</script>
<!-- Google Tag Manager -->
<script>(function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script', 'dataLayer', 'GTM-NVFPDWB')

In [9]:
# Access h1 tag
soup.h1

<h1>Test Sites</h1>

In [10]:
# Access header tag
soup.header
# Access div tag
soup.div

<div class="container">
<div class="navbar-header">
<a data-target=".side-collapse" data-target-2=".side-collapse-container" data-toggle="collapse-side">
<button aria-controls="navbar" aria-expanded="false" class="navbar-toggle pull-right collapsed" data-target="#navbar" data-target-2=".side-collapse-container" data-target-3=".side-collapse" data-toggle="collapse" type="button">
<span class="sr-only">Toggle navigation</span>
<span class="icon-bar top-bar"></span>
<span class="icon-bar middle-bar"></span>
<span class="icon-bar bottom-bar"></span>
</button>
</a>
<div class="navbar-brand">
<a href="/"><img alt="Web Scraper" src="/img/logo_white.svg"/></a>
</div>
</div>
<div class="side-collapse in">
<nav class="navbar-collapse collapse" id="navbar" role="navigation">
<ul class="nav navbar-nav navbar-right">
<li class="hidden">
<a href="#page-top"></a>
</li>
<li>
<a class="menuitm" href="/">
<p>Web Scraper</p>
<div class="crta"></div>
</a>
</li>
<li>
<a class="menuitm" href="/cloud-scraper

In [11]:
# Access string from nested tags
soup.header.p

<p>Web Scraper</p>

In [12]:
# Access string from nested tags
soup.header.p
soup.header.p.string

'Web Scraper'

In [13]:
# Access ‘a’ tag in <header>
a_start = soup.header.a
a_start# Access only the attributes using attrs
a_start.attrs

{'data-toggle': 'collapse-side',
 'data-target': '.side-collapse',
 'data-target-2': '.side-collapse-container'}

In [14]:
# Searching specific attributes of tags
#h4 shows the price
soup.find('h4', class_= 'pull-right price')

<h4 class="pull-right price">$295.99</h4>

In [15]:
# Using find_all
soup.find_all('h4', class_= 'pull-right price')

[<h4 class="pull-right price">$295.99</h4>,
 <h4 class="pull-right price">$299.00</h4>,
 <h4 class="pull-right price">$299.00</h4>,
 <h4 class="pull-right price">$306.99</h4>,
 <h4 class="pull-right price">$321.94</h4>,
 <h4 class="pull-right price">$356.49</h4>,
 <h4 class="pull-right price">$364.46</h4>,
 <h4 class="pull-right price">$372.70</h4>,
 <h4 class="pull-right price">$379.94</h4>,
 <h4 class="pull-right price">$379.95</h4>,
 <h4 class="pull-right price">$391.48</h4>,
 <h4 class="pull-right price">$393.88</h4>,
 <h4 class="pull-right price">$399.00</h4>,
 <h4 class="pull-right price">$399.99</h4>,
 <h4 class="pull-right price">$404.23</h4>,
 <h4 class="pull-right price">$408.98</h4>,
 <h4 class="pull-right price">$409.63</h4>,
 <h4 class="pull-right price">$410.46</h4>,
 <h4 class="pull-right price">$410.66</h4>,
 <h4 class="pull-right price">$416.99</h4>,
 <h4 class="pull-right price">$433.30</h4>,
 <h4 class="pull-right price">$436.29</h4>,
 <h4 class="pull-right price">$4

In [16]:
#Slicing the results of find_all
soup.find_all('h4', class_= 'pull-right price')[2:5]

[<h4 class="pull-right price">$299.00</h4>,
 <h4 class="pull-right price">$306.99</h4>,
 <h4 class="pull-right price">$321.94</h4>]

In [17]:
# Using filter to find multiple tags
soup.find_all(['h4', 'a', 'p'])
soup.find_all(['header', 'div'])
soup.find_all(id = True) # class and id are special attribute so it can be written like this
soup.find_all(class_= True)

[<header class="navbar navbar-fixed-top navbar-static" role="banner">
 <div class="container">
 <div class="navbar-header">
 <a data-target=".side-collapse" data-target-2=".side-collapse-container" data-toggle="collapse-side">
 <button aria-controls="navbar" aria-expanded="false" class="navbar-toggle pull-right collapsed" data-target="#navbar" data-target-2=".side-collapse-container" data-target-3=".side-collapse" data-toggle="collapse" type="button">
 <span class="sr-only">Toggle navigation</span>
 <span class="icon-bar top-bar"></span>
 <span class="icon-bar middle-bar"></span>
 <span class="icon-bar bottom-bar"></span>
 </button>
 </a>
 <div class="navbar-brand">
 <a href="/"><img alt="Web Scraper" src="/img/logo_white.svg"/></a>
 </div>
 </div>
 <div class="side-collapse in">
 <nav class="navbar-collapse collapse" id="navbar" role="navigation">
 <ul class="nav navbar-nav navbar-right">
 <li class="hidden">
 <a href="#page-top"></a>
 </li>
 <li>
 <a class="menuitm" href="/">
 <p>Web

By using a filter we can collect the data that we want from the website, in this case, we want to collect name, price, reviews, and descriptions. So we need to define the variables first.

In [44]:
# Filter by name
name = soup.find_all('a', class_='title')
# Filter by price
price = soup.find_all('h4', class_ = 'pull-right price')
# Filter by reviews
reviews = soup.find_all('p', class_ = 'pull-right')
# Filter by description
description = soup.find_all('p', class_ ='description')

In [19]:
name

[<a class="title" href="/test-sites/e-commerce/allinone/product/545" title="Asus VivoBook X441NA-GA190">Asus VivoBook X4...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/546" title="Prestigio SmartBook 133S Dark Grey">Prestigio SmartB...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/547" title="Prestigio SmartBook 133S Gold">Prestigio SmartB...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/517" title="Aspire E1-510">Aspire E1-510</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/548" title="Lenovo V110-15IAP">Lenovo V110-15IA...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/549" title="Lenovo V110-15IAP">Lenovo V110-15IA...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/550" title="Hewlett Packard 250 G6 Dark Ash Silver">Hewlett Packard...</a>,
 <a class="title" href="/test-sites/e-commerce/allinone/product/551" title="Acer Aspire 3 A315-31 Black">Acer As

In [20]:
price

[<h4 class="pull-right price">$295.99</h4>,
 <h4 class="pull-right price">$299.00</h4>,
 <h4 class="pull-right price">$299.00</h4>,
 <h4 class="pull-right price">$306.99</h4>,
 <h4 class="pull-right price">$321.94</h4>,
 <h4 class="pull-right price">$356.49</h4>,
 <h4 class="pull-right price">$364.46</h4>,
 <h4 class="pull-right price">$372.70</h4>,
 <h4 class="pull-right price">$379.94</h4>,
 <h4 class="pull-right price">$379.95</h4>,
 <h4 class="pull-right price">$391.48</h4>,
 <h4 class="pull-right price">$393.88</h4>,
 <h4 class="pull-right price">$399.00</h4>,
 <h4 class="pull-right price">$399.99</h4>,
 <h4 class="pull-right price">$404.23</h4>,
 <h4 class="pull-right price">$408.98</h4>,
 <h4 class="pull-right price">$409.63</h4>,
 <h4 class="pull-right price">$410.46</h4>,
 <h4 class="pull-right price">$410.66</h4>,
 <h4 class="pull-right price">$416.99</h4>,
 <h4 class="pull-right price">$433.30</h4>,
 <h4 class="pull-right price">$436.29</h4>,
 <h4 class="pull-right price">$4

In [21]:
reviews

[<p class="pull-right">14 reviews</p>,
 <p class="pull-right">8 reviews</p>,
 <p class="pull-right">12 reviews</p>,
 <p class="pull-right">2 reviews</p>,
 <p class="pull-right">5 reviews</p>,
 <p class="pull-right">6 reviews</p>,
 <p class="pull-right">12 reviews</p>,
 <p class="pull-right">2 reviews</p>,
 <p class="pull-right">0 reviews</p>,
 <p class="pull-right">9 reviews</p>,
 <p class="pull-right">10 reviews</p>,
 <p class="pull-right">9 reviews</p>,
 <p class="pull-right">4 reviews</p>,
 <p class="pull-right">3 reviews</p>,
 <p class="pull-right">12 reviews</p>,
 <p class="pull-right">10 reviews</p>,
 <p class="pull-right">9 reviews</p>,
 <p class="pull-right">14 reviews</p>,
 <p class="pull-right">4 reviews</p>,
 <p class="pull-right">2 reviews</p>,
 <p class="pull-right">1 reviews</p>,
 <p class="pull-right">1 reviews</p>,
 <p class="pull-right">2 reviews</p>,
 <p class="pull-right">6 reviews</p>,
 <p class="pull-right">9 reviews</p>,
 <p class="pull-right">2 reviews</p>,
 <p c

In [22]:
description

[<p class="description">Asus VivoBook X441NA-GA190 Chocolate Black, 14", Celeron N3450, 4GB, 128GB SSD, Endless OS, ENG kbd</p>,
 <p class="description">Prestigio SmartBook 133S Dark Grey, 13.3" FHD IPS, Celeron N3350 1.1GHz, 4GB, 32GB, Windows 10 Pro + Office 365 1 gadam</p>,
 <p class="description">Prestigio SmartBook 133S Gold, 13.3" FHD IPS, Celeron N3350 1.1GHz, 4GB, 32GB, Windows 10 Pro + Office 365 1 gadam</p>,
 <p class="description">15.6", Pentium N3520 2.16GHz, 4GB, 500GB, Linux</p>,
 <p class="description">Lenovo V110-15IAP, 15.6" HD, Celeron N3350 1.1GHz, 4GB, 128GB SSD, Windows 10 Home</p>,
 <p class="description">Asus VivoBook 15 X540NA-GQ008T Chocolate Black, 15.6" HD, Pentium N4200, 4GB, 500GB, Windows 10 Home, En kbd</p>,
 <p class="description">Hewlett Packard 250 G6 Dark Ash Silver, 15.6" HD, Celeron N3060 1.6GHz, 4GB, 128GB SSD, DOS</p>,
 <p class="description">Acer Aspire 3 A315-31 Black, 15.6" HD, Celeron N3350 1.1GHz, 4GB, 128GB SSD, Windows 10 Home</p>,
 <p clas

 CLEANING OUTPUT RESULTS FROM FILTER

In [40]:
# Try to call price
price1 = soup.find('h4', class_ = 'pull-right price')
price1.text

'$295.99'

CREATE A FOR LOOP TO MAKE STRING INTO LIST

In [37]:
product_name_list = []
for i in name:
    name = i
    product_name_list.append(name)
    

In [43]:
price_list = []
for i in price:
    price = i
    price_list.append(price)


In [26]:
review_list = []
for i in reviews:
    rev = i
    review_list.append(rev)

In [27]:
description_list = []
for i in description:
    desc = i
    description_list.append(desc)

CREATE A DATAFRAME FROM LIST

In [28]:
import pandas as pd

In [29]:
a = {'Product Name':product_name_list,'Price': price_list,'Reviews':review_list,'Description':description_list}

In [30]:
df = pd.DataFrame.from_dict(a, orient='columns')

In [31]:
df

Unnamed: 0,Product Name,Price,Reviews,Description
0,Asus VivoBook X4...,[$295.99],[14 reviews],"[Asus VivoBook X441NA-GA190 Chocolate Black, 1..."
1,Prestigio SmartB...,[$299.00],[8 reviews],"[Prestigio SmartBook 133S Dark Grey, 13.3"" FHD..."
2,Prestigio SmartB...,[$299.00],[12 reviews],"[Prestigio SmartBook 133S Gold, 13.3"" FHD IPS,..."
3,Aspire E1-510,[$306.99],[2 reviews],"[15.6"", Pentium N3520 2.16GHz, 4GB, 500GB, Linux]"
4,Lenovo V110-15IA...,[$321.94],[5 reviews],"[Lenovo V110-15IAP, 15.6"" HD, Celeron N3350 1...."
...,...,...,...,...
112,Lenovo Legion Y7...,[$1399.00],[8 reviews],"[Lenovo Legion Y720, 15.6"" FHD IPS, Core i7-77..."
113,Asus ROG Strix G...,[$1399.00],[10 reviews],"[Asus ROG Strix GL702VM-GC146T, 17.3"" FHD, Cor..."
114,Asus ROG Strix G...,[$1769.00],[7 reviews],"[Asus ROG Strix GL702ZC-GC154T, 17.3"" FHD, Ryz..."
115,Asus ROG Strix G...,[$1769.00],[8 reviews],"[Asus ROG Strix GL702ZC-GC209T, 17.3"" FHD IPS,..."


In [32]:
#Saving csv file
df.to_csv('names.csv', index=True, encoding='utf-8')

In [None]:
#saving excell file
df.to_excel('names.xlsx', index= True, encoding='utf-8')

# more tutorial:


https://oxylabs.io/blog/python-web-scraping

web scrapper extension: 

https://www.webscraper.io/documentation?utm_source=extension&utm_medium=popup

https://www.webscraper.io/tutorials?utm_source=extension&utm_medium=popup