# Searching and extracting: `find_all()`

In [8]:
# import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Get HTML into Python

In [9]:
# assign HTML url to varible: url
# url ="https://webscraper.io/test-sites/e-commerce/allinone/phones"
url ="https://webscraper.io/test-sites/e-commerce/allinone/computers/tablets"

# get the HTML from url and assign to variable: page
page = requests.get(url)
page # The HTTP 200 OK success status response code = request has succeeded.

<Response [200]>

In [23]:
# grab the HTML as text(string)
# parses it into Python and assign to variable: soup
soup = BeautifulSoup(page.text,"lxml")
soup # retruns the HTML 

<!DOCTYPE html>
<html lang="en">
<head>
<!-- Google Tag Manager -->
<script nonce="sMJVGJZ4dj1wPqTBiqFtpRogHz4aZL1g">(function (w, d, s, l, i) {
		w[l] = w[l] || [];
		w[l].push({
			'gtm.start':
				new Date().getTime(), event: 'gtm.js'
		});
		var f = d.getElementsByTagName(s)[0],
			j = d.createElement(s), dl = l != 'dataLayer' ? '&l=' + l : '';
		j.async = true;
		j.src =
			'https://www.googletagmanager.com/gtm.js?id=' + i + dl;
		f.parentNode.insertBefore(j, f);
	})(window, document, 'script', 'dataLayer', 'GTM-NVFPDWB');</script>
<!-- End Google Tag Manager -->
<title>Allinone | Web Scraper Test Sites</title>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="web scraping,Web Scraper,Chrome extension,Crawling,Cross platform scraper" name="keywords"/>
<meta content="Test Web Scraper's features and performance on mock e-commerce sites. Extract product data, prices, and categories in a controlled environment." name="description"/>

## How do we return all prices, not just the first one?
Even though the class and the tag are the same, you can use `find_all()` to return all of of the prices for `h4` with a class of `pull-right price`.

In [25]:
# returns all of the prices in a Python list
type(soup.find_all('h4', {'class':'pull-right price'}))

bs4.element.ResultSet

In [33]:
# again, this returns the same outout as above ⬆️
soup.find_all('h4', class_ = 'pull-right price')

[]

In [31]:
# return the last price using list indexing
soup.find_all('h4', class_ = 'pull-right price')[-1]

IndexError: list index out of range

In [30]:
# return the second price using list indexing
soup.find_all('h4', class_ = 'pull-right price')[1:3]

[]

In [36]:
# use the list to return any price as a string assigned to: title_list
title_list = soup.find_all('h4', class_ = 'pull-right price')
title_list[1].string

IndexError: list index out of range

In [19]:
# use the list to return any price as a string assigned to: title_list
new_list = title_list[-3:-1]
new_list

[<h4 class="pull-right price">$537.99</h4>,
 <h4 class="pull-right price">$587.99</h4>]

## Return a Python list of prices

In [20]:
out = []
# for loop list
for elem in title_list:
    out.append(elem.text)
out

['$69.99',
 '$88.99',
 '$96.99',
 '$97.99',
 '$99.99',
 '$101.99',
 '$102.99',
 '$103.99',
 '$107.99',
 '$121.99',
 '$130.99',
 '$148.99',
 '$172.99',
 '$233.99',
 '$251.99',
 '$320.99',
 '$399.99',
 '$489.99',
 '$537.99',
 '$587.99',
 '$603.99']

In [22]:
out[2:4]

['$96.99', '$97.99']

In [23]:
out[-3:-1]

['$537.99', '$587.99']

## Multiple tags and Bools

In [24]:
# use a list of multiple tags 
soup.find_all(['a','p','h4'])

[<a data-target=".side-collapse" data-target-2=".side-collapse-container" data-toggle="collapse-side">
 <button aria-controls="navbar" aria-expanded="false" class="navbar-toggle pull-right collapsed" data-target="#navbar" data-target-2=".side-collapse-container" data-target-3=".side-collapse" data-toggle="collapse" type="button">
 <span class="sr-only">Toggle navigation</span>
 <span class="icon-bar top-bar"></span>
 <span class="icon-bar middle-bar"></span>
 <span class="icon-bar bottom-bar"></span>
 </button>
 </a>,
 <a href="/"><img alt="Web Scraper" src="/img/logo_white.svg"/></a>,
 <a href="#page-top"></a>,
 <a class="menuitm" href="/">
 <p>Web Scraper</p>
 <div class="crta"></div>
 </a>,
 <p>Web Scraper</p>,
 <a class="menuitm" href="/cloud-scraper">
 <p>Cloud Scraper</p>
 <div class="crta"></div>
 </a>,
 <p>Cloud Scraper</p>,
 <a class="menuitm" href="/pricing">
 <p>Pricing</p>
 <div class="crta"></div>
 </a>,
 <p>Pricing</p>,
 <a class="menuitm dropdown-toggle" data-toggle="dro

In [25]:
# use comparison operator and bools to return all id tags
soup.find_all(id = True)

[<nav class="navbar-collapse collapse" id="navbar" role="navigation">
 <ul class="nav navbar-nav navbar-right">
 <li class="hidden">
 <a href="#page-top"></a>
 </li>
 <li>
 <a class="menuitm" href="/">
 <p>Web Scraper</p>
 <div class="crta"></div>
 </a>
 </li>
 <li>
 <a class="menuitm" href="/cloud-scraper">
 <p>Cloud Scraper</p>
 <div class="crta"></div>
 </a>
 </li>
 <li>
 <a class="menuitm" href="/pricing">
 <p>Pricing</p>
 <div class="crta"></div>
 </a>
 </li>
 <li class="dropdown">
 <a class="menuitm dropdown-toggle" data-toggle="dropdown" href="#section3">
 <p>Learn</p>
 <div class="crta"></div>
 </a>
 <ul class="dropdown-menu">
 <li>
 <a href="/documentation">Documentation</a>
 </li>
 <li>
 <a href="/tutorials">Video Tutorials</a>
 </li>
 <li>
 <a href="/how-to-videos">How to</a>
 </li>
 <li>
 <a href="/test-sites">Test Sites</a>
 </li>
 <li>
 <a href="https://forum.webscraper.io/" rel="noopener" target="_blank">Forum</a>
 </li>
 </ul>
 </li>
 <li>
 <a class="btn-menu1 install-e

In [26]:
soup.find_all(string = 'Galaxy Note')

['Galaxy Note', 'Galaxy Note']

## Complie regular expression patterns with `re.complie()`

In [27]:
import re
soup.find_all(string = re.compile('Galaxy'))

['Galaxy Tab 3',
 'Galaxy Tab 3',
 'Galaxy Tab 4',
 'Galaxy Tab',
 'Galaxy Note',
 'Galaxy Note',
 'Galaxy Note 10.1']

In [28]:
 # for example, this returns only two items
soup.find_all(string = 'Galaxy Tab 3')

['Galaxy Tab 3', 'Galaxy Tab 3']

In [29]:
# compiling multiple regular expression pattens
soup.find_all(string = ['Galaxy Tab 3', 'Acer Iconia'])

['Acer Iconia', 'Galaxy Tab 3', 'Galaxy Tab 3']

In [30]:
# compiling regular expression pattens
soup.find_all(string = re.compile('Ide'))

['Lenovo IdeaTab',
 'IdeaTab A3500L',
 'IdeaTab A8-50',
 'IdeaTab A3500-H',
 'IdeaTab S5000']

## Use `re.compile()` to return `h4`s with `class="pull-right price"`
As you can see, this is very useful. Intead of writing multiple lines, you can use `re.compile()` to keep your code DRY.

In [None]:
# instead of soup.find_all('h4', class_ = 'pull-right price')
soup.find_all(class_ = re.compile('price'))

In [None]:
# you can also be explicit, to filter more precisely
soup.find_all('h4', class_ = re.compile('price'))

In [None]:
# you can also be even more explicit
# why does it also return the main div? deFAult is why
soup.find_all(class_ = re.compile('fa'))

In [None]:
# you can also be explicit, to filter more precisely
soup.find_all('span', class_ = re.compile('fa'))

In [None]:
# you can also be explicit, to filter more precisely
soup.find_all('p', class_ = re.compile('pull'))

## Limit your results

In [None]:
# you can also be explicit, to filter more precisely
soup.find_all('p', class_ = re.compile('pull'))