In [None]:
%%HTML
<style>
div.heading{
    padding: 0 10%;
    text-align:center;
    }

p.text{
    text-align:center;
    padding: 0 10%;

}
</style>

# <p class="text">Python for Automation - Lesson 8</p> 

<div class="heading">
    <ul style="list-style-type:none">
        <li><b>Lesson 8 Structure:</b></li>
        <li>Parsing HTML</li>
    </ul>
</div>

## <p class="text">What is HTML?</p>

<p class="text"><b>H</b>yper<b>T</b>ext <b>M</b>arkup <b>L</b>anguage or <b>HTML</b> is the standard markup language for documents designed to be displayed in a web browser. It defines the content and structure of web content. It is often assisted by technologies such as <b>C</b>ascading <b>S</b>tyle <b>S</b>heets (<b>CSS</b>) and scripting languages such as JavaScript.

Web browsers receive HTML documents from a web server or from local storage and render the documents into multimedia web pages. HTML describes the structure of a web page semantically and originally included cues for its appearance. HTML files by nature have a nested structure.</p> 

## <p class="text">Example HTML</p>

In [None]:
# Minimal HTML5 document used as the running example. It contains a <head> with a
# <title> as well as a <body>, so every element described in the explanation cell
# can actually be found in the example.
example_html = """<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>

<h1>My First Heading</h1>

<p>My first paragraph.</p>

</body>
</html>"""
print(example_html)

In [None]:
# Plain-text walkthrough of the elements used in the example document.
# It is printed (not rendered as markdown) so the tag names show up literally.
explanation = """Example Explained
The <!DOCTYPE html> declaration defines that this document is an HTML5 document

The <html> element is the root element of an HTML page

The <head> element contains meta information about the HTML page

The <title> element specifies a title for the HTML page (which is shown in the browser's title bar or in the page's tab)

The <body> element defines the document's body, and is a container for all the visible contents, such as headings, paragraphs, images, hyperlinks, tables, lists, etc.

The <h1> element defines a large heading

The <p> element defines a paragraph"""
print(explanation)

<p class="text"><b>There are numerous uses for parsing HTML files - scraping sites (getting their content and organizing it in a useful fashion), analyzing data retrieved from websites without a REST interface and automating QA tests related to webpage content.</b></p> 

## <p class="text">Parsing HTML</p>

<p class="text">There are numerous libraries that can be used to parse HTML content. I'm going to show you two of the most prevalent ones - <code>Beautiful Soup</code> and <code>Selenium</code>.</p> 

## <p class="text">Beautiful Soup</p>

<p class="text">Beautiful Soup is a Python package for parsing HTML and XML documents, including those with malformed markup. It creates a parse tree for documents that can be used to extract data from HTML, which is useful for web scraping.</p> 

### <p class="text">Installing Beautiful Soup: <code>python -m pip install beautifulsoup4</code></p>

In [None]:
# Sample use

# We import the BeautifulSoup class from bs4
from bs4 import BeautifulSoup
import requests

# Download the listings page. The timeout keeps this cell from hanging forever on an
# unresponsive server, and raise_for_status() stops us from parsing an error page
# as if it were the listings.
response = requests.get("https://www.mobile.bg/obiavi/avtomobili-dzhipove/honda/accord", timeout=30)
response.raise_for_status()
# Decode the body as windows-1251 (a Cyrillic code page) rather than whatever
# encoding requests would guess; the site's text needs this to be readable
response.encoding = 'windows-1251'
htmltext = response.text

In [None]:
# Show the first 1000 characters of the received HTML (the full page is far too long to read)
htmltext[:1000]

<p class="text">When we have a site in mind, the proper way to scrape it is to familiarize ourselves with the part of the HTML code that we want to review and then extract only the needed parts with Beautiful Soup.</p> 

In [None]:
# In our example, we want to take the title, price and URL of every car listed on the page

# This is how we pass the HTML to BeautifulSoup and create an instance that we can manipulate.
# The parser is named explicitly: otherwise bs4 picks whichever parser happens to be
# installed (and warns about it), so the same notebook could behave differently on
# another machine.
soup = BeautifulSoup(htmltext, 'html.parser')  # We can also pass a string read from a .html file

In [None]:
# Here we search for every <span> tag with the class 'price' to get all prices.
# find_all returns all matching tags; the keyword is spelled class_ because
# `class` is a reserved word in Python.
prices = soup.find_all('span', class_='price')
print(prices[0])  # First match, printed as a tag (markup included), not just its text

In [None]:
# We are only interested in the actual price, not the whole HTML structure.
# The tags are looked up again here instead of overwriting the tag list in place,
# so running this cell a second time does not fail on already-converted values.
price_tags = soup.find_all('span', class_='price')

prices = []
for price_tag in price_tags:
    # Cut the last 4 characters and remove the spaces inside the number.
    # NOTE(review): assumes every price ends in a 4-character currency suffix - confirm on the live page
    amount = price_tag.text[:-4].replace(' ', '')
    # A listing without a numeric price would make int() raise ValueError and abort
    # the whole cell, so such values are kept as text
    prices.append(int(amount) if amount.isdigit() else amount)
print(prices[0])

In [None]:
# We also need each listing's title. Inspecting the page shows it sits in the
# anchor tag (<a>) with class 'mmmL'.
# Luckily the URL of the offer is carried by the same tag (its href attribute),
# so one lookup gives us both.
listing_names_raw = soup.find_all('a', class_='mmmL')
print(listing_names_raw[0])  # First matching anchor, printed with its markup

In [None]:
# Keep only the visible title text of every listing anchor
listing_names = [anchor.get_text() for anchor in listing_names_raw]
print(listing_names[0])

In [None]:
# Get listing url
# The hrefs appear to be protocol-relative ('//host/path'); remove that leading '//'
# so only 'host/path' is kept. The prefix is checked first, because always cutting
# two characters would mangle an href that arrives in any other form.
listing_url = [
    href[2:] if href.startswith('//') else href
    for href in (listing['href'] for listing in listing_names_raw)
]
print(listing_url[0])

In [None]:
# Pair the three lists by position, giving one [title, price, url] list per listing
title_price_url = list(map(list, zip(listing_names, prices, listing_url)))

In [None]:
# Preview the first ten combined [title, price, url] entries
title_price_url[:10]

In [None]:
# Or we can for example create a dataframe for more appealing inspection
import pandas as pd

# Set pandas options so it does not truncate rows, columns or long cell values.
# Full option names are used: abbreviated names only work for as long as they
# match exactly one option, so they can break when pandas adds new options.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', True)

# One row per listing, built from the [title, price, url] entries
offers_df = pd.DataFrame(title_price_url, columns=['Car Title', 'Pricing', 'Listing URL'])

In [None]:
# Display the offers dataframe
offers_df

## <p class="text">Selenium</p>

<p class="text">Selenium, or Selenium WebDriver as it is officially called, is a module that we can use to simulate human interaction with a website. Below we are going to extend the above example by first going over all pages, not just the first one, and then adding pictures and car data to the dataframe. Selenium is often used in tandem with Beautiful Soup - we use Selenium for interaction with the website and Beautiful Soup for scraping the content.</p> 

### <p class="text">Installing Selenium: <code>python -m pip install selenium</code></p>

In [None]:
# Sample use: open python.org in Firefox, type a query into its search box and
# check that the search returned something
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException

# Initialize a Firefox driver (this starts a browser that the code controls)
driver = webdriver.Firefox()

# Go to a specific site
driver.get("http://www.python.org")

# Check that we landed on the expected page: its title must contain "Python"
assert "Python" in driver.title

# Find the query bar - the element whose name attribute is "q"
elem = driver.find_element(By.NAME, "q")
time.sleep(1)  # One-second pause; presumably so each step can be followed in the browser window

# Empty the field first so the search term is not appended to existing text
elem.clear()
# Write the search term pycon
elem.send_keys("pycon")
time.sleep(1)

# Press go (send the RETURN key to the search field)
elem.send_keys(Keys.RETURN)
time.sleep(1)

# Verify that there are displayed results: the "no results" message must be absent from the page
assert "No results found." not in driver.page_source
# driver.close() is deliberately not called here - the browser is left open so we can close it ourselves

In [None]:
def scrape_cars(soup: "BeautifulSoup") -> list:
    """
    Scrape car entries from a listings page

    The annotation is quoted so that defining this function does not depend on
    the BeautifulSoup import having been run first.

    :param soup: Parsed HTML of one listings page
    :return: List of [title, price, url] lists, one per listing. The price is the
             text of the price tag without its last four characters and without
             spaces (still a string); the url is the listing's href with a
             leading '//' removed.
    """
    price_tags = soup.find_all('span', class_='price')
    listing_tags = soup.find_all('a', class_='mmmL')

    # Prices and listings are collected separately and paired purely by position,
    # so a listing without a price tag (or an extra price tag elsewhere on the
    # page) would shift every pair after it. zip() would hide that, so say it.
    if len(price_tags) != len(listing_tags):
        import warnings
        warnings.warn(
            f"Found {len(price_tags)} prices but {len(listing_tags)} listings; "
            "entries are paired by position and may be misaligned",
            stacklevel=2,
        )

    entries = []
    for listing, price in zip(listing_tags, price_tags):
        href = listing['href']
        # Remove the protocol-relative '//' prefix only when it is really there;
        # blindly cutting two characters would corrupt any other form of href
        url = href[2:] if href.startswith('//') else href
        entries.append([listing.text, price.text[:-4].replace(' ', ''), url])

    return entries

In [None]:
# Scrape the car site so we can create our dataframe
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

PAGE_LOAD_TIMEOUT = 10  # Seconds to wait for the next results page before carrying on regardless

options = FirefoxOptions()
# options.add_argument("--headless")  # Uncomment so no visible browser is opened
driver = webdriver.Firefox(options=options)

# Create a list for all cars
all_cars = []

# Everything that drives the browser sits inside try/finally, so the browser is
# shut down even when a step fails half-way through the scrape
try:
    # Go to page
    driver.get("https://www.mobile.bg/obiavi/avtomobili-dzhipove/honda/accord")

    # Generate a BeautifulSoup instance from the current page
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The total page count is the last space-separated token of the pager label.
    # NOTE(review): a page without a pager is treated as a single page of results - confirm
    pager = soup.find('span', class_='pageNumbersInfo')
    total_pages = int(pager.b.text.rsplit(' ', 1)[-1]) if pager is not None else 1

    # Iterate all pages and pull the name, pricing and url of each offer
    for page in range(1, total_pages + 1):
        print(f"Scraping page {page}")
        all_cars.extend(scrape_cars(soup))
        print(f"Cars scraped: {len(all_cars)}")

        if page == total_pages:
            break  # Last page: nothing left to click or parse

        # Prefer the numbered link of the next page; when that link is not on the
        # page, fall back to the 'Напред' link
        try:
            next_button = driver.find_element(By.XPATH, f"//a[text()='{page + 1}']")
        except NoSuchElementException:
            next_button = driver.find_element(By.XPATH, "//a[text()='Напред']")
        next_button.send_keys(Keys.RETURN)

        # A fixed sleep can read the page source before the browser has switched
        # pages, which would scrape the same listings twice. Instead wait until the
        # link we pressed has gone stale (the old page was replaced) and the pager
        # of the new page exists.
        wait = WebDriverWait(driver, PAGE_LOAD_TIMEOUT)
        try:
            wait.until(EC.staleness_of(next_button))
            wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'pageNumbersInfo')))
        except TimeoutException:
            # Best effort, as before: parse whatever the browser currently shows
            print(f"Page {page + 1} did not finish loading within {PAGE_LOAD_TIMEOUT} seconds; parsing it as displayed")

        soup = BeautifulSoup(driver.page_source, 'html.parser')
finally:
    # quit() ends the whole browser session; close() would only close the current window
    driver.quit()

In [None]:
# Turn every [title, price, url] entry into a dict; a price made up only of digits
# becomes an int, anything else stays the string it was
all_cars_list = [
    dict(title=name, price=(int(price) if price.isdigit() else price), url=link)
    for name, price, link in all_cars
]

In [None]:
# Preview the first ten car dicts
all_cars_list[:10]

In [None]:
# Look at a single entry (index 1) on its own
all_cars_list[1]

In [None]:
# Iterate over all cars and add supplementary parameters from offer
# One Session is shared by all requests so the connection to the site can be reused
with requests.Session() as session:
    for car in all_cars_list:
        print(f"Processing offer: {car['title']}")

        try:
            response = session.get(f"https://{car['url']}", timeout=30)
            response.raise_for_status()
        except requests.RequestException as error:
            # One removed or unreachable offer should not abort the whole run
            print(f"  Skipped, the offer could not be downloaded: {error}")
            continue

        response.encoding = 'windows-1251'  # Same encoding override as used for the listings page
        # Separate names are used here (not `soup` / `htmltext`) so the variables that
        # the earlier cells work with are left untouched
        offer_soup = BeautifulSoup(response.text, 'html.parser')

        details = offer_soup.find('ul', class_='dilarData')
        if details is None:
            print("  Skipped, the offer page has no details list")
            continue

        # The children of the list alternate between a parameter name and its value;
        # whitespace-only text nodes between them are ignored
        cells = [child.text for child in details if str(child).strip()]
        # Pairing with zip drops a trailing item that has no partner instead of raising IndexError
        for key, value in zip(cells[::2], cells[1::2]):
            car[key] = value

In [None]:
# Look at the same entry (index 1) again, now that the loop over the offers has added its parameters
all_cars_list[1]

In [None]:
import pandas as pd

# Show frames without truncating rows, columns or long cell values.
# Full option names are used: abbreviated names only work for as long as they
# match exactly one option, so they can break when pandas adds new options.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_seq_items', None)
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', True)

# One row per car dict; the three base columns get Bulgarian headers, all other
# columns keep the parameter names they were scraped with
df = pd.DataFrame(all_cars_list).rename(
    columns={'title': 'Заглавие', 'price': 'Цена', 'url': 'Линк към обява'}
)

In [None]:
# Show the first 20 rows of the enriched dataframe
df.head(20)

# <p class="text">Thank you for your time!</p>