In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import time
from lxml import html

#### 4 tables - Beautiful Soup vs LXML
1. total time
2. process time
3. memory usage
4. data usage

## Beautiful Soup

In [5]:
def getHTMLdocument(url):
      
        response = requests.get(url)      
        return response.text

rownum = 0
book_table = pd.DataFrame(columns=["Sr. No.", "Name", "Link", "Cost", "Availability"])
total_time_bs = []
process_time_bs = []
over_all_pages_start_bs = time.time()

for j in range(1, 51):
        start_time_bs = time.process_time()
        start_time_total_bs = time.time()

        url_to_scrape = "http://books.toscrape.com/catalogue/page-" + str(j) + ".html"  
        html_document = getHTMLdocument(url_to_scrape)  
        soup = BeautifulSoup(html_document, 'html.parser')


        for row in soup.find_all('li', attrs={'class': re.compile("col-xs-6")}):

                rownum += 1
                name = row.find('h3').find('a').get('title')
                link = "http://books.toscrape.com/catalogue/" + row.find('h3').find('a').get('href')
                cost = row.find('p', attrs = {'class': re.compile("price_color")}).text.strip()
                cost = cost[2:]
                availability = row.find('p', attrs = {'class': re.compile("instock availability")}).text.strip()        
                book_table.loc[len(book_table.index)] = [rownum, name, link, cost, availability ] 

        stop_time_bs = time.process_time()
        stop_time_total_bs = time.time()

        tt = stop_time_total_bs - start_time_total_bs
        pt = stop_time_bs - start_time_bs

        total_time_bs.append(tt)
        process_time_bs.append(pt)

over_all_pages_stop_bs = time.time()

print(f"Avg Total Time Taken - beautifulsoup: {sum(total_time_bs)/len(total_time_bs)} sec")
print(f"Avg Process Time Taken - beautifulsoup: {sum(process_time_bs)/len(process_time_bs)} sec")
print(f"Total time taken to iterate through 50 pages - beautifulsoup: {over_all_pages_stop_bs-over_all_pages_start_bs} sec")

book_table.to_excel('py_request_bs.xlsx',index=False)

Avg Total Time Taken - beautifulsoup: 0.8521325588226318 sec
Avg Process Time Taken - beautifulsoup: 0.2725 sec
Total time taken to iterate through 50 pages - beautifulsoup: 42.60862112045288 sec


## LXML

In [6]:
''' LXML '''

def getHTMLdocument(url):      
        response = requests.get(url)      
        return response.content

rownum = 0
book_table_lxml = pd.DataFrame(columns=["Sr. No.", "Name", "Link", "Cost", "Availability"])
total_time_lxml = []
process_time_lxml = []
over_all_pages_start_lxml = time.time()

for j in range(1,51):
        start_time_lxml = time.process_time()
        start_time_total_lxml = time.time()

        url = "http://books.toscrape.com/catalogue/page-" + str(1) + ".html"
        byte_data = getHTMLdocument(url)
        source_code = html.fromstring(byte_data)
        
        for i in range(1,21):

                rownum += 1
                name_path = '//*[@id="default"]/div/div/div/div/section/div[2]/ol/li[' + str(i) + ']/article/h3/a/@title' 
                book_link_path = '//*[@id="default"]/div/div/div/div/section/div[2]/ol/li[' + str(i) + ']/article/h3/a/@href'
                cost_path = '//*[@id="default"]/div/div/div/div/section/div[2]/ol/li[' + str(i) + ']/article/div[2]/p[1]' 
                stock_path = '//*[@id="default"]/div/div/div/div/section/div[2]/ol/li[' + str(i) + ']/article/div[2]/p[2]' 

                name = source_code.xpath(name_path)[0]
                book_link = "http://books.toscrape.com/catalogue/" + source_code.xpath(book_link_path)[0]
                cost = source_code.xpath(cost_path)[0].text_content()
                stock = source_code.xpath(stock_path)[0].text_content().strip()

                book_table_lxml.loc[len(book_table_lxml.index)] = [rownum, name, book_link, cost, stock ]

        stop_time_lxml = time.process_time()
        stop_time_total_lxml = time.time()

        tt = stop_time_total_lxml - start_time_total_lxml
        pt = stop_time_lxml - start_time_lxml

        total_time_lxml.append(tt)
        process_time_lxml.append(pt)

over_all_pages_stop_lxml = time.time()

print(f"Avg Total Time Taken - lxml: {sum(total_time_lxml)/len(total_time_lxml)} sec")
print(f"Avg Process Time Taken - lxml: {sum(process_time_lxml)/len(process_time_lxml)} sec")
print(f"Total time taken to iterate through 50 pages - lxml: {over_all_pages_stop_lxml-over_all_pages_start_lxml} sec")

book_table_lxml.to_excel('py_request_lxml.xlsx', index = False)           

Avg Total Time Taken - lxml: 0.8434636974334717 sec
Avg Process Time Taken - lxml: 0.26875 sec
Total time taken to iterate through 50 pages - lxml: 42.173184871673584 sec


## Exporting in EXCEL

In [17]:
total_time = pd.DataFrame(columns=['Sr. No.', 'bs4', 'LXML'])
process_time = pd.DataFrame(columns = ['Sr. No.', 'bs4', 'LXML'])

for i in range(0,50):
    num = i + 1
    total_time.loc[len(total_time.index)] = [int(num), total_time_bs[i], total_time_lxml[i]]
    process_time.loc[len(process_time.index)] = [int(num), process_time_bs[i], process_time_lxml[i]]

total_time.to_csv('Over_50_url_total_time.csv', index = False)
process_time.to_csv('Over_50_url_process_time.csv', index = False)

