# Web Scraping with lazy load

Site to be scraped: El Espectador (https://www.elespectador.com)

In [None]:
# Importing required libraries

import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

from pymongo import MongoClient

In [None]:
# Open a client against the MongoDB server on localhost:27017 and select
# the database and collection that will receive the scraped articles
client = MongoClient(host='localhost', port=27017)
db = client['news']
collection = db['elespectador']

In [None]:
# Base URL of the site to be analyzed; make_request() prepends it to every
# relative path it receives, so it carries no trailing slash
SITE_URL = 'https://www.elespectador.com'

In [None]:
# Path to the Chrome web driver binary handed to webdriver.Chrome below.
# It is relative, so presumably resolved against the directory the notebook
# is run from -- confirm the binary is located there.
DRIVER_PATH = './chromedriver'

In [None]:
# Launch a Chrome window driven through the binary at DRIVER_PATH
browser = webdriver.Chrome(executable_path=DRIVER_PATH)

In [None]:
def make_request(browser, relative_path, scroll_pause=3, max_scrolls=None):
    """Load a page of the site and return its rendered HTML as soup.

    The page is scrolled to the bottom repeatedly so that lazily loaded
    content gets a chance to render before the HTML is captured.

    Args:
        browser: Selenium web driver used to load and scroll the page.
        relative_path: Path appended to ``SITE_URL`` to build the full URL.
        scroll_pause: Seconds to wait after each scroll so that newly
            requested content can load. Defaults to 3.
        max_scrolls: Upper bound on the number of scrolls, or ``None``
            (the default) to keep scrolling until the page stops growing.
            Useful for pages whose height never stabilizes.

    Returns:
        BeautifulSoup: The page source parsed with ``html.parser``.
    """
    # Making the request and rendering the browser
    browser.get(SITE_URL + relative_path)

    # Simulating vertical scrolling for handling lazy load
    last_height = browser.execute_script('return document.body.scrollHeight;')
    scrolls = 0
    while max_scrolls is None or scrolls < max_scrolls:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(scroll_pause)
        scrolls += 1
        new_height = browser.execute_script('return document.body.scrollHeight;')
        # An unchanged height means the last scroll loaded nothing new
        if new_height == last_height:
            break
        last_height = new_height

    # Getting HTML content and passing it to BeautifulSoup for scraping analysis
    return BeautifulSoup(browser.page_source, 'html.parser')

In [None]:
# Getting HTML content for the first ten pages of the politics archive.
# Every parsed page is kept in `soups`: assigning each result to the same
# `soup` name would discard all but the last download.
listing_paths = ['/archivo/politica/'] + [
    '/archivo/politica/{}'.format(page_number) for page_number in range(2, 11)
]
soups = [make_request(browser, path) for path in listing_paths]

# `soup` refers to the last page fetched, which is what the following
# cells operate on
soup = soups[-1]

In [None]:
# Locate the element that wraps the news listing of the parsed page
layout = soup.find(class_='Layout-flexAds')

In [None]:
# Collect the block containers found anywhere inside the layout
# (find_all searches all descendants by default) and report how many
blocks = layout.find_all(class_='Container Block')
print(len(blocks))

In [None]:
# Finding and concatenating the news cards of the first two blocks.
# Slicing (rather than indexing blocks[0] and blocks[1]) avoids an
# IndexError when the page yields fewer than two blocks.
cards = [
    card
    for block in blocks[:2]
    for card in block.find_all(class_='Card_rowCardLeft')
]
len(cards)

In [None]:
# Build one record per card holding the headline text and the relative
# path of the article it links to

news = []

for card in cards:
    # The headline anchor carries both the title text and the link target
    headline_link = card.find('h2', class_='Card-Title').find('a')
    title = headline_link.get_text()
    relative_path = headline_link['href']
    news.append({'title': title, 'relative_path': relative_path})

In [None]:
# Display the records collected so far (title and relative path only)
news

In [None]:
def _text_or_none(element, child_tag=None):
    """Return the text of *element*, or of its first *child_tag* descendant.

    Returns None instead of raising when the element, or the requested
    descendant, is missing from the page.
    """
    if element is not None and child_tag is not None:
        element = element.find(child_tag)
    return element.get_text() if element is not None else None


for n in news:

    # Getting HTML content for each news page
    soup = make_request(browser, n['relative_path'])

    # Extracting news metadata. A field whose element is absent from the
    # article page is stored as None so that one differently structured
    # article does not abort the whole run.
    n['datetime'] = _text_or_none(soup.find(class_='ArticleHeader-Date'))
    n['author'] = _text_or_none(soup.find(class_='ACredit-Author'), 'a')
    n['summary'] = _text_or_none(soup.find(class_='ArticleHeader-Hook'), 'div')

    # Extracting and concatenating news full text
    paragraphs = soup.find_all(class_='font--secondary')
    n['full_text'] = ' '.join(p.get_text() for p in paragraphs)

In [None]:
# Display the records again, now including the fields added by the loop above
news

In [None]:
# Storing extracted information for further analysis. insert_many rejects
# an empty document list, so the call is skipped when nothing was scraped.
if news:
    collection.insert_many(news)
else:
    print('No news items were scraped; nothing to store.')

In [None]:
# Number of times the pagination control is clicked
pg_amount = 5

# Parse the page currently loaded in the browser, before any paging.
# The page source and the element lookups belong to the web driver
# instance (`browser`); DRIVER_PATH is only the path string of the binary.
page = BeautifulSoup(browser.page_source, 'html.parser')

for i in range(pg_amount):

    next_page = browser.find_element_by_css_selector('.Pagination-Nav')
    next_page.click()
    # Give the next page time to load before looking up the control again
    time.sleep(2)

# nextpage = browser.find_element_by_xpath('//*[@id="main-layout"]/div[1]/div/section[2]/div[2]/div/div[3]/a')
# nextpage.click()