# Web Scraping with lazy load

Page to be scraped: El Espectador

# Parte 1

In [1]:
# Importing required libraries

import time

from bs4 import BeautifulSoup

# Solved deprecation warning: https://exerror.com/deprecationwarning-executable_path-has-been-deprecated-please-pass-in-a-service-object/
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
#from selenium.webdriver.chrome.service import Service

from pymongo import MongoClient

In [2]:
# Creating a connection to MongoDB
client = MongoClient('localhost', 27017)
db = client['news']
collection = db['elespectador']

In [3]:
# Base URL of the site to be analyzed
SITE_URL = 'https://www.elespectador.com'

In [4]:
# Download the driver for you S.O. here: https://github.com/mozilla/geckodriver/releases

# Firefox1 web driver path
s1 = Service('./geckodriverC')

# Firefox web driver path
s2 = Service('./geckodriverF')

In [5]:
# Creating a firefox window for pages for categories
#browser_archive = webdriver.Firefox(service=s1)

In [6]:
def make_request(browser, relative_path):
    # Making the request and rendering the browser
    browser.get(SITE_URL + relative_path)
    
    # Simulating vertical scrolling for handling lazy load
    check_height = browser.execute_script('return document.body.scrollHeight;')
    while True:
        browser.execute_script('window.scrollTo(0, document.body.scrollHeight);')
        time.sleep(6)
        height = browser.execute_script('return document.body.scrollHeight;')
        if height == check_height: 
            break 
        check_height = height
    
    # Getting HTML content and passing it to BeautifulSoup for scraping analysis
    return BeautifulSoup(browser.page_source, 'html.parser')

In [7]:
# List with the categories of topics
path_categories = ['politica', 'judicial', 'colombia', 'economia', 'bogota']

# Number of pages to consult in each category
number_pages = 2

In [8]:
# Building a list with title and relative path of the news founded
news = []

# Iterate over the list of categories
for cat in path_categories:
    
    # Check number of pages
    for num in range(number_pages):
                    
        # Creating a firefox window for the archive for each category
        browser_archive = webdriver.Firefox(service=s1)
                      
        # Getting HTML content for a particular news listing page
        soup = make_request(browser_archive, '/archivo/' + cat + '/' + str(num+1) + '/')
        
        # Finding the section where news are contained
        layout = soup.find(class_ = 'Layout-flexAds')
        
        # Getting blocks from layout
        blocks = layout.find_all(class_ = 'Container Block', recursive = True)
        #print(len(blocks))
        
        # Finding and concatenating news cards
        cards = blocks[0].find_all(class_ = 'Card_rowCardLeft') + blocks[1].find_all(class_ = 'Card_rowCardLeft')
        #print(blocks[0].find_all(class_ = 'Card_rowCardLeft'))
        #print(len(cards))
        #print(cards)
        
        # Close the firefox window
        browser_archive.close()
        
        # Iterate over each card obtained in the blocks
        for card in cards:
            
            # Avoiding premium articles
            if card.find(class_ = 'Card-ExclusiveContainer') == None:
                
                # Add the items to the list
                news.append({
                    'title': card.find('h2', class_ = 'Card-Title').find('a').get_text(),
                    'relative_path': card.find('h2', class_ = 'Card-Title').find('a')['href'],
                    'category': cat
                })

In [9]:
type(news)

list

In [10]:
news

[{'title': '‘Hay un plan para infiltrar mi campaña con dineros del narcotráfico’: Petro',
  'relative_path': '/politica/elecciones-colombia-2022/hay-un-plan-para-infiltrar-mi-campana-con-dineros-del-narcotrafico-denuncia-gustavo-petro/',
  'category': 'politica'},
 {'title': '¿Duque rompió relaciones con Rusia por invasión a Ucrania?',
  'relative_path': '/politica/duque-rompio-relaciones-con-rusia-por-invasion-a-ucrania/',
  'category': 'politica'},
 {'title': 'Francia Márquez no conoce la paz ni la tranquilidad hace veinte años',
  'relative_path': '/politica/francia-marquez-no-conoce-la-paz-ni-la-tranquilidad-hace-veinte-anos/',
  'category': 'politica'},
 {'title': 'Reforma rural, apertura de macro casos: las peticiones de víctimas en el Congreso',
  'relative_path': '/politica/reforma-rural-apertura-de-macro-casos-en-la-jep-las-peticiones-de-victimas-en-el-congreso/',
  'category': 'politica'},
 {'title': 'Las indirectas de Iván Duque a las propuestas de Petro',
  'relative_path':

In [11]:
 for n in news:
                            
        # Creating a firefox window for article
        browser_article = webdriver.Firefox(service=s2)

        # Getting HTML content for each news page
        soup = make_request(browser_article, n['relative_path'])

        # Extracting news metadata
        n['datetime'] = soup.find(class_ = 'ArticleHeader-Date').get_text()
        n['author'] = soup.find(class_ = 'ACredit-Author').get_text()
        n['summary'] = soup.find(class_ = 'ArticleHeader-Hook').find('div').get_text()

        # Extracting and concatenating news full text
        paragraphs = soup.find_all(class_ = 'font--secondary')
        n['full_text'] = ' '.join([p.get_text() for p in paragraphs])                    
        browser_article.close()

In [12]:
# Getting HTML content for a particular news listing page
#soup = make_request(browser, '/archivo/politica/')

In [13]:
# Finding the section where news are contained 
#layout = soup.find(class_ = 'Layout-flexAds')

In [14]:
# Getting blocks from layout
#blocks = layout.find_all(class_ = 'Container Block', recursive = True)
#print(len(blocks))

In [15]:
# Finding and concatenating news cards
#cards = blocks[0].find_all(class_ = 'Card_rowCardLeft') + blocks[1].find_all(class_ = 'Card_rowCardLeft')
#len(cards)

In [16]:
# Building a list with title and relative path of the news founded

#news = []

#for card in cards:
    #news.append({
        #'title': card.find('h2', class_ = 'Card-Title').find('a').get_text(),
        #'relative_path': card.find('h2', class_ = 'Card-Title').find('a')['href']
    #})

In [17]:
news

[{'title': '‘Hay un plan para infiltrar mi campaña con dineros del narcotráfico’: Petro',
  'relative_path': '/politica/elecciones-colombia-2022/hay-un-plan-para-infiltrar-mi-campana-con-dineros-del-narcotrafico-denuncia-gustavo-petro/',
  'category': 'politica',
  'datetime': '10 Apr 2022  - 10:07 a.\xa0m.',
  'author': 'Redacción Política',
  'summary': 'El candidato presidencial Gustavo Petro denunció que, desde las cárceles, se estaría gestando un plan para ingresar dineros ilícitos a su campaña. El Inpec lo invitó a que denuncie formalmente lo que sabe.',
  'full_text': 'A contadas semanas de las elecciones presidenciales, el candidato del Pacto Histórico, Gustavo Petro denunció en su cuenta de Twitter un supuesto plan para infiltrar su campaña con dineros del narcotráfico. La estrategia, dijo el senador y exalcalde de Bogotá, se estaría gestando al interior de las cárceles. Desde el Inpec le contestaron que denuncie ante las autoridades con la información que tiene. El 9 de abril

In [20]:
len(news)

108

In [33]:
# Storing extracted information for further analysis
collection.insert_many(news)

<pymongo.results.InsertManyResult at 0x7f02500b4e50>

In [31]:
news.elespectador.countDocuments()

AttributeError: 'list' object has no attribute 'elespectador'