In [1]:
!pip install selenium
!pip install requests



In [2]:
from selenium import webdriver
import requests
import os
from os import path
from datetime import datetime
from pathlib import Path
from collections import defaultdict

In [3]:
def setup(url):
    driver = webdriver.Chrome() 
    driver.maximize_window()
    driver.get(url)
    return driver

def teardown(driver):
    driver.quit()

In [4]:
# Page the browser session starts on.
url = "https://elpais.com"
# Launch Chrome through setup() and keep the WebDriver handle in `driver`;
# the following cells drive the browser through this global.
driver = setup(url)

In [5]:
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.common import NoSuchElementException, ElementNotInteractableException

# Start with a generous 25-second implicit wait so element lookups keep
# retrying while the cookie notice card is still loading.
driver.implicitly_wait(25)

# Locate the notice element that asks for cookie acceptance
noticeEle = driver.find_element(By.CSS_SELECTOR, "[data-testid=notice]")

# Locate the agree button inside the notice
acceptCookies = noticeEle.find_element(By.ID, "didomi-notice-agree-button")

# Poll for up to 5 seconds until the button label reads "Accept"; until()
# raises on timeout, so the assert below is only a final sanity check.
# NOTE(review): the label check presumably relies on the notice being shown
# in English - confirm for other browser locales.
wait = WebDriverWait(driver, timeout=5)
wait.until(lambda _: acceptCookies.text == "Accept")
assert acceptCookies.text == "Accept"

# Dismiss the notice by accepting cookies
acceptCookies.click()

# Use a shorter 2-second implicit wait for the rest of the operations
driver.implicitly_wait(2)

In [6]:
# Collect the links of the first <nav> element on the page
nav = driver.find_element(By.TAG_NAME, "nav")
navElements = nav.find_elements(By.TAG_NAME, "a")
assert len(navElements) > 2

# The opinion anchor is expected to be the second link of the nav bar
opinionEle = navElements[1]
assert opinionEle.text.lower() == "opinión"
opinionEle.click()

# The click starts a page load; checking the title straight away can race
# against the navigation, so wait for the new title before asserting on it.
expectedTitle = "Opinión en EL PAÍS"
WebDriverWait(driver, timeout=10).until(
    lambda d: d.title == expectedTitle,
    message=f"page title did not become {expectedTitle!r}",
)
assert driver.title == expectedTitle

In [8]:
def saveImage(url, index, directory=None, timeout=30):
    """Download the image at ``url`` and store it as ``<index>.jpg``.

    Args:
        url: Address of the image to download.
        index: Value used as the file name (without extension).
        directory: Folder to write into. Defaults to the notebook-level
            ``currPath`` so existing two-argument calls keep working.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status. The
            status is checked before the file is opened, so an error page is
            never written to disk as a ``.jpg``.
    """
    targetDir = currPath if directory is None else directory
    if targetDir:
        # Make sure the destination exists so open() cannot fail on a missing folder.
        os.makedirs(targetDir, exist_ok=True)
    imagePath = path.join(targetDir, f"{index}.jpg")

    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        with open(imagePath, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

In [11]:
# Read the Google Cloud credentials from the environment so they never appear
# in the notebook itself.
project_num = os.getenv("GCP_PROJECT_NUM")
api_key = os.getenv("GCP_API_KEY")

# Fail early, with a message naming the missing variable.
assert project_num is not None, "environment variable GCP_PROJECT_NUM is not set"
assert api_key is not None, "environment variable GCP_API_KEY is not set"

def translateText(texts, timeout=30):
    """Translate Spanish text to English with the Google Translation v2 API.

    Args:
        texts: The text(s) to translate, sent as the ``q`` field of the request.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        A list with the ``translatedText`` of every translation in the
        response. Empty input returns ``[]`` without contacting the API.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        AssertionError: If the response body has no ``data`` member.
    """
    if not texts:
        # Nothing to translate, so skip the network round trip.
        return []

    req_url = "https://translation.googleapis.com/language/translate/v2"
    parameters = {
      "format": "text",
      "q": texts,
      "target": "en",
      "source": "es"
    }

    # Pass the key through `params` so requests URL-encodes it instead of it
    # being spliced into the URL string by hand.
    response = requests.post(
        req_url,
        params={"key": api_key},
        json=parameters,
        timeout=timeout,
    )
    response.raise_for_status()

    data = response.json().get("data")
    assert data is not None, "translation response has no 'data' member"

    return [item.get("translatedText") for item in data.get("translations", [])]

In [12]:
def extractArticle(article):
    """Extract the headline, link, summary and image address of an article.

    Args:
        article: Element to search; it must contain an ``h2`` with a link
            inside it and a ``p``.

    Returns:
        A dict with the keys ``header`` (text of the ``h2``), ``headerURL``
        (``href`` of the link inside the ``h2``), ``imageURL`` (``src`` of the
        ``figure > a > img`` element, or ``""`` when there is none) and
        ``content`` (text of the first ``p``).

    Raises:
        NoSuchElementException: If the ``h2``, its link, or the ``p`` is missing.
    """
    headerEle = article.find_element(By.CSS_SELECTOR, "h2")
    header = headerEle.text

    headerUrl = headerEle.find_element(By.TAG_NAME, "a").get_attribute("href")
    content = article.find_element(By.TAG_NAME, "p").text

    # The image is optional, so only its absence is tolerated; any other
    # error propagates unchanged.
    try:
        imageEle = article.find_element(By.CSS_SELECTOR, "figure > a > img")
    except NoSuchElementException:
        imageUrl = ""
    else:
        # Read the address from the located element (it was previously found
        # but never used); fall back to "" when there is no src attribute.
        imageUrl = imageEle.get_attribute("src") or ""

    return {
        "header": header,
        "headerURL": headerUrl,
        "imageURL": imageUrl,
        "content": content
    }

In [16]:
# Get the article container and the articles inside it
articleContainer = driver.find_element(By.CSS_SELECTOR, "main > div")
articles = articleContainer.find_elements(By.TAG_NAME, "article")
assert len(articles) > 0, "no <article> elements found in the container"

# Save a screenshot of the content that is being tested
# NOTE(review): currPath is not defined in any cell visible here; it must be
# set before this cell runs - confirm it survives Restart & Run All.
contentPath = path.join(currPath, "content.png")
articleContainer.screenshot(contentPath)

extractedData = []
headers = []
# Slice rather than index over range(5), so a page with fewer than five
# articles is processed instead of raising IndexError.
for i, article in enumerate(articles[:5]):
    data = extractArticle(article)
    extractedData.append(data)

    headers.append(data["header"])
    # Only download when the article actually has an image address.
    if data.get("imageURL"):
        saveImage(data["imageURL"], i)

# Translate the collected headers to English
translatedHeaders = translateText(headers)

In [25]:
# Count how many times each word occurs across all translated headers.
count = defaultdict(int)

for header in translatedHeaders:
    # split() without an argument splits on any run of whitespace, so empty
    # headers or repeated spaces do not produce empty-string "words".
    for word in header.split():
        count[word] += 1

# Report words seen more than once, and collect the rest as unique.
unique = []
print()
print("-----------------Repeated Words-----------------")
for word, occurrences in count.items():
    if occurrences > 1:
        print(word, ": ", occurrences)
    else:
        unique.append(word)

print()
print("--------------------Unique Words--------------------")
print(", ".join(unique))
print()


-----------------Repeated Words-----------------
for :  2

--------------------Unique Words--------------------
Science, rebels, against, Trump, The, far, right, erupts, in, Japan, Closed, vacation, Every, man, himself, ‘Inventory, of, a, perfect, trip’

