In [None]:
import calendar
import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

from extraction import extract_articles_from_google
from extraction import extract_data_from_page

In [None]:
# File name passed as `output` to extract_articles_from_google and as
# `input_file` to extract_data_from_page.
pages_file_name = "emol_articles.json"
# File name passed as `output_file` to extract_data_from_page.
content_file_name = "emol_articles_with_content.json"

In [None]:
# Run the project helper for emol.com with the "Estallido Social" keywords,
# writing its result to `pages_file_name`.
# NOTE(review): presumably this scrapes 2 pages of Google results restricted to
# the given website -- confirm against the `extraction` module.
extract_articles_from_google(
	website="emol.com",
	newscast="Emol",
    pages=2,
    output=pages_file_name,
    keywords="Estallido Social",
    )

In [None]:
def get_content(body: WebElement):
	"""Collect the article text from the Emol article container.

	Locates the element with id ``cuDetalle_cuTexto_textoNoticia`` inside
	``body`` and concatenates the text of every ``div`` found under it,
	skipping any block whose lower-cased text contains one of the phrases in
	``words_to_avoid``.

	Args:
		body: Element to search within.

	Returns:
		The kept blocks joined into one string. Each block is stripped, has
		its double quotes replaced by single quotes, and is followed by a
		newline. An empty string is returned when no block is kept.

	Raises:
		Whatever ``find_element`` raises when the container is missing; the
		exception is deliberately not handled here.
	"""
	words_to_avoid = [
		"noticias relacionadas"
	]
	container = body.find_element(By.ID, "cuDetalle_cuTexto_textoNoticia")
	blocks = []
	for element in container.find_elements(By.TAG_NAME, "div"):
		# Read .text a single time per element: every access asks the browser
		# for the text again, so reusing the value avoids repeated round-trips.
		text = element.text
		lowered = text.lower()
		if any(phrase in lowered for phrase in words_to_avoid):
			continue
		blocks.append(text.strip().replace('"', "'") + "\n")
	# join() instead of repeated string concatenation inside the loop.
	return "".join(blocks)

def get_author(body: WebElement):
	"""Extract the author name from the Emol byline element.

	Reads the text of the element with class ``info-notaemol-porfecha`` and
	returns whatever follows the first occurrence of ``"Por"``, stripped.

	Args:
		body: Element to search within.

	Returns:
		The author text, or the fallback string
		``"The website does not provide an author"`` when the element is
		missing or its text contains no ``"Por"``.
	"""
	try:
		byline = body.find_element(By.CLASS_NAME, "info-notaemol-porfecha").text
		# maxsplit=1 keeps everything after the first "Por". An unbounded split
		# would cut names that themselves contain "Por" (e.g. "Portales").
		return byline.split("Por", 1)[1].strip()
	except Exception:
		# Best effort: a missing element or a byline without "Por" both end here.
		return "The website does not provide an author"

def get_description(body: WebElement):
	"""Return the stripped text of the article summary element.

	The summary is the element with id ``cuDetalle_cuTitular_bajadaNoticia``
	inside ``body``. Any exception raised while locating or reading it yields
	the fallback string instead.

	Args:
		body: Element to search within.

	Returns:
		The summary text, or ``"The website does not provide a description"``.
	"""
	fallback = "The website does not provide a description"
	try:
		summary_element = body.find_element(By.ID, "cuDetalle_cuTitular_bajadaNoticia")
		summary = summary_element.text.strip()
	except Exception:
		return fallback
	return summary

# Run the project helper over the listing in `pages_file_name`, plugging in
# the three Emol-specific extractors defined in this cell, and write the result
# to `content_file_name`.
# NOTE(review): `limit_of_pages=5` and `wait=60` are interpreted by the
# `extraction` module; presumably a page cap and a wait in seconds -- confirm.
extract_data_from_page(
	input_file=pages_file_name, 
	output_file=content_file_name,
	get_author=get_author,
	get_description=get_description,
	get_content=get_content,
	limit_of_pages=5,
	wait=60,
)

In [None]:

# Scrape Emol's own search results for "estallido social chile", keep the
# articles published by Emol inside the accepted date window, and store their
# metadata in ../archive/temp/emol_pages.json.
json_pages_info = {"pages": []}

# Configure Chrome options (headless mode)
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)

articles_count = 0

# Maximum number of result pages to visit.
pages = 100

# Accepted window for the article timestamp (epoch seconds, bounds inclusive).
# Article dates are converted at 00:00:00 UTC, so a date whose midnight falls
# before MIN_TIMESTAMP is rejected.
MIN_TIMESTAMP = 1573786800  # 2019-11-15 03:00:00 UTC
MAX_TIMESTAMP = 1702782000  # 2023-12-17 03:00:00 UTC

url = "https://www.emol.com/buscador/?query=estallido%20social%20chile"

# try/finally guarantees the browser is closed even when scraping fails midway.
try:
	driver.get(url)

	# Wait for the page to finish loading
	time.sleep(2)

	for page in range(1, pages + 1):

		# --- Scrape the result page currently displayed (page number `page`) ---
		elements = driver.find_element(By.ID, "listNews")
		articles = elements.find_elements(By.ID, "ContenedorLinkNoticia")
		print(f"{len(articles)} articles found")

		for article in articles:
			print("Analyzing article...")

			date = article.find_element(By.CLASS_NAME, "bus_txt_fuente")
			# The date is taken from the last 10 characters of the source line
			# and is expected to look like dd/mm/yyyy.
			originalDate = date.text[-10:]
			splitDate = originalDate.split("/")

			if len(splitDate) != 3:
				# No date found
				continue

			try:
				timestamp = calendar.timegm(time.strptime(f'{splitDate[2]}-{splitDate[1]}-{splitDate[0]} 00:00:00', '%Y-%m-%d %H:%M:%S'))
			except ValueError:
				# Three "/"-separated parts that do not form a valid date:
				# treat it like a missing date instead of aborting the scrape.
				continue

			if timestamp < MIN_TIMESTAMP or timestamp > MAX_TIMESTAMP:
				print("La noticia no corresponde a la fecha solicitada")
				continue

			print(splitDate)

			newcast = article.find_element(By.ID, "linkNombreSitio").text
			if newcast != "Emol":
				print("El newscast no es Emol!")
				continue

			link_element = article.find_element(By.ID, "LinkNoticia")
			link = link_element.get_attribute("href")
			title = link_element.text
			image_link = article.find_element(By.ID, "ImgSitio").get_attribute("src")
			description = article.find_element(By.ID, "BajadaNoticia").text

			link_info = {
				"newscast" : "Emol",
				"title": title,
				"description": description,
				"category": "The site does not provide a category",
				"date": originalDate,
				"image_link": image_link,
				"author": "not found initially",
				"link": link,
			}
			articles_count += 1
			print("Añadiendo articulo")
			json_pages_info["pages"].append(link_info)

		# --- Move on to the next result page, if any ---
		if page == pages:
			# The last requested page has just been scraped. Stopping here,
			# after scraping rather than before, ensures that page `pages`
			# is not skipped.
			break

		next_page = page + 1

		buttons = driver.find_element(By.ID, "listPages")
		buttons = buttons.find_elements(By.TAG_NAME, "li")

		print(len(buttons))

		button_found = False

		for button in buttons:
			if button.text == str(next_page):
				print("Button found: " + button.text)
				button = button.find_element(By.TAG_NAME, "a")
				button.click()
				button_found = True
				# Give the next page time to load before scraping it
				time.sleep(2)
				break

		if not button_found:
			print("The button for page " + str(next_page) + " was not found.")
			break
finally:
	driver.quit()

with open("../archive/temp/emol_pages.json", 'w', encoding='utf-8') as file:
	json.dump(json_pages_info, file, ensure_ascii=False, indent=4)


In [None]:
def get_page_content(page, url):
	"""Open an article in headless Chrome and attach its text to ``page``.

	On success ``page["page_content"]`` is set to a dict with the keys
	``"header"`` (text of ``cuDetalle_cuTitular_tituloNoticia``) and
	``"content"`` (text of every ``div`` under
	``cuDetalle_cuTexto_textoNoticia``, one per line). Empty values are
	replaced by ``"No header found"`` / ``"No content found"``.

	Args:
		page: Mutable mapping describing the article; modified in place.
		url: Address of the article to load.

	Returns:
		``{"error": False}`` when the content was extracted, ``{"error": True}``
		when locating or reading the elements failed (the error is printed).

	Raises:
		Exceptions raised while starting the browser or navigating to ``url``
		are not caught and propagate to the caller.
	"""
	options = Options()
	options.add_argument("--headless")
	options.add_argument("--disable-gpu")
	options.add_argument("--no-sandbox")

	driver = webdriver.Chrome(options=options)

	# This function starts a browser on every call, so it must always be shut
	# down; otherwise one Chrome process is leaked per article.
	try:
		driver.get(url)

		# Wait for the page to finish loading
		time.sleep(2)

		try:
			title = driver.find_element(By.ID, "cuDetalle_cuTitular_tituloNoticia")
			content = driver.find_element(By.ID, "cuDetalle_cuTexto_textoNoticia")
			paragraphs = content.find_elements(By.TAG_NAME, "div")

			text = "".join(p.text.strip() + "\n" for p in paragraphs)

			# `or` makes the fallbacks apply to empty / whitespace-only text;
			# an element object is always truthy, so testing it never did.
			page["page_content"] = {
				"header": title.text.strip() or "No header found",
				"content": text.strip() or "No content found",
			}
			return {"error": False}
		except Exception as e:
			print(f"Error al acceder al contenido de la página {url}: {e}")
			return {"error": True}
	finally:
		# Element text has already been copied into plain strings above.
		driver.quit()


In [None]:
# Enrich every article listed in emol_pages.json with its page content and
# write the combined result to emol_pages_with_content.json.
original_json_data = {}
json_data_with_content = {"pages": []}
error_count = 0

with open("../archive/temp/emol_pages.json", 'r', encoding='utf-8') as pages_file:
	original_json_data = json.load(pages_file)
	print("Page to check: " + str(len(original_json_data["pages"])))

for page in original_json_data["pages"]:
	# Start from an empty placeholder; get_page_content fills it on success.
	page["page_content"] = {}

	outcome = get_page_content(page, page["link"])
	if outcome["error"]:
		# Count the failure and record an explicit marker for this article.
		error_count += 1
		page["page_content"] = dict(
			header="Error retrieving content",
			content="Error retrieving content",
		)
	json_data_with_content["pages"].append(page)

with open("../archive/temp/emol_pages_with_content.json", 'w', encoding='utf-8') as content_file:
	json.dump(json_data_with_content, content_file, ensure_ascii=False, indent=4)