In [1]:
from extraction import extract_articles_from_google
from extraction import extract_data_from_page
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.by import By

In [2]:
# File the article-search step writes to (passed as `output`) and the
# content-extraction step reads from (passed as `input_file`).
pages_file_name = "ciper_articles.json"
# File the content-extraction step writes its results to (passed as `output_file`).
content_file_name = "ciper_articles_with_content.json"

In [None]:
# Collect candidate Ciper articles for the "Estallido Social" keywords and
# store them in `pages_file_name`.
# NOTE(review): `extract_articles_from_google` is a project helper not shown
# here; presumably `pages` is the number of search-result pages to walk and
# `newscast` is the label stored with each article -- confirm against the helper.
extract_articles_from_google(
    website="ciperchile.cl",
    newscast="Ciper",
    pages=2,
    output=pages_file_name,
    keywords="Estallido Social",
    )

In [5]:
def get_content(body: WebElement):
    """Return the article's main text, without the notes/references section.

    The text is read from a ``col-lg-9`` element nested inside another
    ``col-lg-9`` element under ``body``. Everything from the first
    "NOTAS Y REFERENCIAS" marker onwards is discarded, and surrounding
    whitespace is trimmed.

    Args:
        body: Element to search within.

    Returns:
        The stripped article text preceding the notes/references marker
        (the whole stripped text when the marker is absent).
    """
    outer_column = body.find_element(By.CLASS_NAME, "col-lg-9")
    inner_column = outer_column.find_element(By.CLASS_NAME, "col-lg-9")
    full_text = inner_column.text.strip()

    # Keep only what comes before the first notes/references heading.
    main_text, _, _ = full_text.partition("NOTAS Y REFERENCIAS")
    return main_text.strip()

def get_author(body: WebElement):
	"""Return the author name from the article byline.

	The byline is read from the ``article-big-text__author`` element and is
	expected to contain "Por" followed by the name; the text after the first
	"Por" is returned, stripped.

	Args:
		body: Element to search within.

	Returns:
		The author name, or a fixed placeholder string when the element is
		missing or its text does not contain "Por".
	"""
	try:
		byline = body.find_element(By.CLASS_NAME, "article-big-text__author").text
		# maxsplit=1: split only on the first "Por". Without it, a name that
		# itself contains "Por" (e.g. "Portales") would be cut at that point.
		return byline.split("Por", 1)[1].strip()
	except Exception:
		# Best effort: any lookup or parsing failure yields the placeholder.
		return "The website does not provide an author"

def get_description(body: WebElement):
    """Return a fixed placeholder description.

    Args:
        body: Unused; accepted so the signature matches the other extractors
            passed to ``extract_data_from_page``.

    Returns:
        A constant string stating that no description is available.
    """
    placeholder = "The website does not provide a description"
    return placeholder

# Read the links stored in `pages_file_name`, run the three extractor callbacks
# defined above on each page, and write the results to `content_file_name`.
# The captured log reports `limit_of_pages` as the "Limit of pages to process".
# NOTE(review): `extract_data_from_page` is a project helper not shown here;
# `wait` is presumably a delay or timeout in seconds -- confirm against the helper.
extract_data_from_page(
	input_file=pages_file_name, 
	output_file=content_file_name,
	get_author=get_author,
	get_description=get_description,
	get_content=get_content,
	limit_of_pages=5,
	wait=60,
)

[INFO] Input file: ciper_articles.json
[INFO] Output file: ciper_articles_with_content.json
[INFO] Limit of pages to process: 5

[INFO] Loading extracted pages from JSON file...
[INFO] Starting extraction process...

[INFO] Checking page: 0 | Link: https://www.ciperchile.cl/2021/03/20/la-teoria-del-complot-en-el-estallido-chileno-un-examen-critico/
[INFO] Storing information in a JSON file...

[INFO] Checking page: 1 | Link: https://www.ciperchile.cl/2020/03/02/escuchando-a-los-chilenos-en-medio-del-estallido-liberacion-emocional-reflexividad-y-el-regreso-de-la-palabra-pueblo/
[INFO] Storing information in a JSON file...

[INFO] Checking page: 2 | Link: https://www.ciperchile.cl/2020/10/31/demandas-organizaciones-y-violencias-perspectivas-para-entender-la-revuelta-de-2019/
[INFO] Storing information in a JSON file...

[INFO] Checking page: 3 | Link: https://www.ciperchile.cl/2020/10/14/de-la-desafeccion-al-estallido-social/
[INFO] Storing information in a JSON file...

[INFO] Checking 

## Pages Extraction - Ciper

In [None]:

# Walk the Ciper search results for "Estallido Social" and collect, for each
# article inside the accepted date window, its metadata and link. The result
# is written to ../archive/temp/ciper_pages.json once every page was visited.
json_pages_info = {"pages": []}

options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")

driver = webdriver.Chrome(options=options)

articles_count = 0

# Number of search-result pages to visit (pages 1..pages, all of them parsed).
pages = 100

try:
	for page in range(1, pages + 1):
		# Load first, then parse: every fetched page (including the last one)
		# is actually processed.
		url = f"https://www.ciperchile.cl/page/{page}/?s=Estallido+Social"
		driver.get(url)

		# Wait for the page to finish loading.
		time.sleep(2)

		if page == 1:
			# Snapshot of the first results page, kept for manual inspection.
			with open("output.html", "w", encoding="utf-8") as f:
				f.write(driver.page_source)

		try:
			elements = driver.find_element(By.CLASS_NAME, "pb-4")
			articles = elements.find_elements(By.CLASS_NAME, "col-md-12")
		except Exception as e:
			print("Fatal error al encontrar los elementos de la página: ", e)
			break

		print(f"{len(articles)} articles found")

		for article in articles:
			print("Analyzing article...")

			# find_element raises when the date node is missing (it never
			# returns None), so the lookup itself has to be guarded.
			try:
				date = article.find_element(By.CLASS_NAME, "article__text--date")
			except Exception:
				print("Fecha no encontrada")
				continue

			date_text = date.text
			print(date_text)
			# The first 10 characters are split on "-" and read as
			# day-month-year below.
			originalDate = date_text[:10]
			splitDate = originalDate.split("-")

			if len(splitDate) != 3:
				print("Fecha no encontrada")
				continue

			try:
				timestamp = calendar.timegm(time.strptime(f'{splitDate[2]}-{splitDate[1]}-{splitDate[0]} 00:00:00', '%Y-%m-%d %H:%M:%S'))
			except ValueError:
				# Three "-"-separated pieces that are not a valid date.
				print("Fecha no encontrada")
				continue

			# Accepted window: 1573786800 is 2019-11-15 03:00 UTC and
			# 1702782000 is 2023-12-17 03:00 UTC. Article timestamps are taken
			# at 00:00 UTC, so 2019-11-15 itself is rejected and 2023-12-17 is
			# accepted.
			# NOTE(review): confirm the 3-hour offset on both bounds is intended.
			if (timestamp < 1573786800 or timestamp > 1702782000):
				print("La noticia no corresponde a la fecha solicitada")
				continue

			print(splitDate)

			try:
				# The second "alticle-link" element carries the article URL;
				# the first one is read as the category further down.
				link = article.find_elements(By.CLASS_NAME, "alticle-link")[1].get_attribute("href")
			except Exception as e:
				print("Error al obtener el enlace del artículo: ", e)
				continue

			try:
				category = article.find_element(By.CLASS_NAME, "alticle-link").text
				title = article.find_element(By.CLASS_NAME, "article__text--title").text
				description = article.find_element(By.CLASS_NAME, "article__text--epigraph").text

				link_info = {
					"newscast" : "Ciper",
					"title": title,
					"description": description,
					"category": category,
					"date": originalDate,
					"image_link": "The site does not provide a image",
					"author": "not found initially",
					"link": link,
				}

			except Exception as e:
				print("Error al obtener los datos del artículo: ", e)

				# Keep the link and date so the failure stays visible in the output.
				link_info = {
					"newscast": "Error",
					"title": "Error",
					"description": "Error",
					"category": "Error",
					"date": originalDate,
					"image_link": "Error",
					"author": "Error",
					"link": link
				}

			articles_count += 1
			print("Añadiendo articulo")
			json_pages_info["pages"].append(link_info)
finally:
	# Always release the browser, even if an unexpected error aborts the crawl.
	driver.quit()

with open("../archive/temp/ciper_pages.json", 'w', encoding='utf-8') as file:
	json.dump(json_pages_info, file, ensure_ascii=False, indent=4)


## Content Extraction - Ciper

In [None]:

# For every previously collected article link, open the page in a fresh
# browser, read its body text into page["content"], and rewrite the output
# JSON after each page so progress survives an interruption.
extracted_pages = {"pages": []}
extracted_pages_with_content = {"pages": []}

with open("../archive/pages_extracted/ciper/ciper_090725_cleanData.json", 'r', encoding='utf-8') as file:
	extracted_pages = json.load(file)

# Not read anywhere in this cell; kept in case other cells rely on it.
data_extracted = True

for page_index, page in enumerate(extracted_pages["pages"]):

	print(f"Link - {page['link']}" )
	url = page['link']

	# One pass per attempt. Each attempt gets its own browser, so a retry
	# really reloads the page and never touches a missing or closed driver.
	while True:
		driver = None

		try:
			time.sleep(1)

			options = uc.ChromeOptions()
			options.add_argument("--no-sandbox")
			options.add_argument("--disable-blink-features=AutomationControlled")

			driver = uc.Chrome(options=options)
			driver.get(url)

			# Randomised pause before reading the page.
			time.sleep(random.uniform(6, 12))

			try:
				title = driver.find_element(By.TAG_NAME, "h1").text
				content = driver.find_element(By.CLASS_NAME, "col-lg-9").find_element(By.CLASS_NAME, "col-lg-9").text
				page["content"] = content
				extracted_pages_with_content["pages"].append(page)
				print(title)
			except Exception as e:
				# The page loaded but the expected elements were not found:
				# skip this article without offering a retry.
				print(f"Error al procesar el artículo: {e}")

			break

		except Exception as e:
			# Failure while starting the browser or loading the URL.
			print(f"Error al procesar. Página: {page_index}")
			print(f"Sitio: {url}")
			print(e)
			response = input("Reintentar? y/n")
			if response.lower() != 'y':
				print("Cancelando...")
				break
			print("Reintentando.")

		finally:
			# Quit exactly once per attempt, and only if a browser was started.
			if driver is not None:
				driver.quit()

	print("Almacenando información en el archivo JSON...")
	with open("../archive/temp/ciper_pages_with_content.json", 'w', encoding='utf-8') as file:
		json.dump(extracted_pages_with_content, file, ensure_ascii=False, indent=4)

	# Randomised pause between articles.
	time.sleep(random.uniform(6, 12))