In [None]:
import json
import locale
import os
import random
import time
from datetime import date, datetime

from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement

from extraction import extract_articles_from_google
from extraction import extract_data_from_page

In [None]:
# File written by the content-extraction step (articles plus their body text).
content_file_name = "cnn_español_articles_with_content.json"
# File written by the Google search step and read back by the content-extraction step.
pages_file_name = "cnn_español_articles.json"

In [None]:
# Search Google for the keywords restricted to the CNN en Español site and
# store the result listing in `pages_file_name`.
extract_articles_from_google(
    keywords="Estallido Social",
    website="cnnespanol.cnn.com",
    newscast="CNN en Español",
    pages=2,
    output=pages_file_name,
)

In [None]:
def get_content(body: WebElement):
    """Collect the article text from the page body.

    Looks up the element with class ``article__content`` inside ``body`` and
    returns the stripped text of each ``<p>`` it contains, every paragraph
    followed by a newline (so a non-empty result ends with ``"\\n"``).
    Any lookup error raised by selenium propagates to the caller.
    """
    article_container = body.find_element(By.CLASS_NAME, "article__content")
    paragraphs = article_container.find_elements(By.TAG_NAME, "p")
    return "".join(paragraph.text.strip() + "\n" for paragraph in paragraphs)

def get_author(body: WebElement):
    """Return the author name shown in the article byline.

    The byline element (class ``vossi-byline__names``) is expected to read
    ``"Por <name>"``. Only the FIRST ``"Por"`` is treated as the marker, so a
    name that itself contains "Por" (e.g. "Portillo") is no longer truncated.

    Returns the fallback message when the byline element cannot be read, when
    it does not contain the "Por" marker, or when nothing follows the marker.
    """
    fallback = "The website does not provide an author"
    try:
        byline = body.find_element(By.CLASS_NAME, "vossi-byline__names").text
    except Exception:
        # Best effort: a missing or unreadable byline is not an error for the caller.
        return fallback

    # partition splits on the first occurrence only; `marker` is "" when absent.
    _, marker, author = byline.partition("Por")
    if not marker:
        return fallback
    return author.strip() or fallback

def get_description(body: WebElement):
    """Return a fixed placeholder instead of a scraped description.

    ``body`` is accepted so the function has the same shape as the other
    extractor callbacks, but it is never read.
    """
    placeholder = "The website does not provide a description"
    return placeholder

# Visit the links stored in `pages_file_name`, apply the three extractor
# callbacks defined above to each page, and write the result to `content_file_name`.
extract_data_from_page(
    input_file=pages_file_name,
    output_file=content_file_name,
    get_content=get_content,
    get_author=get_author,
    get_description=get_description,
    wait=60,
    limit_of_pages=5,
)

[INFO] Input file: cnn_español_articles.json
[INFO] Output file: cnn_español_articles_with_content.json
[INFO] Limit of pages to process: 5

[INFO] Loading extracted pages from JSON file...
[INFO] Starting extraction process...

[INFO] Checking page: 0 | Link: https://cnnespanol.cnn.com/2021/10/19/chile-protestas-violentas-detenidos-carabineros-orix

[INFO] Storing information in a JSON file...

[INFO] Checking page: 1 | Link: https://cnnespanol.cnn.com/2023/04/12/sebastian-pinera-declara-imputado-delitos-lesa-humanidad-estallido-social-2019-chile-orix

[INFO] Storing information in a JSON file...

[INFO] Checking page: 2 | Link: https://cnnespanol.cnn.com/2021/05/12/logros-protestas-populares-america-latina-orix

[INFO] Storing information in a JSON file...

[INFO] Checking page: 3 | Link: https://cnnespanol.cnn.com/2023/05/15/opinion-triunfo-derecha-chile-tendencias-preocupantes

[INFO] Storing information in a JSON file...

[INFO] Checking page: 4 | Link: https://cnnespanol.cnn.com/

In [None]:
import pyautogui

# Spanish month abbreviations, as they appear in the result dates read below,
# mapped to the English abbreviations that "%b" parses under the "C" locale.
months = {
    "ene": "Jan", "feb": "Feb", "mar": "Mar", "abr": "Apr",
    "may": "May", "jun": "Jun", "jul": "Jul", "ago": "Aug",
    "sept": "Sep", "oct": "Oct", "nov": "Nov", "dic": "Dec"
}

# "%b" is locale dependent; pin LC_TIME so the English abbreviations always parse.
locale.setlocale(locale.LC_TIME, 'C')

# NOTE(review): `uc` is used below but is not imported anywhere in the visible
# notebook; presumably `import undetected_chromedriver as uc` — TODO confirm
# and add it to the import cell.

# Inclusive date window. It is used both to build the search URL and to filter
# every result, so the two can no longer drift apart. Comparing calendar dates
# (instead of local-time epoch seconds) makes the filter timezone independent.
SEARCH_DATE_MIN = date(2019, 11, 15)
SEARCH_DATE_MAX = date(2023, 12, 17)
SEARCH_OUTPUT_PATH = "../archive/temp/cnn_español.json"

json_pages_info = {"pages": []}

pages = 6


def _to_english_month(raw_date):
    """Swap the first matching Spanish month abbreviation in `raw_date` for its English one."""
    for es, en in months.items():
        if es in raw_date:
            return raw_date.replace(es, en)
    return raw_date


for page in range(pages):

    # Built before the try block so the error handler can always print it.
    url = (
        "https://www.google.com/search?q=%22Estallido+social%22+site%3Acnnespanol.cnn.com"
        f"&tbs=cdr:1,cd_min:{SEARCH_DATE_MIN:%m/%d/%Y},cd_max:{SEARCH_DATE_MAX:%m/%d/%Y}"
        f"&start={page * 10}"
    )
    driver = None
    page_loaded = False

    while True:
        retry = False

        try:
            time.sleep(1)

            # On a retry the already-open browser and page are reused (so anything
            # fixed by hand in the window is kept); the browser is only started,
            # and the page only requested, if that step has not succeeded yet.
            if driver is None:
                options = uc.ChromeOptions()
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-blink-features=AutomationControlled")
                driver = uc.Chrome(options=options)

            if not page_loaded:
                driver.get(url)
                page_loaded = True

            time.sleep(random.uniform(6, 12))

            articles_section = driver.find_element(By.CLASS_NAME, "dURPMd")
            articles = articles_section.find_elements(By.CLASS_NAME, "MjjYud")

            for article in articles:
                print("revisando artículo...")

                try:
                    raw_date = article.find_element(By.CLASS_NAME, "YrbPuc").find_element(By.TAG_NAME, "span").text
                    originalDate = _to_english_month(raw_date)
                    article_date = datetime.strptime(originalDate, "%d %b %Y").date()

                    if not (SEARCH_DATE_MIN <= article_date <= SEARCH_DATE_MAX):
                        print("La noticia no corresponde a la fecha solicitada")
                        continue

                    # Skip results that do not come from CNN Español.
                    if "cnnespanol.cnn.com" not in article.text:
                        print("El artículo no es de CNN Español")
                        continue
                except Exception as e:
                    print(f"Error al procesar el artículo: {e}")
                    continue

                try:
                    title = article.find_element(By.TAG_NAME, "h3").text
                    description = article.find_element(By.CLASS_NAME, "kb0PBd").find_elements(By.TAG_NAME, "span")[1].text
                    link = article.find_element(By.TAG_NAME, "a").get_attribute("href")
                except Exception as e:
                    print(f"Error al extraer información del artículo: {e}")
                    continue

                link_info = {
                    "newscast" : "CNN Español",
                    "title": title,
                    "description": description,
                    "category": "The site does not provide a category",
                    "date": originalDate,
                    "image_link": "not found initially",
                    "author": "not found initially",
                    "link": link,
                }

                print("información incluida!")
                json_pages_info["pages"].append(link_info)

        except Exception as e:
            print(f"Error al procesar el resultado de busqueda. Página: {page}")
            print(f"Sitio: {url}")
            print(e)
            response = input("Reintentar? y/n")
            if response.lower() == 'y':
                print("Reintentando.")
                retry = True
            else:
                print("Cancelando...")

        if not retry:
            break

    # The browser may never have started if uc.Chrome itself raised.
    if driver is not None:
        driver.quit()

print("Almacenando información en el archivo JSON...")
# Make sure the target folder exists so the scraped results are not lost at the very end.
os.makedirs(os.path.dirname(SEARCH_OUTPUT_PATH), exist_ok=True)
with open(SEARCH_OUTPUT_PATH, 'w', encoding='utf-8') as file:
    json.dump(json_pages_info, file, ensure_ascii=False, indent=4)


# with open(f"output.html", "w", encoding="utf-8") as f:
# 	f.write(driver.page_source)

In [None]:
# NOTE(review): `uc` is used below but is not imported anywhere in the visible
# notebook; presumably `import undetected_chromedriver as uc` — TODO confirm
# and add it to the import cell.

CLEAN_PAGES_PATH = "../archive/pages_extracted/cnn_español/cnn_español_240725_cleanData.json"
PAGES_WITH_CONTENT_PATH = "../archive/temp/pages_with_content/cnn_español_240725_cleanData_with_content.json"

extracted_pages = {"pages": []}
extracted_pages_with_content = {"pages": []}

total_pages = 0
pages_succeeded = 0
pages_failed = 0
error_occurred = False

with open(CLEAN_PAGES_PATH, 'r', encoding='utf-8') as file:
    extracted_pages = json.load(file)

# Not read anywhere in this cell; kept in case another cell depends on it.
data_extracted = True

# The output file is rewritten after every page; make sure its folder exists up front.
os.makedirs(os.path.dirname(PAGES_WITH_CONTENT_PATH), exist_ok=True)

for page_index, page in enumerate(extracted_pages["pages"]):

    total_pages += 1
    print(f"{page_index + 1} Link - {page['link']}" )
    url = page['link']

    driver = None
    page_loaded = False
    content = None

    while True:
        retry = False

        try:
            time.sleep(1)

            # On a retry the already-open browser and page are reused; the browser
            # is only started, and the page only requested, if that step has not
            # succeeded yet.
            if driver is None:
                options = uc.ChromeOptions()
                options.add_argument("--no-sandbox")
                options.add_argument("--disable-blink-features=AutomationControlled")
                driver = uc.Chrome(options=options)

            if not page_loaded:
                driver.get(url)
                page_loaded = True

            time.sleep(random.uniform(6, 12))

            try:
                title = driver.find_element(By.TAG_NAME, "h1").text
                content = driver.find_element(By.CLASS_NAME, "article__content").text

                print(title)
            except Exception as e:
                print(f"Error al procesar el artículo: {e}")
                raise

        except Exception as e:
            print(f"Error al procesar. Página: {page_index}")
            print(f"Sitio: {url}")
            print(e)
            response = input("Reintentar? y/n")
            if response.lower() == 'y':
                print("Reintentando.")
                retry = True
            else:
                print("Cancelando...")
                pages_failed += 1
                error_occurred = True

        if not retry:
            break

    # Close the browser exactly once; it may never have started if uc.Chrome raised.
    if driver is not None:
        driver.quit()

    if error_occurred:
        content = "A error occurred while extracting content"
        error_occurred = False
    else:
        pages_succeeded += 1

    page["content"] = content
    extracted_pages_with_content["pages"].append(page)

    # Checkpoint after every page so partial progress survives an interruption.
    print("Almacenando información en el archivo JSON...")
    with open(PAGES_WITH_CONTENT_PATH, 'w', encoding='utf-8') as file:
        json.dump(extracted_pages_with_content, file, ensure_ascii=False, indent=4)

    time.sleep(random.uniform(6, 12))

# ----------------- Stats -----------------
print("\n\n")
print("----- STATS -----")

print(f"Amount of pages processed: {total_pages}")
if total_pages > 0:
    print(f"% Pages succeeded [{pages_succeeded}]: {round(100 * pages_succeeded/total_pages, 2)}")
    print(f"% Pages failed [{pages_failed}]: {round(100 * pages_failed/total_pages, 2)}")
else:
    # Explicit guard instead of a bare except hiding the division by zero.
    print("No pages were processed, so no percentages can be computed")