# Gets pages on Google News published between November 15, 2019 and December 17, 2023

The data is extracted by looking for the following websites:

- Fast Check
- Fact Checking UC
- Biobio Chile


In [None]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from dateutil.relativedelta import relativedelta
from colorama import Fore, Style

# Get pages from Fast Check


In [None]:
def _relative_age(age_text):
    """Turn a Spanish relative age such as ``hace 3 meses`` into a relativedelta.

    Only the plural units ``meses`` and ``años`` are accepted. Returns None
    for any other text so the caller can skip the result.
    """
    tokens = age_text.split(" ")
    if len(tokens) < 3:
        return None
    try:
        amount = int(tokens[1])
    except ValueError:
        return None
    if tokens[2] == "meses":
        return relativedelta(months=amount)
    if tokens[2] == "años":
        # Years must be subtracted as years, not as that many months.
        return relativedelta(years=amount)
    return None


def _tag_text(tag, default="none"):
    """Return ``tag.text``, or *default* when the element was not found."""
    return default if tag is None else tag.text


def get_pages(start, end, term, document_path):
    """Scrape Google News results for *term* and store them as JSON.

    Requests 20 result pages (offsets 0, 10, ..., 190) restricted to the
    custom date range ``start``..``end``, keeps each distinct result link once
    and writes ``{"pages": [...]}`` to *document_path*.

    The publication date is only approximated: the relative age shown by
    Google ("hace N meses" / "hace N años") is subtracted from the current
    time, and results whose approximated year falls outside the requested
    years are dropped.

    Args:
        start: Range start, ``MM/DD/YYYY``.
        end: Range end, ``MM/DD/YYYY``.
        term: Search query; spaces are replaced by ``+``.
        document_path: Path of the JSON file to write.

    Side effects:
        Overwrites ``output.html`` with the prettified HTML of every page
        fetched successfully (debugging aid) and prints progress messages.
    """
    search_term = term.replace(' ', '+')
    num_pages = 20
    start_year = int(start.split("/")[2])
    end_year = int(end.split("/")[2])
    json_pages_info = {
        "pages": []
    }

    headers = {
        'User-Agent': 'Chrome/58.0.3029.110'
    }

    all_pages_links = []

    for page in range(num_pages):
        url = f'https://www.google.com/search?q={search_term}&tbm=nws&start={page*10}&tbs=cdr:1,cd_min:{start},cd_max:{end}'
        response = requests.get(url, headers=headers)

        if response.status_code != 200:
            print(f"Error al acceder a la página {page + 1}: {response.status_code}")
            continue

        print("status 200")

        content = response.content.decode('ISO-8859-1')
        soup = BeautifulSoup(content, 'html.parser')

        with open('output.html', 'w', encoding='utf-8') as f:
            f.write(soup.prettify())

        for page_section in soup.find_all('div', class_='Gx5Zad xpd EtOod pkphOe'):
            link_section = page_section.find('a', href=True)
            if link_section is None:
                # Result card without an anchor: nothing to record.
                continue

            href = link_section['href']
            if not href.startswith("/url?q="):
                continue

            link = href.split("/url?q=")[1].split("&")[0]
            if link in all_pages_links:
                print(all_pages_links)
                print("ya ingresada")
                continue
            all_pages_links.append(link)

            date_tag = page_section.find('span', class_='r0bn4c rQMQod')
            age = None if date_tag is None else _relative_age(date_tag.text)
            if age is None:
                continue

            date = datetime.now() - age
            if date.year < start_year or date.year > end_year:
                continue
            date = f"{date.strftime('%B')}, {date.year}"

            newscast = page_section.find('div', class_='BNeawe UPmit AP7Wnd lRVwie')
            title = page_section.find('div', class_='BNeawe vvjwJb AP7Wnd')
            description = page_section.find('div', class_='BNeawe s3v9rd AP7Wnd')
            image = page_section.find('img', class_="h1hFNe")

            link_info = {
                "newscast": _tag_text(newscast),
                "title": _tag_text(title),
                "description": _tag_text(description),
                "link": link,
                "date": f"(preliminary) {date}",
                "image": "none" if image is None else image['src'],
                "author": "not yet extracted",
                "text": "not yet extracted",
                "links": "not yet extracted"
            }
            json_pages_info["pages"].append(link_info)

    print(f"Se han encontrado: {len(json_pages_info['pages'])} páginas")

    with open(f"{document_path}", 'w', encoding='utf-8') as file:
        json.dump(json_pages_info, file, ensure_ascii=False, indent=4)

In [None]:
# Fast Check | first constitutional process (11/15/2019 - 05/14/2022)
get_pages('11/15/2019', '05/14/2022', 'Constitución Chile Fast Check', './archive/fast_check_data/1st_fast_check_pages.json')

In [None]:
# Fast Check | second constitutional process (05/14/2022 - 12/17/2023)
get_pages('05/14/2022', '12/17/2023', 'Constitución Chile Fast Check', './archive/fast_check_data/2nd_fast_check_pages.json')

In [None]:
# Biobio Chile | single search over the whole period (11/15/2019 - 12/17/2023)
get_pages('11/15/2019', '12/17/2023', 'Constitución Chile biobioChile', './archive/biobiochile_data/biobiochile_pages.json')

# Get Other Pages besides Fast Check and Fact Checking UC


In [None]:
# Generic query (no outlet name) for 11/15/2019 - 05/14/2022; the result is
# written to others_pages.json in the current working directory.
get_pages('11/15/2019', '05/14/2022', 'Constitución Chile', 'others_pages.json')

# Data Extract from Biobio Chile


In [None]:
# Spanish month names in calendar order: list position + 1 is the month number.
months = (
    "enero febrero marzo abril mayo junio "
    "julio agosto septiembre octubre noviembre diciembre"
).split()

# URL path prefixes (segments 3 and 4 of the link) of the Biobio Chile page
# layouts handled by the extractors; the order matters to code that looks
# entries up by index.
biobiochile_variations = [
    f"especial/{slug}"
    for slug in ("nuevo-proceso-constituyente", "una-constitucion-para-chile")
] + ["noticias/nacional"]

def biobiochile_data_extract():
    """Extract article data for every saved Biobio Chile link.

    Reads ``./archive/biobiochile_data/biobiochile_pages.json``, picks an
    extractor from the link's path (segments 3 and 4, e.g.
    ``noticias/nacional``) and writes the collected records to
    ``./archive/biobiochile_data/biobiochile_data_extracted.json`` under the
    ``"pages"`` key.

    Links that match no known layout, or whose extraction fails, are reported
    in red on stdout and left out of the output file.
    """
    biobiochile_data_extracted = {"pages": []}

    # Pair each known path prefix with its extractor (same order as
    # biobiochile_variations).
    extractors = dict(zip(
        biobiochile_variations,
        (
            biobiochile_special_new_process,
            biobiochile_special_a_constitution,
            biobiochile_news,
        ),
    ))

    with open("./archive/biobiochile_data/biobiochile_pages.json", 'r', encoding='utf-8') as file:
        biobiochile_links = json.load(file)

    for page in biobiochile_links["pages"]:
        link = page["link"]
        link_split = link.split("/")
        variation = "/".join(link_split[3:5])

        extractor = extractors.get(variation)
        if extractor is None and len(link_split) > 3 and link_split[3] == "noticias":
            # Any other "noticias/<section>" page is treated as a news page.
            extractor = biobiochile_news

        if extractor is None:
            print(Fore.RED + f"Error: {link} no es un link de noticias o especial" + Style.RESET_ALL)
            continue

        try:
            page_data = extractor(page)
        except Exception as error:
            # Scraping is best-effort: report the cause and keep going so one
            # broken page does not discard the rest of the run.
            print(
                Fore.RED
                + f"Error: {link} no se pudo procesar ({type(error).__name__}: {error})"
                + Style.RESET_ALL
            )
            continue

        biobiochile_data_extracted["pages"].append(page_data)

    with open("./archive/biobiochile_data/biobiochile_data_extracted.json", 'w', encoding='utf-8') as file:
        json.dump(biobiochile_data_extracted, file, ensure_ascii=False, indent=4)

def biobiochile_news(page):
    """Download a Biobio Chile news page and extract its article data.

    Args:
        page: Dict with at least a ``"link"`` key holding the article URL.

    Returns:
        Dict with ``link``, ``veracity``, ``title``, ``date``, ``author``,
        ``text``, ``links`` and ``images``. ``date`` is ``day/month/year``
        when the ``post-date`` text has a recognised Spanish month name in its
        third token, the stripped raw text when it does not, and
        ``"no year found"`` when the element is absent.

    Raises:
        AttributeError / TypeError: when the title, author, content or image
            element is missing from the page (propagated to the caller).
    """
    link = page["link"]
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')

    date_tag = soup.find("div", class_="post-date")
    if date_tag is None:
        date = "no year found"
    else:
        raw_date = date_tag.text
        parts = raw_date.split(" ")
        if len(parts) > 4 and parts[2] in months:
            date = f"{parts[1]}/{months.index(parts[2]) + 1}/{parts[4]}"
        else:
            # Unrecognised format: keep the text instead of a token list.
            date = raw_date.strip()

    title = soup.find("h1", class_="post-title").text
    author = soup.find("div", class_="autores").text

    content = soup.find("div", class_="post-content clearfix")
    text = "".join(p.text for p in content.find_all("p"))
    # href=True skips anchors without an href; sorting keeps the output stable.
    links = sorted({a["href"] for a in content.find_all("a", href=True)})
    images = [{"image": soup.find("div", class_="post-image").find("a")["href"], "text": "", "error": ""}]

    return {
        "link": link,
        "veracity": "Es un sitio que no verifica noticias",
        "title": title,
        "date": date,
        "author": author,
        "text": text,
        "links": links,
        "images": images
    }
    
def biobiochile_special_a_constitution(page):
    """Extract article data from an ``especial/una-constitucion-para-chile`` page.

    Args:
        page: Dict with at least a ``"link"`` key holding the article URL.

    Returns:
        Dict with ``link``, ``veracity``, ``title``, ``date``, ``author``,
        ``text``, ``links`` (always ``""``) and ``images`` (always ``[]``).
        ``date`` is ``day/month/year`` when the ``fecha`` text has a recognised
        Spanish month name in its third token, the stripped raw text when it
        does not, and ``"no year found"`` when the element is absent.

    Raises:
        AttributeError: when the title, author or content element is missing
            (propagated to the caller).
    """
    link = page["link"]
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')

    date_tag = soup.find("div", class_="fecha")
    if date_tag is None:
        date = "no year found"
    else:
        raw_date = date_tag.text
        parts = raw_date.split(" ")
        if len(parts) > 4 and parts[2] in months:
            date = f"{parts[1]}/{months.index(parts[2]) + 1}/{parts[4]}"
        else:
            # Unrecognised format: keep the text instead of a token list.
            date = raw_date.strip()

    title = soup.find("h1", class_="titular").text
    # NOTE(review): this keeps only the text BEFORE the first "Por"; if the
    # byline starts with "Por <name>" the author ends up empty - confirm.
    author = soup.find("div", class_="autores").text.split("Por")[0]
    text = soup.find("div", class_="contenido-nota").text

    return {
        "link": link,
        "veracity": "Es un sitio que no verifica noticias",
        "title": title,
        "date": date,
        "author": author,
        "text": text,
        "links": "",
        "images": []
    }

def biobiochile_special_new_process(page):
    """Extract article data from an ``especial/nuevo-proceso-constituyente`` page.

    Args:
        page: Dict with at least a ``"link"`` key holding the article URL.

    Returns:
        Dict with ``link``, ``veracity``, ``title``, ``date``, ``author``,
        ``text``, ``links`` and ``images``. ``date`` is ``day/month/year``
        when the ``fecha`` text has a recognised Spanish month name in its
        third token, the stripped raw text when it does not, and
        ``"not found"`` when the element is absent.

    Raises:
        AttributeError / TypeError: when the title, article container, author
            or image element is missing (propagated to the caller).
    """
    link = page["link"]
    soup = BeautifulSoup(requests.get(link).content, 'html.parser')

    date_tag = soup.find("div", class_="fecha")
    if date_tag is None:
        date = "not found"
    else:
        raw_date = date_tag.text
        parts = raw_date.split(" ")
        if len(parts) > 4 and parts[2] in months:
            date = f"{parts[1]}/{months.index(parts[2]) + 1}/{parts[4]}"
        else:
            # Unrecognised format: keep the text instead of a token list.
            date = raw_date.strip()

    title = soup.find("h1", class_="titulo").text

    article = soup.find("div", class_="nota")
    author = article.find("p", class_="autor").text
    body = article.find("div", class_="container-redes-contenido")
    text = "".join(p.text for p in body.find_all("p"))
    # href=True skips anchors without an href; sorting keeps the output stable.
    links = sorted({a["href"] for a in body.find_all("a", href=True)})
    images = [{"image": soup.find("div", class_="imagen-container").find("img")["src"], "text": "", "error": ""}]

    return {
        "link": link,
        "veracity": "Es un sitio que no verifica noticias",
        "title": title,
        "date": date,
        "author": author,
        "text": text,
        "links": links,
        "images": images
    }

# Run the Biobio Chile extraction, then reload its output file to report how
# many records carry a "link" key.
biobiochile_data_extract()

biobiochile_data = {}
with open("./archive/biobiochile_data/biobiochile_data_extracted.json", 'r', encoding='utf-8') as file:
    biobiochile_data = json.load(file)

pages_with_link = sum(1 for page in biobiochile_data["pages"] if "link" in page)
print(f'Se han analizado correctamente {pages_with_link} páginas de biobiochile')

# Get pages from Fact Checking UC

# Posiblemente es necesario eliminar este código


In [None]:


# def get_pages_fact_checking():

#     page_number = 1
#     fact_checking_links = {"facts": []}
    
#     print("Obteniendo links")
#     while(True):
#         fact_checking_link = f"https://factchecking.cl/page/{page_number}/?s=Constitucion"
#         soup = BeautifulSoup(requests.get(fact_checking_link).content, 'html.parser')
#         if soup.find('div', class_='gp-entry-content') != None:
#             print("termino de busqueda")
#             print(len(fact_checking_links["facts"]))
#             break

#         facts = [a for a in soup.find('div', class_='gp-inner-loop ajax-loop').find_all("a")]
#         for fact in facts:
#             if fact["href"].split("/")[3] != "user-review":
#                 continue
#             else:
#                 link_data = {
#                     "link": fact["href"],
#                     "veracity": fact.find("span", class_="label").find("span").text
#                 }
#                 print(link_data)
#                 fact_checking_links["facts"].append(link_data)
        
         
#         page_number += 1

#     with open(f"fact_checking_data/fact_checking_links.json", 'w', encoding='utf-8') as file:
#         json.dump(fact_checking_links, file, ensure_ascii=False, indent=4)



# #Fact Checking
# get_pages_fact_checking()

# Data Extract | Fact Checking UC


In [None]:
def fact_checking_data_extract():
    """Download every stored Fact Checking UC link and save the extracted data.

    Reads ``fact_checking_data/fact_checking_links.json`` (a ``"facts"`` list
    of ``{"link", "veracity"}`` dicts), fetches each link and writes one
    record per fetched page to
    ``fact_checking_data/fact_checking_data_extracted.json``.

    Missing page elements yield empty values (``""`` / ``[]``) rather than
    aborting the run; ``year`` becomes ``"no year found"`` when the last word
    of the ``gp-entry-title`` heading is not an integer. Links whose request
    fails are reported on stdout and skipped.
    """
    fact_checking_data_extracted = {"facts": []}

    with open("fact_checking_data/fact_checking_links.json", 'r', encoding='utf-8') as file:
        fact_checking_links = json.load(file)

    for fact in fact_checking_links["facts"]:
        link = fact["link"]
        try:
            response = requests.get(link)
        except requests.exceptions.RequestException as error:
            print(f'Error: {error} | {link}')
            continue
        soup = BeautifulSoup(response.content, 'html.parser')

        heading = soup.find("h2", class_="gp-entry-title")
        try:
            year = int(heading.text.split(" ")[-1])
        except (AttributeError, ValueError):
            # Heading missing, or its last word is not a number.
            year = "no year found"

        title = soup.find('h1')
        author = soup.find('h6', class_="gp-share-icons")
        content = soup.find("div", class_="gp-entry-text")

        if content is None:
            content_text, link_urls, image_urls = "", [], []
        else:
            content_text = content.text
            # Attribute filters skip anchors/images lacking href/src.
            link_urls = [a["href"] for a in content.find_all("a", href=True)]
            image_urls = [img["src"] for img in content.find_all("img", src=True)]

        fact_data = {
            "link": link,
            "veracity": fact["veracity"],
            "title": "" if title is None else title.text,
            "year": year,
            "author": "" if author is None else author.text,
            "content": content_text,
            "links": link_urls,
            "images": image_urls
        }

        fact_checking_data_extracted["facts"].append(fact_data)

    with open("fact_checking_data/fact_checking_data_extracted.json", 'w', encoding='utf-8') as file:
        json.dump(fact_checking_data_extracted, file, ensure_ascii=False, indent=4)

# Run the Fact Checking UC extraction (one HTTP request per stored link).
fact_checking_data_extract()

# Data Extract | Fast Check


In [None]:

# Extraction failures are appended to pages_with_errors["errors"].
pages_with_errors = dict(errors=[])

# Page lists for the two Fast Check searches, loaded from disk.
# NOTE(review): these paths have no "./archive/" prefix, unlike the paths the
# get_pages calls write to - confirm the working directory / file location.
fast_check_1st, fast_check_2nd = {}, {}

with open('fast_check_data/1st_fast_check_pages.json', encoding='utf-8') as file:
    fast_check_1st = json.load(file)

with open('fast_check_data/2nd_fast_check_pages.json', encoding='utf-8') as file:
    fast_check_2nd = json.load(file)

def extract_data(response, page, index):
    """Fill *page* in place with data scraped from a downloaded article.

    Only pages whose ``"newscast"`` is ``"Fast Check CL"`` are parsed. The
    verdict is the part of the post title after the first ``#``; when the
    title has no ``#`` every extracted field is set to
    ``"No se encontró la veracidad"``.

    Any exception raised while parsing is recorded in the module-level
    ``pages_with_errors["errors"]`` list (with the stage being extracted) and
    the fields gathered so far are still stored on *page*.

    Args:
        response: HTTP response whose ``content`` bytes hold the page HTML.
        page: Page dict; updated with ``date`` (when a ``<time>`` tag exists),
            ``author``, ``veracity``, ``text``, ``links`` and ``images``.
        index: Position of *page* in its list, stored in the error record.
    """
    # NOTE(review): the bytes are always decoded as ISO-8859-1; if the site
    # serves UTF-8, accented characters will be garbled - confirm.
    content = response.content.decode('ISO-8859-1')
    soup = BeautifulSoup(content, 'html.parser')

    date = soup.find('time')
    if date is not None:
        page["date"] = date.text

    author = ""
    veracity = ""
    text = []
    links = []
    images = []

    extracting = ""
    try:
        if page["newscast"] == "Fast Check CL":
            extracting = "title"
            title = soup.find("h1", class_="single-post-title entry-title").text
            if "#" in title:
                extracting = "veracity"
                veracity = title[title.find("#") + 1:]
                extracting = "author"
                author = soup.find("ul", class_="meta ospm-modern clr").find("a").text
                extracting = "text"
                entry = soup.find("div", class_="entry-content clr")
                text = entry.text
                links = [a["href"] for a in soup.find("div", class_="container clr").find_all("a", href=True)]
                # Store image URLs, not Tag objects: the page dict is later
                # written with json.dump, which cannot serialise Tags.
                # NOTE(review): this class filter is compared against the
                # exact string "size-full wp-image-" and may match nothing.
                images = [
                    img["src"]
                    for img in entry.find_all("img", class_="size-full wp-image-", src=True)
                ]
            else:
                print("     No se encontró la veracidad")
                veracity = "No se encontró la veracidad"
                author = "No se encontró la veracidad"
                text = "No se encontró la veracidad"
                links = "No se encontró la veracidad"
                images = "No se encontró la veracidad"

    except Exception as e:
        error = {
            "page": page["link"],
            "error_type": type(e).__name__,
            "error_message": str(e),
            "extracting": extracting,
            "index": index
        }
        pages_with_errors["errors"].append(error)
        # .get(): a missing "newscast" key may be the very error being reported.
        print(f'   [{page.get("newscast")}]---> Error: {e}')
    else:
        # Only claim success when nothing was recorded as an error.
        print("     Revisión sin errores")

    page["author"] = author
    page["veracity"] = veracity
    page["text"] = text
    page["links"] = links
    page["images"] = images

def _fetch_and_extract(pages, headers, status_counts):
    """Download each page in *pages* and extract its data when it answers 200.

    Pages that cannot be fetched, or that answer with another status, get
    ``page["error"] = "True"``. *status_counts* (status code -> count) is
    updated in place for the codes it already tracks.
    """
    for index, page in enumerate(pages):
        url = page["link"]

        try:
            response = requests.get(url, headers=headers)
        except requests.exceptions.RequestException as e:
            print(f'Error: {e} | {url}')
            page["error"] = "True"
            # No response exists for this page: do not fall through and reuse
            # the previous iteration's response.
            continue

        status = response.status_code
        if status in status_counts:
            status_counts[status] += 1

        if status != 200:
            print(f'{Fore.RED} {status} {Style.RESET_ALL} | {url}')
            page["error"] = "True"
        else:
            print(f'{Fore.GREEN} {status} {Style.RESET_ALL} | {url}')
            extract_data(response, page, index)


def initial_request():
    """Fetch every Fast Check page of both searches and extract its data.

    Works on the module-level ``fast_check_1st`` and ``fast_check_2nd`` dicts
    (their ``"pages"`` lists are updated in place) and prints a summary of
    how many requests returned 200, 403, 404 and 500.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    status_counts = {200: 0, 403: 0, 404: 0, 500: 0}

    _fetch_and_extract(fast_check_1st["pages"], headers, status_counts)
    _fetch_and_extract(fast_check_2nd["pages"], headers, status_counts)

    print("Resume: ")
    print(f"Status 200: {status_counts[200]}")
    print(f"Status 403: {status_counts[403]}")
    print(f"Status 404: {status_counts[404]}")
    print(f"Status 500: {status_counts[500]}")

# Download and parse all Fast Check pages, then persist the updated page
# lists and the collected error records.
initial_request()

for output_path, payload in (
    ("fast_check_data/1st_fast_check_pages.json", fast_check_1st),
    ("fast_check_data/2nd_fast_check_pages.json", fast_check_2nd),
    ("pages_with_errors_fast_check.json", pages_with_errors),
):
    with open(output_path, 'w', encoding='utf-8') as file:
        json.dump(payload, file, ensure_ascii=False, indent=4)

# Show Results


In [None]:
def _tally_veracity(entries, true_label):
    """Count entries by verdict: *true_label*, ``"Falso"`` or anything else.

    Returns a ``(true, false, other)`` tuple of counts.
    """
    true_count = false_count = other_count = 0
    for entry in entries:
        # .get(): pages whose download failed never received a "veracity"
        # key; they are counted as "other" instead of raising KeyError.
        verdict = entry.get("veracity")
        if verdict == true_label:
            true_count += 1
        elif verdict == "Falso":
            false_count += 1
        else:
            other_count += 1
    return true_count, false_count, other_count


total_pages = 0

# Fact Checking UC marks true claims as "Verdadero".
# NOTE: finer labels (e.g. "Creíble", "No es creíble", "Impreciso",
# "Engañoso") are not broken out separately; they all count as "other".
with open('fact_checking_data/fact_checking_links.json', 'r', encoding='utf-8') as file:
    json_pages_info = json.load(file)
total_pages += len(json_pages_info["facts"])
print(len(json_pages_info["facts"]))
fact_checking_true, fact_checking_false, fact_checking_other = _tally_veracity(
    json_pages_info["facts"], "Verdadero"
)

# Fast Check marks true claims as "Real".
with open('fast_check_data/1st_fast_check_pages.json', 'r', encoding='utf-8') as file:
    json_pages_info = json.load(file)
total_pages += len(json_pages_info["pages"])
print(len(json_pages_info["pages"]))
fast_check_1st_true, fast_check_1st_false, fast_check_1st_other = _tally_veracity(
    json_pages_info["pages"], "Real"
)

with open('fast_check_data/2nd_fast_check_pages.json', 'r', encoding='utf-8') as file:
    json_pages_info = json.load(file)
total_pages += len(json_pages_info["pages"])
print(len(json_pages_info["pages"]))
fast_check_2nd_true, fast_check_2nd_false, fast_check_2nd_other = _tally_veracity(
    json_pages_info["pages"], "Real"
)

print(f"Total pages {total_pages}")
print("Fast Check 1st")
print(f"True: {fast_check_1st_true}")
print(f"False: {fast_check_1st_false}")
print(f"Other: {fast_check_1st_other}")
print("Fast Check 2nd")
print(f"True: {fast_check_2nd_true}")
print(f"False: {fast_check_2nd_false}")
print(f"Other: {fast_check_2nd_other}")
print("Fact Checking")
print(f"True: {fact_checking_true}")
print(f"False: {fact_checking_false}")
print(f"Other: {fact_checking_other}")
print(f"suma : {fact_checking_false + fact_checking_true + fact_checking_other}")