# Gets pages on Google News published between November 15, 2019 and December 17, 2023


In [40]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
from dateutil.relativedelta import relativedelta
from colorama import Fore, Style


# --- Google News query configuration -------------------------------------
# The words of the query are joined with '+' before being placed in the
# search URL.
search_term = '+'.join('Constitución Chile Fast Check'.split(' '))

# Number of result pages to request (the request loop iterates over
# range(0, num_pages)).
num_pages = 20

# Bounds of the custom date range sent in the cd_min / cd_max URL fields.
start_date, end_date = '15/11/2019', '17/12/2023'

# Accumulator for every scraped result; serialised to JSON at the end.
json_pages_info = dict(pages=[])

# HTTP headers sent with every search request.
headers = {'User-Agent': 'Chrome/58.0.3029.110'}

# Counter of results discarded by the date filter.
skipped_pages = 0


def _tag_text(tag):
    """Return the text of a BeautifulSoup tag, or None when the tag is missing."""
    return tag.text if tag is not None else None


# Request every Google News result page and collect one record per result
# into json_pages_info["pages"]. A failing request or a malformed result is
# reported and skipped instead of aborting the whole run.
for page in range(0, num_pages):
    url = f'https://www.google.com/search?q={search_term}&tbm=nws&start={page*10}&tbs=cdr:1,cd_min:{start_date},cd_max:{end_date}'
    try:
        # A timeout keeps one stalled connection from blocking the loop forever.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.exceptions.RequestException as e:
        print(f"Error al acceder a la página {page + 1}: {e}")
        continue

    if response.status_code != 200:
        print(f"Error al acceder a la página {page + 1}: {response.status_code}")
        continue

    print("status 200")

    response.encoding = 'ISO-8859-1'
    content = response.content.decode('ISO-8859-1')
    soup = BeautifulSoup(content, 'html.parser')

    # Debug dump of the last fetched page (overwritten on every iteration).
    with open('output.html', 'w', encoding='utf-8') as f:
        f.write(soup.prettify())

    pages_section = soup.find_all('div', class_='Gx5Zad xpd EtOod pkphOe')
    print(len(pages_section))

    for page_section in pages_section:
        link_section = page_section.find('a', href=True)
        href = link_section['href'] if link_section is not None else ""

        # Only Google redirect links ("/url?q=<target>&...") carry a result URL.
        if not href.startswith("/url?q="):
            print("skipped page 2")
            continue

        # The date span holds space-separated tokens; a result is kept only
        # when the third token is "meses" and the second is a count >= 9.
        # NOTE(review): the reason for the 9-month threshold is not visible
        # here - confirm it is still the intended filter.
        date_tag = page_section.find('span', class_='r0bn4c rQMQod')
        date_parts = date_tag.text.split(" ") if date_tag is not None else []
        old_enough = (
            len(date_parts) >= 3
            and date_parts[2] == "meses"
            and date_parts[1].isdigit()
            and int(date_parts[1]) >= 9
        )
        if not old_enough:
            skipped_pages += 1
            print("skipped page")
            continue

        # Approximate publication month: now minus the reported month count.
        approx_date = datetime.now() - relativedelta(months=int(date_parts[1]))
        approx_label = approx_date.strftime("%B") + ", " + str(approx_date.year)

        newscast = page_section.find('div', class_='BNeawe UPmit AP7Wnd lRVwie')
        title = page_section.find('div', class_='BNeawe vvjwJb AP7Wnd')
        description = page_section.find('div', class_='BNeawe s3v9rd AP7Wnd')
        image = page_section.find('img', class_="h1hFNe")
        # Target URL = text between "/url?q=" and the first "&".
        link = href.split("/url?q=")[1].split("&")[0]

        link_info = {
            "newscast": _tag_text(newscast),
            "title": _tag_text(title),
            "description": _tag_text(description),
            "date": f"(preliminary) {approx_label}",
            "link": link,
            "image": image.get('src') if image is not None else None,
            "author": "not yet extracted",
            "text": "not yet extracted",
            "links": "not yet extracted"
        }
        json_pages_info["pages"].append(link_info)

# Report how many results were collected, then persist them as JSON.
total_found = len(json_pages_info['pages'])
print(f"Se han encontrado: {total_found} páginas")

with open("json_pages_info_fast_check.json", 'w', encoding='utf-8') as file:
    json.dump(
        json_pages_info,
        file,
        ensure_ascii=False,  # keep accented characters readable in the file
        indent=4,
    )


status 200
9
skipped page
status 200
4
skipped page
status 200
4
status 200
9
status 200
5
status 200
9
status 200
9
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
status 200
0
Se han encontrado: 47 páginas


# Extract data


In [41]:

# Collects one record per article whose parsing raised an exception;
# extract_data appends to the "errors" list.
pages_with_errors = dict(errors=[])

# Extract date, author, veracity, text, links and images from one article
def extract_data(response, page, index):
    """Parse a downloaded article and store its fields in ``json_pages_info``.

    Only pages whose ``newscast`` is "Fast Check CL" are parsed; every other
    page keeps the empty defaults. When the article title contains no "#",
    all fields are set to the "No se encontró la veracidad" placeholder.

    Any exception raised while parsing is recorded in ``pages_with_errors``
    together with the stage that was being extracted; fields extracted
    before the failure are still stored.

    Args:
        response: Response object returned by ``requests.get`` for the page.
        page: Page record; its "newscast" and "link" keys are read.
        index: Position of the record inside ``json_pages_info["pages"]``.

    Returns:
        None. Results are written into ``json_pages_info["pages"][index]``.
    """
    # Hand the raw bytes to BeautifulSoup so it detects the page encoding
    # itself; forcing ISO-8859-1 garbles accented characters on UTF-8 pages.
    soup = BeautifulSoup(response.content, 'html.parser')

    entry = json_pages_info["pages"][index]

    date = soup.find('time')
    if date is not None:
        entry["date"] = date.text

    author = ""
    veracity = ""
    text = []
    links = []
    images = []

    # Name of the field being extracted, reported if parsing fails.
    extracting = ""
    try:
        # NOTE: only the "Fast Check CL" layout is handled.
        if page["newscast"] == "Fast Check CL":
            extracting = "title"
            title = soup.find("h1", class_="single-post-title entry-title").text
            if "#" in title:
                extracting = "veracity"
                # The verdict is whatever follows the first "#" in the title.
                veracity = title[title.find("#") + 1:]
                extracting = "author"
                author = soup.find("ul", class_="meta ospm-modern clr").find("a").text
                extracting = "text"
                text = soup.find("div", class_="entry-content clr").text
                extracting = "links"
                links = [a["href"] for a in soup.find("div", class_="container clr").find_all("a", href=True)]
                extracting = "images"
                # Store image URLs rather than Tag objects, which json.dump
                # cannot serialise.
                # NOTE(review): a multi-word class_ string is matched against
                # the exact class attribute value - confirm this filter
                # actually matches the article images.
                images = [
                    img["src"]
                    for img in soup.find("div", class_="entry-content clr").find_all("img", class_="size-full wp-image-")
                    if img.get("src")
                ]
            else:
                print("     No se encontró la veracidad")
                veracity = "No se encontró la veracidad"
                author = "No se encontró la veracidad"
                text = "No se encontró la veracidad"
                links = "No se encontró la veracidad"
                images = "No se encontró la veracidad"

    except Exception as e:
        error = {
            "page": page["link"],
            "error_type": type(e).__name__,
            "error_message": str(e),
            "extracting": extracting,
            "index": index
        }
        pages_with_errors["errors"].append(error)
        print(f'   [{page["newscast"]}]---> Error: {e}')
    else:
        # Only claim a clean review when no exception was raised.
        print("     Revisión sin errores")

    entry["author"] = author
    entry["veracity"] = veracity
    entry["text"] = text
    entry["links"] = links
    entry["images"] = images

# Check the status code of the URLs and extract the data
def initial_request():
    """Download every saved article URL and extract its data.

    Reloads ``json_pages_info`` from "json_pages_info_fast_check.json" into
    the module-level variable, so that the records updated here, the records
    updated by ``extract_data`` and the object dumped afterwards are all the
    same list. Each URL is requested once:

    * status 200: the response is passed to ``extract_data``;
    * any other status, or a failed request: the record gets
      ``"error": "True"``.

    A summary of the 200 / 403 / 404 / 500 counts is printed at the end.

    Returns:
        None.
    """
    # Rebind the global instead of a local copy; a local would make the
    # "error" flags set below invisible to the code that saves the file.
    global json_pages_info
    with open('json_pages_info_fast_check.json', 'r', encoding='utf-8') as file:
        json_pages_info = json.load(file)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }

    # Only these status codes are reported in the summary.
    status_counts = {200: 0, 403: 0, 404: 0, 500: 0}

    # Loop through the URLs and check the status code of each one.
    for index, page in enumerate(json_pages_info["pages"]):
        url = page["link"]

        try:
            # A timeout keeps one stalled server from blocking the whole run.
            response = requests.get(url, headers=headers, timeout=30)
        except requests.exceptions.RequestException as e:
            print(f'Error: {e} | {url}')
            page["error"] = "True"
            # No response exists for this URL; do not fall through and reuse
            # the response of a previous iteration.
            continue

        status = response.status_code
        if status in status_counts:
            status_counts[status] += 1

        if status != 200:
            print(f'{Fore.RED} {status} {Style.RESET_ALL} | {url}')
            page["error"] = "True"
        else:
            print(f'{Fore.GREEN} {status} {Style.RESET_ALL} | {url}')
            extract_data(response, page, index)

    print("Resume: ")
    print(f"Status 200: {status_counts[200]}")
    print(f"Status 403: {status_counts[403]}")
    print(f"Status 404: {status_counts[404]}")
    print(f"Status 500: {status_counts[500]}")

# Run the download/extraction pass over every saved page.
initial_request()

# overwrite the json file with the new data
with open("json_pages_info_fast_check.json", 'w', encoding='utf-8') as file:
    json.dump(
        json_pages_info,
        file,
        ensure_ascii=False,
        indent=4,
    )

# Save the parsing failures collected during the pass in their own file.
with open("pages_with_errors_fast_check.json", 'w', encoding='utf-8') as file:
    json.dump(
        pages_with_errors,
        file,
        ensure_ascii=False,
        indent=4,
    )

[32m 200 [0m | https://www.fastcheck.cl/2023/12/22/el-segundo-proceso-constitucional-tuvo-un-79-menos-desinformacion-que-el-primer-proceso/
     No se encontró la veracidad
     Revisión sin errores
[32m 200 [0m | https://www.fastcheck.cl/2023/11/29/secretario-general-de-la-onu-dijo-si-chile-aprueba-la-nueva-constitucion-no-se-podra-aplicar-la-agenda-2030-falso/
     Revisión sin errores
[32m 200 [0m | https://www.fastcheck.cl/2023/11/23/not-check-sera-inconstitucional-la-penalizacion-al-narcotrafico/
     No se encontró la veracidad
     Revisión sin errores
[32m 200 [0m | https://www.fastcheck.cl/2023/11/14/comparador-cuales-son-las-diferencias-entre-la-constitucion-vigente-y-la-propuesta-constitucional/
     No se encontró la veracidad
     Revisión sin errores
[32m 200 [0m | https://www.fastcheck.cl/2023/10/31/en-la-nueva-constitucion-se-acaba-el-derecho-a-indemnizacion-por-anos-de-servicio-falso/
     Revisión sin errores
[32m 200 [0m | https://www.fastcheck.cl/2023/11

In [28]:
# Reload the saved results and show how many page records the file holds.
with open('json_pages_info_fast_check.json', 'r', encoding='utf-8') as file:
    json_pages_info = json.load(file)
    saved_pages = json_pages_info["pages"]
    print(len(saved_pages))

46
