In [4]:
import json
import os
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
def get_page_content(url, timeout=30, retry_delay=60, max_retries=None):
    """Download ``url`` and return its HTML parsed with BeautifulSoup.

    A request that fails at the network level is retried after sleeping
    ``retry_delay`` seconds. With the default ``max_retries=None`` the retry
    loop is unbounded, so the call blocks until the server can be reached.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a stalled connection would block forever and the retry
            logic would never get a chance to run.
        retry_delay: Seconds to sleep between failed attempts.
        max_retries: Number of retries allowed after a failed attempt;
            ``None`` retries forever.

    Returns:
        The parsed ``BeautifulSoup`` tree when the server answers with HTTP
        200; ``None`` for any other status code or when ``max_retries`` is
        exhausted.
    """
    failures = 0
    while True:
        try:
            page = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # Only request-level failures are retried; anything else is a
            # programming error and is allowed to surface.
            failures += 1
            if max_retries is not None and failures > max_retries:
                return None
            time.sleep(retry_delay)
            continue

        if page.status_code != 200:
            return None
        return BeautifulSoup(page.content, "html.parser")

In [6]:
def _first_meta_content(soup, attrs):
    """Return the ``content`` attribute of the first tag matching ``attrs``.

    Returns ``None`` when no tag matches or when the first match has no
    ``content`` attribute.
    """
    matches = soup.find_all(attrs=attrs)
    return matches[0].get("content") if matches else None


def get_data(soup, load_date, site_ranking):
    """Build one flat record describing a fact-check article page.

    The page is only processed when, after discarding its last
    ``application/ld+json`` script, exactly one such script with a string
    payload remains. Pages with no JSON-LD script at all are skipped instead
    of raising.

    Args:
        soup: Parsed article page (BeautifulSoup tree).
        load_date: Value stored unchanged under ``"load_date"``.
        site_ranking: Value stored unchanged under ``"site_ranking"``.

    Returns:
        A dict whose keys, in insertion order, are ``load_date``,
        ``site_ranking``, ``site_name``, ``url``, ``site_language``,
        ``site_content_url``, ``site_url``, ``site_date_published``,
        ``title``, ``description``, ``keywords``, ``tag``, ``claim_reviewed``,
        ``review_body`` and ``claim``; fields that cannot be found stay
        ``None`` (``tag`` is never filled). ``None`` is returned when the
        JSON-LD precondition above is not met.

    Raises:
        json.JSONDecodeError: If the remaining JSON-LD payload is not valid
            JSON.
    """
    scripts = soup.find_all("script", {"type": "application/ld+json"})
    # Discard the last JSON-LD block. Slicing (rather than ``del``) keeps
    # this safe when the page has no JSON-LD block at all.
    scripts = scripts[:-1]
    if len(scripts) != 1 or not scripts[0].string:
        return None

    # Key order matters: rows are appended to a CSV whose header is written
    # only once, from the first record.
    record = {
        "load_date": load_date,
        "site_ranking": site_ranking,
        "site_name": None,
        "url": None,
        "site_language": None,
        "site_content_url": None,
        "site_url": "https://www.aosfatos.org/",
        "site_date_published": None,
        "title": None,
        "description": None,
        "keywords": None,
        "tag": None,
        "claim_reviewed": None,
        "review_body": None,
        "claim": None,
    }

    script_json = json.loads(scripts[0].string, strict=False)
    # A JSON-LD payload may be something other than an object (e.g. a list);
    # in that case none of the fields below can be read from it.
    if isinstance(script_json, dict):
        record["claim_reviewed"] = script_json.get("claimReviewed")
        record["review_body"] = script_json.get("reviewBody")
        publisher = script_json.get("publisher")
        if isinstance(publisher, dict) and "url" in publisher:
            record["site_url"] = publisher["url"]

    site_name = soup.select("h1 > a")
    record["site_name"] = site_name[0].string if site_name else None

    record["url"] = _first_meta_content(soup, {"property": "og:url"})

    html_tag = soup.html
    # ``get`` tolerates an <html> element without a ``lang`` attribute.
    record["site_language"] = html_tag.get("lang") if html_tag is not None else None

    record["site_content_url"] = _first_meta_content(soup, {"property": "og:image"})

    date_published = soup.find_all(attrs={"class": "publish-date"})
    record["site_date_published"] = date_published[0].string if date_published else None

    record["title"] = _first_meta_content(soup, {"property": "og:title"})
    record["description"] = _first_meta_content(soup, {"property": "og:description"})
    record["keywords"] = _first_meta_content(soup, {"name": "keywords"})

    article = soup.select_one("body > main > section > div.default-container > article")
    if article is not None:
        # Remove the article's first <div>, then its first remaining <h1>,
        # so that neither contributes to the extracted text.
        first_div = article.div
        if first_div is not None:
            first_div.decompose()

        heading = article.h1
        if heading is not None:
            heading.decompose()

        record["claim"] = article.text

    return record

In [7]:
def get_number_pages(url):
    """Read the page count of a listing from its pagination links.

    The hrefs of every ``span > a`` element are collected and the text after
    the last ``=`` of the second-to-last href is taken as the page count.

    Args:
        url: Address of the listing page.

    Returns:
        The page count as a string (callers convert it with ``int``):
        ``"0"`` when the listing page could not be fetched, ``"1"`` when the
        pagination links are missing or do not end in a number.
    """
    soup = get_page_content(url)
    if soup is None:
        # Nothing could be downloaded, so there are no pages to visit.
        return "0"

    hrefs = [anchor.get('href') for anchor in soup.select("span > a")]
    if len(hrefs) < 2 or not hrefs[-2]:
        # No usable pagination block: treat the listing as a single page.
        return "1"

    last_page = hrefs[-2].split('=')[-1]
    return last_page if last_page.isdigit() else "1"

def get_page_links(url, page):
    """Collect the article links shown on one page of a listing.

    Args:
        url: Address of the listing, without the ``page`` query parameter.
        page: Page number appended to ``url`` as ``?page=<page>``.

    Returns:
        The ``href`` values of every element whose class attribute is
        ``"entry-item-card entry-content"``, in document order. Elements
        without an ``href`` are skipped, and an empty list is returned when
        the page could not be fetched.
    """
    soup = get_page_content(f"{url}?page={page}")
    if soup is None:
        return []

    cards = soup.find_all(attrs={"class": "entry-item-card entry-content"})
    hrefs = (card.get('href') for card in cards)
    return [href for href in hrefs if href]

In [8]:
def convert_dict_dataframe(dict):
    """Wrap a single record in a one-row ``DataFrame``.

    Args:
        dict: Mapping of column name to value for the row.

    Returns:
        A ``DataFrame`` with one row whose columns are the mapping's keys.
    """
    single_row = [dict]
    return pd.DataFrame.from_dict(single_row)

def write_csv(df, path, header, mode):
    """Write ``df`` to ``path`` as CSV, without the index column.

    When ``path`` is a filesystem path, its parent directory is created first
    if it does not exist, so the first write on a fresh checkout does not
    fail. Any other kind of target (e.g. an open buffer) is passed to pandas
    untouched.

    Args:
        df: Frame to serialise.
        path: Destination file path or buffer accepted by ``DataFrame.to_csv``.
        header: Whether to emit the column-name row.
        mode: File open mode, e.g. ``'w'`` to overwrite or ``'a'`` to append.
    """
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    df.to_csv(path, index=False, header=header, mode=mode)

In [9]:
# One entry per fact-check category to scrape: the listing URL, the label
# stored with every record, the CSV file to write and the prefix prepended to
# the article links found on the listing.
site_list = [
    {
        "url": "https://www.aosfatos.org/noticias/checamos/verdadeiro/",
        "site_ranking": "Verdadeiro",
        "path": "../dados/brutos/raw_aosfatos/verdadeiro_aosfatos.csv",
        "base_url": "https://www.aosfatos.org"
    },
    {
        "url": "https://www.aosfatos.org/noticias/checamos/nao-e-bem-assim/",
        "site_ranking": "Distorcido",
        "path": "../dados/brutos/raw_aosfatos/distorcido_aosfatos.csv",
        "base_url": "https://www.aosfatos.org"
    },
    {
        "url": "https://www.aosfatos.org/noticias/checamos/falso/",
        "site_ranking": "Falso",
        "path": "../dados/brutos/raw_aosfatos/fake_aosfatos.csv",
        "base_url": "https://www.aosfatos.org"
    },
]

# A single timestamp is shared by every record produced in this run.
now = datetime.now()
load_date = now.strftime("%Y-%m-%d %H:%M:%S")

required_keys = ("url", "site_ranking", "path", "base_url")

for site in site_list:
    # Skip entries that are empty or incomplete.
    if not site or any(key not in site for key in required_keys):
        continue

    url = site["url"]
    site_ranking = site["site_ranking"]
    path = site["path"]
    base_url = site["base_url"]

    number_pages = get_number_pages(url)

    # The first record (re)creates the file with a header row; every later
    # record is appended without one.
    mode = 'w'
    header = True

    # Listing pages are numbered from 1 up to and including ``number_pages``;
    # ``range`` excludes its stop value, hence the ``+ 1``.
    for n_page in range(1, int(number_pages) + 1):
        links = get_page_links(url, n_page)
        for link in links:
            soup = get_page_content(base_url + link)
            if not soup:
                continue

            # Named ``record`` rather than ``dict`` so the builtin is not
            # shadowed for the rest of the notebook session.
            record = get_data(soup, load_date, site_ranking)
            if not record:
                continue

            # Rows are written one at a time so a crash mid-run keeps
            # everything scraped so far.
            df = convert_dict_dataframe(record)
            write_csv(df, path, header, mode)

            mode = 'a'
            header = False