In [1]:
import json
import re
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
def get_page_content(url, timeout=30, retry_delay=60, max_retries=None):
    """Download ``url`` and return it parsed with BeautifulSoup.

    Network-level failures (connection errors, timeouts, ...) are retried
    after sleeping ``retry_delay`` seconds. A response whose status is not
    200 is never retried.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole crawl.
        retry_delay: Seconds to sleep between attempts.
        max_retries: Maximum number of retries after a network failure.
            ``None`` (the default) retries indefinitely.

    Returns:
        The parsed page as a ``BeautifulSoup`` object, or ``None`` when the
        server answered with a non-200 status, the URL itself is malformed,
        or ``max_retries`` was exhausted.
    """
    retries = 0
    while True:
        try:
            page = requests.get(url, timeout=timeout)
        except (
            requests.exceptions.MissingSchema,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL,
        ):
            # A malformed URL fails the same way on every attempt, so
            # retrying would only loop forever.
            return None
        except requests.RequestException:
            if max_retries is not None and retries >= max_retries:
                return None
            retries += 1
            time.sleep(retry_delay)
            continue

        if page.status_code != 200:
            return None
        return BeautifulSoup(page.content, "html.parser")


In [3]:
def get_data(soup, load_date, site_ranking):
    """Extract one article record from a parsed page.

    Metadata is read from the JSON-LD ``<script class="yoast-schema-graph">``
    tag; the article body is read from a fixed CSS selector. Any field that
    cannot be found is left as ``None`` rather than raising, so a page that
    lacks the metadata script still yields a (mostly empty) record.

    Args:
        soup: Parsed page exposing ``find`` and ``select_one``.
        load_date: Value stored verbatim under ``"load_date"``.
        site_ranking: Value stored verbatim under ``"site_ranking"``.

    Returns:
        A dict with the fixed set of keys initialised below.

    Raises:
        json.JSONDecodeError: If the metadata script exists but does not
            contain valid JSON.
    """
    record = {
        "load_date": load_date,
        "site_ranking": site_ranking,
        "site_name": None,
        "url": None,
        "site_language": None,
        "site_content_url": None,
        "site_url": None,
        "site_date_published": None,
        "title": None,
        "description": None,
        "keywords": None,
        "tag": None,
        "claim_reviewed": None,
        "review_body": None,
        "claim": None,
    }

    script = soup.find("script", class_="yoast-schema-graph")
    raw_json = script.string if script is not None else None
    script_json = json.loads(raw_json) if raw_json else {}
    nodes = script_json.get("@graph", []) if isinstance(script_json, dict) else []

    for graph in nodes:
        if not isinstance(graph, dict):
            continue

        # JSON-LD allows "@type" to be a single string or a list of strings.
        node_types = graph.get("@type", [])
        if not isinstance(node_types, list):
            node_types = [node_types]

        if "WebSite" in node_types:
            record["site_name"] = graph.get("name")
            record["site_language"] = graph.get("inLanguage")
            record["site_url"] = graph.get("url")

        if "ImageObject" in node_types:
            record["site_content_url"] = graph.get("contentUrl")
            record["claim_reviewed"] = graph.get("caption")

        if "WebPage" in node_types:
            record["url"] = graph.get("url")
            record["title"] = graph.get("name")
            record["description"] = graph.get("description")
            record["site_date_published"] = graph.get("datePublished")

        if "Article" in node_types:
            record["keywords"] = graph.get("keywords")
            record["tag"] = graph.get("articleSection")

    pages = soup.select_one("#tdi_94 > div > div.vc_column.tdi_97.wpb_column.vc_column_container.tdc-column.td-pb-span8 > div > div.td_block_wrap.tdb_single_content.tdi_99.td-pb-border-top.td_block_template_1.td-post-content.tagdiv-type > div")
    record["claim"] = pages.text if pages else None

    return record
    

In [4]:
def get_number_pages(url):
    """Return the text of the "last" pagination link on the listing at ``url``.

    Args:
        url: Address of the listing page to inspect.

    Returns:
        The link text exactly as it appears in the page (a string; callers
        convert it with ``int``).

    Raises:
        RuntimeError: If the page could not be downloaded or it contains no
            ``<a class="last">`` element. Previously both cases surfaced as
            an opaque ``AttributeError`` on ``None``.
    """
    soup = get_page_content(url)
    if soup is None:
        raise RuntimeError(f"Could not download listing page: {url}")

    last_link = soup.find("a", class_="last")
    if last_link is None:
        raise RuntimeError(f"No 'last' pagination link found on: {url}")

    return last_link.text

def get_page_links(url, page):
    """Collect the article links shown on one listing page.

    Args:
        url: Base address of the listing (without the ``/page/N`` suffix).
        page: Page number appended to ``url`` as ``/page/{page}``.

    Returns:
        The ``href`` of every ``<a class="td-image-wrap">`` on the page, in
        document order. Returns an empty list when the page could not be
        downloaded, so one bad listing page does not abort the whole crawl.
    """
    soup = get_page_content(f"{url}/page/{page}")
    if soup is None:
        return []

    anchors = soup.find_all("a", class_="td-image-wrap")

    # Anchors without an href cannot be followed; skip them instead of
    # raising KeyError.
    return [anchor["href"] for anchor in anchors if anchor.get("href")]

In [5]:
def convert_dict_dataframe(dict):
    """Wrap a single record in a one-row ``DataFrame``.

    Args:
        dict: Mapping of column name to value for the row.

    Returns:
        A ``DataFrame`` with one row and one column per key.
    """
    rows = [dict]
    return pd.DataFrame(rows)

def write_csv(df, path, header, mode):
    """Write ``df`` to the CSV file at ``path`` without its index.

    Args:
        df: Frame to serialise.
        path: Destination file.
        header: Forwarded to ``DataFrame.to_csv``; whether to emit the
            column-name row.
        mode: Forwarded to ``DataFrame.to_csv``; file open mode such as
            ``'w'`` or ``'a'``.
    """
    csv_options = {"index": False, "header": header, "mode": mode}
    df.to_csv(path, **csv_options)

In [6]:
# Listings to crawl: where to start, which label to attach to every record
# and which CSV file receives the rows.
site_list = [
    {
        "url": "https://www.e-farsas.com/secoes/verdadeiro-2",
        "site_ranking": "Verdadeiro",
        "path": "../dados/brutos/raw_efarsas/verdadeiro_efarsas.csv"
    },
    {
        "url": "https://www.e-farsas.com/secoes/falso-2",
        "site_ranking": "Falso",
        "path": "../dados/brutos/raw_efarsas/fake_efarsas.csv"
    },
]

# One timestamp shared by every record produced in this run.
now = datetime.now()
load_date = now.strftime("%Y-%m-%d %H:%M:%S")

for site in site_list:
    # Entries missing any required key are skipped.
    if site and "url" in site and "site_ranking" in site and "path" in site:
        url = site["url"]
        site_ranking = site["site_ranking"]
        path = site["path"]

        number_pages = get_number_pages(url)

        # The first row written for a site (re)creates the file with a
        # header; every later row is appended without one.
        mode = 'w'
        header = True

        # number_pages is the text of the "last" pagination link, so the
        # range must include it; range(1, n) would skip the final page.
        for n_page in range(1, int(number_pages) + 1):
            links = get_page_links(url, n_page)
            for link in links:
                soup = get_page_content(link)

                if soup:
                    # Named `record`, not `dict`, so the builtin is not
                    # shadowed for the rest of the kernel session.
                    record = get_data(soup, load_date, site_ranking)
                    df = convert_dict_dataframe(record)
                    write_csv(df, path, header, mode)

                    mode = 'a'
                    header = False