# Imports

In [None]:
import csv
import os
import time
from typing import Optional

import numpy as np
import pandas as pd
from tqdm import tqdm
from trafilatura import fetch_url, extract, bare_extraction
from trafilatura.sitemaps import sitemap_search

# Data Collection

## Scraping Functions

In [None]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Collect the URLs that Trafilatura's sitemap search finds for a website.

    Args:
        resource_url: Address of the website whose sitemap is searched.

    Returns:
        Whatever `sitemap_search` returns for `resource_url`; the result is
        also printed to standard output before being returned.
    """
    discovered_urls = sitemap_search(resource_url)
    # Echo the full result so the run log shows what is about to be scraped.
    print(f'Urls: {discovered_urls}')
    return discovered_urls


def extract_article(url: str) -> Optional[str]:
    """
    Download a page and extract its main text with Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        The result of `extract(..., favor_precision=True)` for the downloaded
        page, or None when the download returned nothing.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded, so there is nothing to extract from.
        return None

    return extract(downloaded, favor_precision=True)

def extract_date(url: str) -> Optional[str]:
    """
    Download a page and return the date Trafilatura extracts from it.

    Args:
        url: Address of the page to download.

    Returns:
        The value stored under the 'date' key of the `bare_extraction`
        result, or None when the download or the extraction yields nothing.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        # Download failed; there is no document to read a date from.
        return None

    extracted = bare_extraction(downloaded, favor_precision=True)
    if extracted is None:
        # Subscripting a failed extraction would raise TypeError.
        return None

    return extracted['date']

def create_dataset(list_of_websites: list, df_original=None) -> pd.DataFrame:
    """
    Build a DataFrame of articles scraped from the sitemaps of several websites.

    Each URL returned by `get_urls_from_sitemap` becomes one row with the
    columns 'url', 'article' (from `extract_article`) and 'date' (from
    `extract_date`, converted with `pd.to_datetime`).

    Args:
        list_of_websites: Website addresses whose sitemaps are searched.
        df_original: Optional DataFrame from an earlier run. It must have a
            'url' column; URLs listed there are not scraped again and the
            newly scraped rows are appended to it.

    Returns:
        The combined DataFrame with exact duplicate rows and rows containing
        missing values removed.
    """
    # Build the lookup once; a set gives O(1) membership tests inside the loop.
    known_urls = set() if df_original is None else set(df_original['url'])

    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            if url in known_urls:
                continue
            # Remember the URL so it is not downloaded again if another
            # sitemap lists it too.
            known_urls.add(url)
            data.append({
                'url': url,
                "article": extract_article(url),
                "date": pd.to_datetime(extract_date(url)),
            })
            # Pause between pages (presumably to avoid hammering the server).
            time.sleep(0.5)

    df = pd.DataFrame(data)
    if df_original is not None:
        # Rows identical to existing ones are removed by drop_duplicates
        # below, so a plain concatenation is enough here.
        df = pd.concat([df_original, df])

    df = df.drop_duplicates()
    df = df.dropna()

    return df


## Dataset 1

In [None]:
# News websites whose sitemaps are scraped for this dataset.
list_of_websites = [
    "https://nation.africa/kenya",
    "https://www.standardmedia.co.ke/",
    "https://www.businessdailyafrica.com/",
    "https://www.pd.co.ke/",
    "https://www.citizen.digital/",
    "https://www.nationmedia.com/brands/daily-nation/",
    "https://www.the-star.co.ke/"
]

# `df` only exists if an earlier run in this session already built a dataset.
# On a fresh run there is nothing to extend, so None is passed instead of
# failing with NameError.
df = create_dataset(list_of_websites, globals().get("df"))

## Filtering

In [None]:
# Keywords used to keep only articles related to public debt.
# Narrower multi-word alternative:
# relevant_keywords = ['public debt', 'public budget', 'public finance management', 'budget trends', 'budget theft']
relevant_keywords = ['debt', 'budget', 'finance', 'trends', 'theft']

# Keep the rows whose article text matches any keyword; the keywords are
# joined into one case-insensitive regex alternation.
keyword_pattern = '|'.join(relevant_keywords)
matches_keyword = df['article'].str.contains(keyword_pattern, case=False)
df = df[matches_keyword]

## Exporting Dataset to CSV


In [None]:
# Append the filtered rows to dataset.csv. The header row is written only
# when the file is missing or still empty, so repeated runs do not insert
# header lines in the middle of the data.
output_path = "dataset.csv"
needs_header = not os.path.exists(output_path) or os.path.getsize(output_path) == 0
df.to_csv(output_path, index=False, mode='a', header=needs_header)