# Imports

In [2]:
import pandas as pd
import numpy as np
import csv
import time
from tqdm import tqdm

from trafilatura.sitemaps import sitemap_search
from trafilatura import fetch_url, extract, bare_extraction

import os
from htmldate import find_date

# Data Collection

## Scraping Functions

In [18]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Look up the URLs listed in a website's sitemap.

    The lookup is delegated to Trafilatura's ``sitemap_search``. The result is
    echoed to stdout (prefixed with ``Urls: ``) and then returned unchanged.

    Args:
        resource_url: Address of the website whose sitemap should be searched.

    Returns:
        Whatever ``sitemap_search`` produced for ``resource_url``.
    """
    sitemap_urls = sitemap_search(resource_url)
    print(f'Urls: {sitemap_urls}')
    return sitemap_urls


def extract_article(url: str) -> "str | None":
    """
    Download a page and return its main text as extracted by Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        The result of ``extract(..., favor_precision=True)`` for the downloaded
        page, or ``None`` when the download returned nothing.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded, so there is nothing to hand to the extractor.
        return None

    return extract(downloaded, favor_precision=True)

def extract_date(url: str) -> "str | None":
    """
    Download a page and return the date Trafilatura reports in its metadata.

    Args:
        url: Address of the page to download.

    Returns:
        The ``date`` field of the ``bare_extraction`` result (presumably a date
        string; confirm against the installed Trafilatura version), or ``None``
        when the download fails, extraction yields nothing, or no date is present.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        return None

    metadata = bare_extraction(downloaded, favor_precision=True)
    if metadata is None:
        # Subscripting a None result raised TypeError before this guard existed.
        return None

    if isinstance(metadata, dict):
        return metadata.get('date')

    # NOTE(review): newer Trafilatura releases appear to return a Document-like
    # object rather than a dict; attribute access covers that case. Confirm.
    return getattr(metadata, 'date', None)

def create_dataset(list_of_websites: list, df_original=None, request_delay: float = 0.25) -> pd.DataFrame:
    """
    Build a DataFrame of articles scraped from the sitemaps of the given websites.

    For every website the sitemap URLs are listed with ``get_urls_from_sitemap``.
    Each URL not seen before is passed to ``extract_article`` for its text and to
    ``find_date`` for its date.

    Args:
        list_of_websites: Website URLs whose sitemaps should be crawled.
        df_original: Optional DataFrame from an earlier run. It must have a
            ``url`` column; those URLs are skipped and its rows are kept in the
            returned frame.
        request_delay: Seconds to pause after each processed URL.

    Returns:
        DataFrame with at least the columns ``url``, ``article`` and ``date``,
        with duplicate rows and rows without article text removed.
    """
    columns = ['url', 'article', 'date']
    seen_urls = set(df_original['url']) if df_original is not None else set()

    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        sitemap_urls = get_urls_from_sitemap(website) or []
        # dict.fromkeys() drops repeats but, unlike a set difference, keeps the
        # sitemap order, so the crawl order is the same on every run.
        urls = [url for url in dict.fromkeys(sitemap_urls) if url not in seen_urls]
        # Remember them so a URL listed by a later website is not fetched again.
        seen_urls.update(urls)

        for url in tqdm(urls, desc="URLs"):
            data.append({
                'url': url,
                "article": extract_article(url),
                # NOTE(review): find_date() is given the URL rather than already
                # downloaded HTML, so the page is presumably fetched a second time.
                "date": pd.to_datetime(find_date(url)),
            })
            time.sleep(request_delay)

    # Explicit columns keep the schema when nothing was scraped; without them
    # dropna(subset=['article']) below raises KeyError on an empty result.
    df = pd.DataFrame(data, columns=columns)
    if df_original is not None:
        # Earlier rows come first, so keep='first' prefers them over new ones.
        df = pd.concat([df_original, df]).drop_duplicates(subset=['url'], keep='first')

    df = df.drop_duplicates()
    df = df.dropna(subset=['article'])

    return df


## Dataset 1

In [19]:
# Location of the raw articles file written by earlier runs of this notebook.
RAW_ARTICLES_CSV = '/workspaces/Project-Uchumi/data/raw/articles.csv'

# Websites whose sitemaps are crawled; disabled sources stay commented out.
list_of_websites = [
    "https://nation.africa/kenya",
    #"https://www.standardmedia.co.ke/",
    "https://www.businessdailyafrica.com/",
    #"https://www.pd.co.ke/",
    #"https://www.citizen.digital/",  # works, but is slow
    #"https://www.nationmedia.com/brands/daily-nation/",
    #"https://www.the-star.co.ke/"
]

# Start from the previously saved articles, or from an empty frame with the
# expected columns when no file has been written yet.
try:
    previous_articles = pd.read_csv(RAW_ARTICLES_CSV)
except FileNotFoundError:
    previous_articles = pd.DataFrame(columns=['url', 'article', 'date'])

# create_dataset() receives the earlier rows as its second argument and returns
# them together with any newly scraped ones.
df = create_dataset(list_of_websites, previous_articles)

Websites:   0%|          | 0/2 [00:00<?, ?it/s]

Urls: ['https://nation.africa/kenya/counties/kakamega/kakamega-tale-dead-body-deadly-clash-dead-brother-4677512', 'https://nation.africa/kenya/counties/meru/why-meru-county-has-yet-to-pass-a-budget-4677466', 'https://nation.africa/kenya/counties/nakuru/kevin-madanga-kagoni-the-pain-of-a-mother-4677382', 'https://nation.africa/kenya/counties/turkana/a-balancing-act-kenya-walks-tight-rope-to-please-refugees-and-locals-4677318', 'https://nation.africa/kenya/life-and-style/dn2/a-10-step-guide-on-how-to-bamboozle-kenyan-voters-4676676', 'https://nation.africa/kenya/life-and-style/dn2/dear-kitoto-how-do-i-forgive-my-abusive-stepdad--4676670', 'https://nation.africa/kenya/life-and-style/dn2/gen-zs-intolerance-of-status-quo-is-admirable-4676672', 'https://nation.africa/kenya/life-and-style/dn2/octane-ratings-from-the-beach-to-the-mountaintop-4676710', 'https://nation.africa/kenya/life-and-style/dn2/parents-saddled-by-high-costs-cut-back-on-soft-life--4676674', 'https://nation.africa/kenya/news


URLs:   0%|          | 0/25 [00:00<?, ?it/s][A
URLs:   4%|▍         | 1/25 [00:31<12:25, 31.08s/it][A
URLs:   8%|▊         | 2/25 [00:31<05:03, 13.17s/it][A
URLs:  12%|█▏        | 3/25 [00:32<02:42,  7.38s/it][A
URLs:  16%|█▌        | 4/25 [00:32<01:39,  4.72s/it][A
URLs:  20%|██        | 5/25 [00:33<01:05,  3.27s/it][A
URLs:  24%|██▍       | 6/25 [00:34<00:45,  2.39s/it][A
URLs:  28%|██▊       | 7/25 [00:34<00:31,  1.77s/it][A
URLs:  32%|███▏      | 8/25 [00:37<00:37,  2.22s/it][A
URLs:  36%|███▌      | 9/25 [00:38<00:28,  1.75s/it][A
URLs:  40%|████      | 10/25 [00:39<00:23,  1.57s/it][A
URLs:  44%|████▍     | 11/25 [00:40<00:19,  1.38s/it][A
URLs:  48%|████▊     | 12/25 [00:41<00:15,  1.16s/it][A
URLs:  52%|█████▏    | 13/25 [00:42<00:12,  1.01s/it][A
URLs:  56%|█████▌    | 14/25 [00:42<00:09,  1.11it/s][A
URLs:  60%|██████    | 15/25 [00:43<00:08,  1.15it/s][A
URLs:  64%|██████▍   | 16/25 [00:44<00:07,  1.24it/s][A
URLs:  68%|██████▊   | 17/25 [00:45<00:08,  1.10

Urls: ['https://www.businessdailyafrica.com/bd/sponsored/oil-and-gold-markets-on-edge-ahead-of-key-economic-releases-4677474', 'https://www.businessdailyafrica.com/bd/opinion-analysis/columnists/how-the-paints-industry-can-ride-the-sustainability-wave-4676816', 'https://www.businessdailyafrica.com/bd/corporate/companies/equity-pilots-insurance-in-drc-ahead-of-regional-rollout-4677116', 'https://www.businessdailyafrica.com/bd/economy/schools-remain-closed-on-fear-of-violent-protests-4677284', 'https://www.businessdailyafrica.com/bd/opinion-analysis/columnists/stringent-policies-drive-mushrooming-black-market-4676800', 'https://www.businessdailyafrica.com/bd/economy/inside-sh1bn-wage-bill-dilemma-as-new-pay-falls-due-4677166', 'https://www.businessdailyafrica.com/bd/markets/real-estate/ncba-stanbic-tap-sh1-8bn-home-loans-4677102', 'https://www.businessdailyafrica.com/bd/economy/explainer-the-appropriation-bill-and-why-it-matters-4677196', 'https://www.businessdailyafrica.com/bd/lifestyle


URLs:   0%|          | 0/25 [00:00<?, ?it/s][A
URLs:   4%|▍         | 1/25 [00:31<12:26, 31.09s/it][A
URLs:   8%|▊         | 2/25 [00:32<05:06, 13.35s/it][A
URLs:  12%|█▏        | 3/25 [00:32<02:44,  7.48s/it][A
URLs:  16%|█▌        | 4/25 [00:33<01:42,  4.90s/it][A
URLs:  20%|██        | 5/25 [00:34<01:08,  3.42s/it][A
URLs:  24%|██▍       | 6/25 [00:35<00:50,  2.66s/it][A
URLs:  28%|██▊       | 7/25 [00:36<00:38,  2.12s/it][A
URLs:  32%|███▏      | 8/25 [00:37<00:30,  1.77s/it][A
URLs:  36%|███▌      | 9/25 [00:38<00:24,  1.53s/it][A
URLs:  40%|████      | 10/25 [00:39<00:20,  1.38s/it][A
URLs:  44%|████▍     | 11/25 [00:40<00:16,  1.19s/it][A
URLs:  48%|████▊     | 12/25 [00:40<00:13,  1.04s/it][A
URLs:  52%|█████▏    | 13/25 [00:41<00:11,  1.02it/s][A
URLs:  56%|█████▌    | 14/25 [00:42<00:10,  1.01it/s][A
URLs:  60%|██████    | 15/25 [00:44<00:12,  1.27s/it][A
URLs:  64%|██████▍   | 16/25 [00:45<00:10,  1.13s/it][A
URLs:  68%|██████▊   | 17/25 [00:46<00:08,  1.07

## Relevance Filtering

In [20]:
# Public debt relevant terms
# relevant_keywords = ['public debt', 'public budget', 'public finance management', 'budget trends', 'budget theft']
# relevant_keywords = ['debt', 'budget', 'finance', 'trends', 'theft','government']

# Keep only the articles that mention at least one relevant keyword
# df_filtered = df[df['article'].str.contains('|'.join(relevant_keywords), case=False)]

## Exporting Dataset to CSV


In [21]:
# `df` already contains the rows that were loaded from this file before scraping,
# so the file is rewritten in full. Appending (mode='a') would write every
# previously saved row again on each run and fill the CSV with duplicates.
os.makedirs("/workspaces/Project-Uchumi/data/raw", exist_ok=True)
df.to_csv("/workspaces/Project-Uchumi/data/raw/articles.csv", index=False)