# Imports

In [1]:
import csv
import os
import time
from typing import Optional

import numpy as np
import pandas as pd
from tqdm import tqdm
from trafilatura import fetch_url, extract, bare_extraction
from trafilatura.sitemaps import sitemap_search

# Data Collection

## Scraping Functions

In [2]:
def get_urls_from_sitemap(resource_url: str) -> list:
    """
    Look up the sitemap of a website and return the URLs it lists.

    Args:
        resource_url: Address handed to Trafilatura's ``sitemap_search``.

    Returns:
        The value returned by ``sitemap_search`` for ``resource_url``.

    Side effects:
        Prints the discovered URLs to stdout.
    """
    sitemap_urls = sitemap_search(resource_url)
    # Echo what was found so the run log shows which pages will be scraped.
    print(f'Urls: {sitemap_urls}')
    return sitemap_urls


def extract_article(url: str) -> Optional[str]:
    """
    Download a page and extract its main article text with Trafilatura.

    Args:
        url: Address of the page to download.

    Returns:
        The result of ``extract(..., favor_precision=True)`` on the downloaded
        page, or ``None`` when the download produced nothing to parse.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        # Nothing was downloaded, so there is nothing to extract; report the
        # failure explicitly instead of handing None to the extractor.
        return None

    return extract(downloaded, favor_precision=True)

def extract_date(url: str) -> Optional[str]:
    """
    Download a page and return the publication date Trafilatura finds in it.

    Args:
        url: Address of the page to download.

    Returns:
        The ``date`` field of the ``bare_extraction`` result, or ``None`` when
        the page could not be downloaded, nothing could be extracted, or the
        result carries no date.
    """
    downloaded = fetch_url(url)
    if downloaded is None:
        return None

    metadata = bare_extraction(downloaded, favor_precision=True)
    if metadata is None:
        # Subscripting None would raise TypeError and abort the whole scrape.
        return None

    if isinstance(metadata, dict):
        return metadata.get('date')
    # NOTE(review): some Trafilatura releases appear to return an object
    # rather than a dict here - confirm; attribute access is a safe fallback.
    return getattr(metadata, 'date', None)

def create_dataset(list_of_websites: list, df_original=None) -> pd.DataFrame:
    """
    Scrape the articles listed in each website's sitemap into a DataFrame.

    Args:
        list_of_websites: Website addresses whose sitemaps are searched.
        df_original: Optional DataFrame of previously collected rows. It must
            have a ``url`` column; URLs already present are not scraped again
            and the new rows are merged into it.

    Returns:
        A DataFrame with ``url``, ``article`` and ``date`` columns (plus any
        extra columns of ``df_original``), without duplicate URLs and without
        rows whose article is missing. The columns are present even when
        nothing was scraped.
    """
    columns = ['url', 'article', 'date']

    # Build the lookup once: rebuilding a list for every URL made the skip
    # check quadratic in the number of known articles.
    seen_urls = set() if df_original is None else set(df_original['url'])

    data = []
    for website in tqdm(list_of_websites, desc="Websites"):
        urls = get_urls_from_sitemap(website)
        for url in tqdm(urls, desc="URLs"):
            if url in seen_urls:
                continue
            # Also remember URLs scraped in this run, so a page listed by
            # more than one sitemap is only downloaded once.
            seen_urls.add(url)

            article = extract_article(url)
            if article is not None:
                # Rows without an article are dropped below anyway, so the
                # date is only fetched when there is an article to keep.
                try:
                    date = pd.to_datetime(extract_date(url), errors='coerce')
                except Exception as exc:
                    # A missing date must not abort the run and lose every
                    # article collected so far; keep the row without a date.
                    print(f"Date extraction failed for {url}: {exc!r}")
                    date = pd.NaT
                data.append({'url': url, 'article': article, 'date': date})

            # Pause between pages to avoid hammering the site.
            time.sleep(0.25)

    # Passing the columns keeps the frame well-formed when nothing was
    # scraped; otherwise dropna(subset=['article']) raises KeyError.
    df = pd.DataFrame(data, columns=columns)
    if df_original is not None:
        # Existing rows win over freshly scraped ones with the same URL.
        df = pd.concat([df_original, df]).drop_duplicates(subset=['url'], keep='first')

    df = df.drop_duplicates()
    df = df.dropna(subset=['article'])

    return df


## Dataset 1

In [3]:
# News sites whose sitemaps are scraped. Commented-out entries are disabled.
list_of_websites = [
    "https://nation.africa/kenya",
    "https://www.standardmedia.co.ke/",
    "https://www.businessdailyafrica.com/",
    #"https://www.pd.co.ke/",
    #"https://www.citizen.digital/",  # works, but very slow
    #"https://www.nationmedia.com/brands/daily-nation/",
    #"https://www.the-star.co.ke/"
]

# Location of the previously collected articles.
ARTICLES_CSV = '/workspaces/Project-Uchumi/data/raw/articles.csv'

# Load the existing dataset, or start from an empty one on the first run.
try:
    # Parse the stored dates so they share a dtype with the Timestamps that
    # create_dataset produces for newly scraped rows.
    # NOTE(review): assumes the CSV always has a 'date' column - confirm.
    df = pd.read_csv(ARTICLES_CSV, parse_dates=['date'])
except (FileNotFoundError, pd.errors.EmptyDataError):
    # A missing file and a zero-byte file both mean "nothing collected yet".
    df = pd.DataFrame(columns=['url', 'article', 'date'])

df = create_dataset(list_of_websites, df)

Websites:   0%|          | 0/3 [00:00<?, ?it/s]

Urls: ['https://nation.africa/kenya/business/airasia-x-unveils-direct-flights-from-nairobi-to-kuala-lumpur-4675966', 'https://nation.africa/kenya/business/m-tiba-md-prickaerts-appointed-ceo-of-carepay-international-4676016', 'https://nation.africa/kenya/counties/kakamega/not-off-the-hook-yet-court-rejects-dpp-s-application-to-drop-charges-against-ayub-savula-wives-in-sh122m-fraud-case-4675578', 'https://nation.africa/kenya/counties/lamu/let-there-be-light-remote-manda-island-set-to-get-electricity-4675262', 'https://nation.africa/kenya/counties/makueni/governor-mutula-kilonzo-withdraws-plan-to-increase-taxes-in-makueni-drops-county-s-finance-bill-2024-4675610', 'https://nation.africa/kenya/counties/nairobi/anti-tax-protester-charged-for-unlawfully-entering-parliament-building-4676086', 'https://nation.africa/kenya/counties/nakuru/police-bullet-kills-the-hope-of-a-nakuru-family-4675502', 'https://nation.africa/kenya/news/-malicious-claims-ig-japhet-koome-says-no-kenyan-police-officer-ha


URLs:   0%|          | 0/25 [00:00<?, ?it/s][A
URLs:   4%|▍         | 1/25 [00:00<00:21,  1.11it/s][A
URLs:   8%|▊         | 2/25 [00:01<00:18,  1.24it/s][A
URLs:  12%|█▏        | 3/25 [00:02<00:18,  1.17it/s][A
URLs:  16%|█▌        | 4/25 [00:03<00:17,  1.23it/s][A
URLs:  20%|██        | 5/25 [00:04<00:16,  1.24it/s][A
URLs:  24%|██▍       | 6/25 [00:04<00:14,  1.31it/s][A
URLs:  28%|██▊       | 7/25 [00:05<00:16,  1.12it/s][A
URLs:  32%|███▏      | 8/25 [00:06<00:14,  1.14it/s][A
URLs:  36%|███▌      | 9/25 [00:07<00:13,  1.16it/s][A
URLs:  40%|████      | 10/25 [00:08<00:12,  1.23it/s][A
URLs:  44%|████▍     | 11/25 [00:09<00:11,  1.24it/s][A
URLs:  48%|████▊     | 12/25 [00:09<00:10,  1.22it/s][A
URLs:  52%|█████▏    | 13/25 [00:10<00:10,  1.17it/s][A
URLs:  56%|█████▌    | 14/25 [00:11<00:08,  1.25it/s][A
URLs:  60%|██████    | 15/25 [00:13<00:10,  1.08s/it][A
URLs:  64%|██████▍   | 16/25 [00:14<00:11,  1.25s/it][A
URLs:  68%|██████▊   | 17/25 [00:15<00:09,  1.16

Urls: []



URLs: 0it [00:00, ?it/s][A
Websites:  67%|██████▋   | 2/3 [00:26<00:10, 10.85s/it]

Urls: ['https://www.businessdailyafrica.com/bd/economy/kenya-agoa-exports-dip-42pc-deny-traders-dollar-windfall-4675728', 'https://www.businessdailyafrica.com/bd/economy/kenya-to-open-market-for-duty-free-eu-imports-4675654', 'https://www.businessdailyafrica.com/bd/economy/dar-port-starts-facelift-in-hub-status-fight-against-mombasa--4674792', 'https://www.businessdailyafrica.com/bd/economy/how-kenya-paid-sh197bn-interest-on-debut-eurobond--4675074', 'https://www.businessdailyafrica.com/bd/economy/eyes-on-president-ruto-office-over-sh667m-budget-cut-pledge--4674838', 'https://www.businessdailyafrica.com/bd/opinion-analysis/columnists/leaders-gaslighting-us-to-submission--4674968', 'https://www.businessdailyafrica.com/bd/economy/civil-servants-medical-cover-intact-as-shif-rollout-starts--4675082', 'https://www.businessdailyafrica.com/bd/lifestyle/profiles/mary-wamae-iron-lady-that-has-shaped-equity-group-exits--4675094', 'https://www.businessdailyafrica.com/bd/markets/commodities/why-ns


URLs:   0%|          | 0/25 [00:00<?, ?it/s][A
URLs:   4%|▍         | 1/25 [00:00<00:17,  1.41it/s][A
URLs:   8%|▊         | 2/25 [00:01<00:14,  1.63it/s][A
URLs:  12%|█▏        | 3/25 [00:02<00:15,  1.43it/s][A
URLs:  16%|█▌        | 4/25 [00:02<00:14,  1.47it/s][A
URLs:  20%|██        | 5/25 [00:03<00:14,  1.41it/s][A
URLs:  24%|██▍       | 6/25 [00:04<00:14,  1.30it/s][A
URLs:  28%|██▊       | 7/25 [00:05<00:14,  1.22it/s][A
URLs:  32%|███▏      | 8/25 [00:06<00:15,  1.13it/s][A
URLs:  36%|███▌      | 9/25 [00:07<00:15,  1.02it/s][A
URLs:  40%|████      | 10/25 [00:08<00:16,  1.07s/it][A
URLs:  44%|████▍     | 11/25 [00:09<00:14,  1.07s/it][A
URLs:  48%|████▊     | 12/25 [00:10<00:14,  1.09s/it][A
URLs:  52%|█████▏    | 13/25 [00:12<00:16,  1.35s/it][A
URLs:  56%|█████▌    | 14/25 [00:14<00:15,  1.40s/it][A
URLs:  60%|██████    | 15/25 [00:15<00:12,  1.25s/it][A
URLs:  64%|██████▍   | 16/25 [00:16<00:10,  1.20s/it][A
URLs:  68%|██████▊   | 17/25 [00:17<00:08,  1.08

## Relevance Filtering

In [4]:
# Terms that mark an article as relevant to public debt. The broad
# single-word list below is the one in use; a narrower phrase list is kept
# for reference:
# ['public debt', 'public budget', 'public finance management', 'budget trends', 'budget theft']
relevant_keywords = ['debt', 'budget', 'finance', 'trends', 'theft','government']

# Keep the articles whose text mentions at least one keyword. The keywords
# are joined into one alternation pattern and matched case-insensitively.
keyword_pattern = '|'.join(relevant_keywords)
is_relevant = df['article'].str.contains(keyword_pattern, case=False)
df_filtered = df[is_relevant]

## Exporting Dataset to CSV


In [5]:
# `df_filtered` is derived from `df`, which already holds every row loaded
# from this file plus the newly scraped ones. Appending would therefore write
# the earlier rows again on every run, so the file is overwritten instead.
articles_csv_path = "/workspaces/Project-Uchumi/data/raw/articles.csv"
# Make sure the target directory exists on a fresh checkout.
os.makedirs(os.path.dirname(articles_csv_path), exist_ok=True)
df_filtered.to_csv(articles_csv_path, index=False)