In [1]:
import requests
from bs4 import BeautifulSoup
import dateparser
import pandas as pd
from tqdm import tqdm
import unicodedata
from pathlib import Path

In [2]:
def get_soup_parser(url, timeout=30):
    """Download ``url`` and return its body parsed as HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. ``requests``
            applies no timeout by default, so without this a stalled
            connection would block the notebook forever.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``"html.parser"`` backend. The HTTP status code is not checked, so an
        error page is parsed like any other page.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")

In [3]:
def get_mali_jet_page_list_of_articles(num_page):
    """Scrape one listing page of malijet.com and return its articles.

    Args:
        num_page: Page number injected into the ``?page=`` query parameter.

    Returns:
        A ``pandas.DataFrame`` with the columns ``title``, ``source_paper``,
        ``date`` (a ``datetime.date`` or ``None`` when the date cannot be
        parsed) and ``link``. Cards that have no header or no hyperlink are
        skipped, because there would be nothing to fetch for them later.
    """
    soup = get_soup_parser(url=f"https://malijet.com/a_la_une_du_mali/?page={num_page}")
    articles = soup.find("div", id="v_container").find_all("div", class_="card")
    titles, source_papers, dates, links = [], [], [], []
    print('Getting list of articles...')
    # The last card is left out on purpose (presumably it is not an article — TODO confirm).
    for article in tqdm(articles[:-1]):
        header = article.find("div", class_="card-header")
        # Look the link up only once we know the header exists.
        link = header.find("a", href=True) if header is not None else None
        if link is None:
            continue

        # The title is the last line of the header text.
        title = header.text.strip().split("\n")[-1]

        body = article.find("div", class_="card-body")
        infos = body.text.strip().split("\n") if body is not None else []

        # First line of the body is the source paper, second line is the date.
        # Parse the date once and tolerate bodies that have fewer than two lines.
        parsed_date = dateparser.parse(infos[1]) if len(infos) > 1 else None

        titles.append(title)
        source_papers.append(infos[0] if infos else None)
        dates.append(parsed_date.date() if parsed_date else None)
        links.append(link['href'])
    return pd.DataFrame({"title": titles, "source_paper": source_papers, "date": dates, "link": links})

## Extract info from one article


In [5]:
def fetch_article_content(article_link):
    """Download an article page and return its text content.

    The text is taken from every ``<div dir="auto">`` of the page, joined with
    single spaces. When none of them holds visible text, the function falls
    back to the ``card-header`` block: it keeps what follows the ``'Date : '``
    marker, drops the first line after it, and normalises the remainder (NFKD).

    Args:
        article_link: URL of the article page.

    Returns:
        The extracted text, or an empty string when neither strategy finds
        any content.
    """
    soup = get_soup_parser(url=article_link)

    # get content
    # Keep only blocks with visible text: both empty and whitespace-only
    # blocks are dropped so they cannot produce a content made of spaces only.
    texts = (block.text for block in soup.find_all("div", dir="auto"))
    content = " ".join(text for text in texts if text.strip())

    # TODO : We must implement a way to parse the article's author and return it as a tuple with "content"
    # author = ""

    if content:
        return content

    # Fallback: some pages keep the whole article inside the card header.
    header = soup.find("div", class_="card-header")
    if header is None:
        return ""
    parts = header.text.split('Date : ')
    if len(parts) < 2:
        return ""
    # Skip the first line after the marker and keep the following lines.
    fallback = " ".join(parts[1].split('\n')[1:])
    return unicodedata.normalize("NFKD", fallback).strip().replace("     ", " ")

In [6]:
# Sample article URL passed to fetch_article_content in the next cell.
# NOTE(review): the "â€™" sequence looks like a mis-encoded apostrophe (’) — confirm the URL still resolves.
new_article_link = "https://malijet.com/a_la_une_du_mali/290531-industrie--le-president-assimi-goita-a-recu-lâ€™ancien-footballeur.html"

In [7]:
# Manual check: fetch the sample article; the extracted text is the cell output.
fetch_article_content(new_article_link)

## Second part: fetching the articles published within a date range

In [8]:
# Inclusive date window of the articles to collect, as datetime.date objects.
# The second literal was "today" when the notebook was written.
begin_date, end_date = (
    dateparser.parse(raw_day).date()
    for raw_day in ("2024-04-27", "2024-05-08")
)

In [9]:
# Walk the listing pages (newest first) until the oldest article seen so far
# is older than begin_date.
page_number = 1
contents = []  # not used in this cell; kept in case another cell reads it
current_date = end_date
fetched_pages = []
oldest_seen = None
while begin_date <= current_date:
    print(f"fetching article from page {page_number} ...")
    page_df = get_mali_jet_page_list_of_articles(page_number)
    page_number+=1
    fetched_pages.append(page_df)

    # Without a single dated article the stop condition can never progress
    # (and min() would be NaN), so stop here instead of looping forever.
    page_dates = page_df["date"].dropna()
    if page_dates.empty:
        print("no dated article on this page, stopping pagination.")
        break

    page_oldest = page_dates.min()
    oldest_seen = page_oldest if oldest_seen is None else min(oldest_seen, page_oldest)
    current_date = oldest_seen

# Concatenate once instead of growing the frame page by page; ignore_index
# gives every row a unique label.
if fetched_pages:
    articles_to_fetch_df = pd.concat(fetched_pages, ignore_index=True)
else:
    articles_to_fetch_df = pd.DataFrame(columns=["title", "source_paper", "date", "link"])

articles_to_fetch_df.query("date >= @begin_date and date <= @end_date")

In [10]:
# Absolute path of the tab-separated file that stores the scraped articles,
# built from the resolved current working directory.
CSV_DIR = Path().resolve().joinpath('data', 'malijet', 'source.csv')
CSV_DIR

In [11]:
# Keep the articles inside the date window; the same title can show up on two
# listing pages, so keep only its first occurrence.
subset_fetching_articles_df = (
    articles_to_fetch_df
    .query("date >= @begin_date and date <= @end_date")
    .drop_duplicates(subset="title")
    .copy()
)

# On a first run the file may be missing or empty: nothing is known yet, and
# the header row still has to be written.
csv_has_data = CSV_DIR.exists() and CSV_DIR.stat().st_size > 0
existing_article_titles = pd.read_csv(CSV_DIR, sep='\t').title.tolist() if csv_has_data else []

# Only the articles whose title is not stored yet need to be downloaded.
new_articles_df = subset_fetching_articles_df[
    ~subset_fetching_articles_df["title"].isin(existing_article_titles)
]
new_titles = new_articles_df["title"].tolist()
article_contents = [
    fetch_article_content(link)
    for link in tqdm(new_articles_df["link"], total=new_articles_df.shape[0])
]

if article_contents:
    print("New articles found, writing article contents to file...")
    CSV_DIR.parent.mkdir(parents=True, exist_ok=True)
    # Write the header only when the file is created; repeating it on every
    # append would insert header rows in the middle of the data.
    new_articles_df.assign(content=article_contents).to_csv(
        CSV_DIR, mode='a', sep='\t', index=False, header=not csv_has_data
    )
else:
    print("No new articles found, skipping...")