In [1]:
import requests
from bs4 import BeautifulSoup
import dateparser
import pandas as pd
from tqdm import tqdm
import unicodedata
from pathlib import Path

In [2]:
def get_soup_parser(url, timeout=30):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        A ``BeautifulSoup`` object built with the built-in ``html.parser``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or when
            the server answers with a 4xx/5xx status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on HTTP errors instead of silently parsing an error page.
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")

In [3]:
def get_mali_jet_page_list_of_articles(num_page):
    """Scrape one listing page of malijet.com's "a_la_une_du_mali" section.

    Args:
        num_page: Page number inserted into the ``?page=`` query parameter.

    Returns:
        A DataFrame with one row per article card and the columns ``title``,
        ``source_paper``, ``date`` (a ``datetime.date``) and ``link``.
        ``source_paper`` and ``date`` are ``None`` when they cannot be
        extracted. Cards that have no header or no link are skipped, because
        there is no article page to fetch for them.
    """
    soup = get_soup_parser(url=f"https://malijet.com/a_la_une_du_mali/?page={num_page}")
    articles = soup.find("div", id="v_container").find_all("div", class_="card")
    titles, source_papers, dates, links = [], [], [], []
    print('Getting list of articles...')
    # The last card of the container is deliberately left out
    # (presumably it is not an article card — TODO confirm).
    for article in tqdm(articles[:-1]):
        header = article.find("div", class_="card-header")
        # Only look for the anchor once we know the header exists.
        link = header.find("a", href=True) if header is not None else None
        if link is None:
            continue
        title = header.text.strip().split("\n")[-1]

        body = article.find("div", class_="card-body")
        # First line of the body is the source paper, second line the date.
        infos = body.text.strip().split("\n") if body is not None else []
        source_paper = infos[0] if infos else None
        # Parse the date once; a body with a single line has no date at all.
        parsed_date = dateparser.parse(infos[1]) if len(infos) > 1 else None

        titles.append(title)
        source_papers.append(source_paper)
        dates.append(parsed_date.date() if parsed_date else None)
        links.append(link['href'])
    return pd.DataFrame({"title": titles, "source_paper": source_papers, "date": dates, "link": links})

## Extract info from one article


In [5]:
def fetch_article_content(article_link):
    """Download an article page and return its text content.

    The text is taken from every ``<div dir="auto">`` of the page that holds
    non-blank text. When that yields nothing, the text that follows the
    ``'Date : '`` marker inside the ``card-header`` div is used instead.

    Args:
        article_link: URL of the article page.

    Returns:
        The article text as a single string, or ``""`` when neither source
        yields any text.
    """
    soup = get_soup_parser(url=article_link)

    # get content
    # ``strip()`` rejects both empty and whitespace-only divs; ``isspace()``
    # alone is False for "" and would let empty divs through.
    content = " ".join(
        paragraph.text
        for paragraph in soup.find_all("div", dir="auto")
        if paragraph.text.strip()
    )

    # TODO : We must implement a way to parse the article's author and return it as a tuple with "content"
    # author = ""

    if content != '':
        return content

    # Fallback: read the text embedded in the card header, after the date.
    header = soup.find("div", class_="card-header")
    if header is None:
        return ""
    parts = header.text.split('Date : ')
    if len(parts) < 2:
        return ""
    # parts[1] begins with the date itself; keep everything after its first line.
    text_after_date = " ".join(parts[1].split('\n')[1:])
    # NFKD replaces compatibility characters (e.g. non-breaking spaces) with
    # their plain equivalents before the final whitespace clean-up.
    return unicodedata.normalize("NFKD", text_after_date).strip().replace("     ", " ")

In [6]:
# Sample article URL used to try fetch_article_content on a single page.
new_article_link = "https://malijet.com/a_la_une_du_mali/290531-industrie--le-president-assimi-goita-a-recu-l’ancien-footballeur.html"

In [7]:
# Fetch the sample article; as the last expression of the cell, the returned text is displayed.
fetch_article_content(new_article_link)

'Le Président de la Transition, Son Excellence le Colonel Assimi GOÏTA, Chef de l’État, a reçu en audience, ce mardi 07 mai 2024, Seydou KEÏTA, ancien footballeur international et entrepreneur visionnaire, en prélude de la finalisation de son complexe industriel à Sanankoroba, dans le Cercle de Kati. Cette rencontre, qui s’est tenue en présence du ministre de l’Industrie et du Commerce, marque un jalon important dans le parcours de M. KEÏTA et pour le développement économique du Mali. Seydou KEÏTA, qui a entamé ce projet ambitieux en 2021, a transformé son rêve en réalité, illustrant parfaitement la transition réussie d’une carrière sportive à celle d’entrepreneur engagé. Le complexe industriel, résultat d’années d’efforts et d’investissements considérables, est désormais prêt à démarrer ses opérations, promettant de devenir un moteur de croissance pour la région et pour le Mali tout entier. Au cours de cette audience, Seydou KEÏTA a exprimé sa gratitude envers les autorités maliennes 

## Second part: Fetching articles within a date range

In [8]:
# Inclusive date window of the articles to collect. Both bounds are written as
# text and converted to datetime.date objects in one pass.
begin_date, end_date = (
    dateparser.parse(day_text).date()
    for day_text in ("2024-04-27", "2024-05-08")  # second bound: "today" when this was written
)

In [9]:
# Walk the listing pages, newest first, until a page reaches articles older
# than begin_date, keeping every page that was downloaded.
page_number = 1
contents = []
page_frames = []
current_date = end_date
while begin_date <= current_date:
    print(f"fetching article from page {page_number} ...")
    page_df = get_mali_jet_page_list_of_articles(page_number)
    if page_df.empty:
        # No article on this page: stop instead of requesting pages forever.
        break
    page_frames.append(page_df)
    page_number += 1
    # Rows whose date could not be parsed hold None and are ignored here.
    page_dates = page_df["date"].dropna()
    if not page_dates.empty:
        current_date = min(current_date, page_dates.min())

# Build the frame once, after the loop, with a clean 0..n-1 index.
articles_to_fetch_df = (
    pd.concat(page_frames, ignore_index=True)
    if page_frames
    else pd.DataFrame(columns=["title", "source_paper", "date", "link"])
)

# Last expression of the cell: shows the articles inside the date window.
articles_to_fetch_df.query("date >= @begin_date and date <= @end_date")

fetching article from page 1 ...
Getting list of articles...


100%|██████████| 20/20 [00:00<00:00, 184.30it/s]

fetching article from page 2 ...





Getting list of articles...


100%|██████████| 20/20 [00:00<00:00, 397.17it/s]


Unnamed: 0,title,source_paper,date,link
0,Industrie : Le Président Assimi GOITA a reçu l...,Présidence,2024-05-08,https://malijet.com/a_la_une_du_mali/290531-in...
1,"Célébration du 3 mai: La presse malienne, actr...",Le Challenger,2024-05-07,https://malijet.com/a_la_une_du_mali/290517-ce...
2,Assimi Goïta lors du lancement des travaux de ...,Le Républicain,2024-05-07,https://malijet.com/a_la_une_du_mali/290501-as...
3,Ouverture de la phase finale du Dialogue Inter...,Présidence,2024-05-06,https://malijet.com/a_la_une_du_mali/290494-ou...
4,Top départ de la phase finale du Dialogue inte...,Malijet,2024-05-06,https://malijet.com/a_la_une_du_mali/290492-to...
5,Journée mondiale de la liberté de la presse le...,Arc en Ciel,2024-05-06,https://malijet.com/a_la_une_du_mali/290490-jo...
6,"Discours de Son Excellence, le colonel Assimi,...",Malijet,2024-05-06,https://malijet.com/a_la_une_du_mali/290483-di...
7,Cérémonie d’ouverture: Discours de Monsieur Ou...,Malijet,2024-05-06,https://malijet.com/a_la_une_du_mali/290482-ce...
8,Les Maliens se parlent sans filtre,Studio tamani,2024-05-06,https://malijet.com/a_la_une_du_mali/290481-le...
9,Situation sécuritaire : Les Fama occupent le t...,L'Aube,2024-05-06,https://malijet.com/a_la_une_du_mali/290455-si...


In [10]:
# Absolute path of the malijet CSV file, located under the directory the
# notebook is run from. Shown as the cell output for a quick sanity check.
CSV_DIR = Path().resolve().joinpath('data', 'malijet', 'source.csv')
CSV_DIR

PosixPath('/home/bouba/Workspace/kounafoni-app/data/malijet/source.csv')

In [11]:
# Keep only the articles inside the requested date window.
subset_fetching_articles_df = articles_to_fetch_df.query("date >= @begin_date and date <= @end_date").copy()

# Titles already stored on disk. A missing or empty file simply means that
# nothing has been saved yet, so every article counts as new.
csv_has_rows = CSV_DIR.exists() and CSV_DIR.stat().st_size > 0
existing_article_titles = pd.read_csv(CSV_DIR, sep='\t').title.tolist() if csv_has_rows else []
known_titles = set(existing_article_titles)  # set: constant-time membership tests

article_contents, new_titles = [], []
is_new_row = []  # one boolean per row, aligned with subset_fetching_articles_df
for _, row in tqdm(subset_fetching_articles_df.iterrows(), total=subset_fetching_articles_df.shape[0]):
    title = row["title"]
    is_new = title not in known_titles
    is_new_row.append(is_new)
    if is_new:
        # Remember the title so an article listed twice in this run is fetched once.
        known_titles.add(title)
        new_titles.append(title)
        article_contents.append(fetch_article_content(row["link"]))

if article_contents:
    print("New articles found, writing article contents to file...")
    CSV_DIR.parent.mkdir(parents=True, exist_ok=True)
    (
        subset_fetching_articles_df[is_new_row]
        .assign(content=article_contents)
        # Write the header only when the file is created; repeating it on
        # every append would insert header lines in the middle of the data.
        .to_csv(CSV_DIR, mode='a', sep='\t', index=False, header=not csv_has_rows)
    )
else:
    print("No new articles found, skipping...")

100%|██████████| 32/32 [00:00<00:00, 14641.40it/s]

No new articles found, skipping...





In [1]:
from langchain_community.llms import OpenLLM

In [2]:
# Build the LangChain OpenLLM wrapper for the google/flan-t5-large model.
llm = OpenLLM(model_name='flan-t5', model_id='google/flan-t5-large')

For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Fetching 7 files:   0%|          | 0/7 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

NOT RECOMMENDED in production and SHOULD ONLY used for development.


In [4]:
# Send one test prompt to the model.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: 'FlanT5Runner' object is not callable" — presumably a version
# mismatch between the LangChain OpenLLM wrapper and the installed openllm
# package; TODO confirm and fix before sharing the notebook.
llm.invoke("What is the difference between a duck and a goose?")

TypeError: 'FlanT5Runner' object is not callable