### Scraping AMI by Tag

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import urllib3

# Silence urllib3's InsecureRequestWarning for the whole process: the requests
# issued by the scraper below are made with verify=False (TLS certificate
# checks disabled), which is the condition this warning reports.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# ===============================
# GLOBAL CONFIG
# ===============================

# HTTP headers sent with every request: a single desktop-Chrome-on-Windows
# User-Agent string, assembled from its three space-separated product tokens.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/143.0.0.0 Safari/537.36",
        )
    ),
}

# ===============================
# MAIN FUNCTION
# ===============================

def _fetch_article_body(session, link):
    """Return the first paragraph of the article at ``link`` as plain text.

    This is deliberately best-effort: any failure while downloading or
    parsing is logged and ``None`` is returned, so a single bad article
    never aborts the whole scrape. ``None`` is also returned when ``link``
    is empty or the expected content container / paragraph is missing.
    """
    if not link:
        return None
    try:
        # NOTE(review): verify=False disables TLS certificate validation.
        resp = session.get(link, headers=HEADERS, timeout=10, verify=False)
        soup = BeautifulSoup(resp.text, "html.parser")
        content_div = soup.find(
            "div", class_="entry-content clearfix single-post-content"
        )
        first_p = content_div.find("p") if content_div else None
        if first_p is None:
            return None

        # Turn <br> line breaks into spaces so adjacent lines do not fuse.
        for br in first_p.find_all("br"):
            br.replace_with(" ")
        # Small random pause after each successful fetch to pace the workers.
        time.sleep(random.uniform(0.3, 1.0))
        return first_p.get_text(" ", strip=True)
    except Exception as e:  # worker boundary: log and carry on
        print(f"⚠ Error fetching body: {link} -> {e}")
        return None


def _parse_listing(soup, tag_name):
    """Extract one record per ``<article>`` element of a tag listing page."""
    records = []
    for post in soup.find_all("article"):
        title_tag = post.find("a", class_="post-url")
        time_tag = post.find("time")
        summary_tag = post.find("div", class_="post-summary")
        records.append({
            "tag": tag_name,
            "title": title_tag.get_text(strip=True) if title_tag else None,
            # .get() instead of [] so a missing attribute yields None rather
            # than an uncaught KeyError that would abort the whole scrape.
            "url": title_tag.get("href") if title_tag else None,
            "datetime": time_tag.get("datetime") if time_tag else None,
            "summary": summary_tag.get_text(strip=True) if summary_tag else None,
        })
    return records


def scrape_ami_tag(tag_name, tag_slug, max_workers=10, max_page_attempts=3):
    """
    Scrape all articles from an AMI tag and save them to a CSV file.

    Listing pages are walked from page 1 until a page returns a non-200
    status or contains no ``<article>`` elements. For every listed article
    the first paragraph of its body is fetched in a thread pool. The result
    is sorted newest-first and written to
    ``ami_Tag_<tag_slug>_<newest date>_to_<oldest date>.csv`` in the current
    working directory (dates are ``NA`` when none could be parsed).

    Parameters
    ----------
    tag_name : str
        Human-readable tag name (e.g. 'Santé'); stored in the ``tag`` column.
    tag_slug : str
        Tag slug used in the URL (e.g. 'sante').
    max_workers : int
        Number of threads used to fetch article bodies.
    max_page_attempts : int
        Number of consecutive failed requests tolerated for one listing page
        before pagination is abandoned. What was collected so far is still
        saved.

    Returns
    -------
    pandas.DataFrame
        One row per article with columns ``tag``, ``title``, ``url``,
        ``datetime``, ``summary`` and ``article_body``.
    """
    base_url = f"https://ami.mr/fr/archives/tag/{tag_slug}/page/{{}}"
    columns = ["tag", "title", "url", "datetime", "summary", "article_body"]
    articles = []

    # The context manager closes the session's pooled connections on exit.
    with requests.Session() as session:
        adapter = HTTPAdapter(max_retries=Retry(total=3, backoff_factor=0.5))
        session.mount("https://", adapter)
        session.mount("http://", adapter)

        page = 1
        failed_attempts = 0
        while True:
            url = base_url.format(page)
            print(f"\n🔎 Scraping tag '{tag_name}' | page {page}")

            try:
                response = session.get(
                    url, headers=HEADERS, timeout=10, verify=False
                )
            except requests.RequestException as e:
                failed_attempts += 1
                print(f"⚠ Request error on page {page}: {e}")
                # Bound the retries: without a cap a page that keeps failing
                # would be requested forever.
                if failed_attempts >= max_page_attempts:
                    print(
                        f"⚠ Giving up on page {page} after "
                        f"{failed_attempts} failed attempts."
                    )
                    break
                time.sleep(2)
                continue
            failed_attempts = 0

            # Any non-200 status is treated as the end of the tag archive.
            if response.status_code != 200:
                print("✔ No more pages. Stopping.")
                break

            soup = BeautifulSoup(response.text, "html.parser")
            page_articles = _parse_listing(soup, tag_name)
            if not page_articles:
                print("✔ No articles found. Finished.")
                break

            # ---- Fetch article bodies in parallel ----
            print(f"➡ Fetching {len(page_articles)} article bodies...")
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Executor.map yields results in input order, so each body
                # lines up with the record it belongs to.
                bodies = executor.map(
                    lambda link: _fetch_article_body(session, link),
                    [art["url"] for art in page_articles],
                )
                for art, body in zip(page_articles, bodies):
                    art["article_body"] = body

            articles.extend(page_articles)
            page += 1
            time.sleep(random.uniform(1, 2))

    # Explicit columns keep the frame well-formed even when nothing was
    # scraped (otherwise df["datetime"] raises KeyError on an empty frame).
    df = pd.DataFrame(articles, columns=columns)
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    df = df.sort_values("datetime", ascending=False)

    # Unparseable dates become NaT; ignore them for the file-name range and
    # fall back to "NA" when no valid date is left.
    valid_dates = df["datetime"].dropna()
    if valid_dates.empty:
        newest = oldest = "NA"
    else:
        newest = valid_dates.max().strftime("%Y-%m-%d")
        oldest = valid_dates.min().strftime("%Y-%m-%d")

    filename = f"ami_Tag_{tag_slug}_{newest}_to_{oldest}.csv"
    df.to_csv(filename, index=False, encoding="utf-8-sig")

    print(f"\n✅ Saved: {filename}")
    print(f"📌 Total articles scraped: {len(df)}")

    return df


## Politique Tag


In [6]:
# Scrape every article listed under the "Politique" tag (slug "politique").
df_politique = scrape_ami_tag(tag_name="Politique", tag_slug="politique")


🔎 Scraping tag 'Politique' | page 1
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 2
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 3
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 4
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 5
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 6
➡ Fetching 10 article bodies...

🔎 Scraping tag 'Politique' | page 7
➡ Fetching 10 article bodies...
⚠ Error fetching body: https://ami.mr/fr/archives/93029 -> HTTPSConnectionPool(host='ami.mr', port=443): Max retries exceeded with url: /fr/archives/93029 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='ami.mr', port=443): Read timed out. (read timeout=10)"))
⚠ Error fetching body: https://ami.mr/fr/archives/92957 -> HTTPSConnectionPool(host='ami.mr', port=443): Max retries exceeded with url: /fr/archives/92957 (Caused by ReadTimeoutError("HTTPSConnectionPool(host='a

In [3]:
# Load the "politique" tag CSV from disk into a DataFrame and display it.
import pandas as pd
df = pd.read_csv(filepath_or_buffer="ami_Tag_politique_2022-08-06_to_2007-01-01.csv")
df

Unnamed: 0,tag,title,url,datetime,summary,article_body
0,Politique,Les partis de la majorité commémorent le trois...,https://ami.mr/fr/archives/203122,2022-08-06 19:13:39+00:00,"Les partis de la majorité ont commémoré, vendr...","Les partis de la majorité ont commémoré, vendr..."
1,Politique,La présidente du Conseil régional de Nouakchot...,https://ami.mr/fr/archives/202418,2022-07-30 23:21:33+00:00,La présidente du Conseil régional de Nouakchot...,La présidente du Conseil régional de Nouakchot...
2,Politique,Célébration de la journée mondiale de lutte co...,https://ami.mr/fr/archives/202417,2022-07-30 23:18:52+00:00,"Le Commissaire adjoint aux droits de l'homme, ...","Le Commissaire adjoint aux droits de l’homme, ..."
3,Politique,Le ministre de l’Equipement et des Transports ...,https://ami.mr/fr/archives/202174,2022-07-24 21:14:53+00:00,"Le ministre de l'Equipement et des Transports,...","Le ministre de l’Equipement et des Transports,..."
4,Politique,Sélibabi : Atelier pour la coordination entre ...,https://ami.mr/fr/archives/202094,2022-07-21 19:24:11+00:00,"Un atelier associant autorités locales, comité...","Un atelier associant autorités locales, comité..."
...,...,...,...,...,...,...
9037,Politique,Début des opérations du Ravel en Adrar,https://ami.mr/fr/archives/1075,2007-01-02 11:00:00+00:00,"Le Wali de l'Adrar, M. Mohamed Ould Mohamed Sa...","Le Wali de l’Adrar, M. Mohamed Ould Mohamed Sa..."
9038,Politique,Atar: Validation de 15 listes candidates aux s...,https://ami.mr/fr/archives/1074,2007-01-02 10:00:00+00:00,La commission administrative chargée de la val...,La commission administrative chargée de la val...
9039,Politique,Début du Ravel au Trarza,https://ami.mr/fr/archives/1072,2007-01-02 09:00:00+00:00,Les opérations du recensement administratif à ...,Les opérations du recensement administratif à ...
9040,Politique,Réunion du Bureau Exécutif de l’UCD,https://ami.mr/fr/archives/1070,2007-01-02 08:00:00+00:00,Le Bureau exécutif national du parti de l'Unio...,Le Bureau exécutif national du parti de l’Unio...


In [4]:
# Load the JSON export of the articles into a DataFrame and display it.
df_json = pd.read_json(path_or_buf="articles_mauritanie.json")
df_json

Unnamed: 0,tag,title,url,datetime,summary,article_body
0,Politique,Les partis de la majorité commémorent le trois...,https://ami.mr/fr/archives/203122,2022-08-06 19:13:39+00:00,"Les partis de la majorité ont commémoré, vendr...","Les partis de la majorité ont commémoré, vendr..."
1,Politique,La présidente du Conseil régional de Nouakchot...,https://ami.mr/fr/archives/202418,2022-07-30 23:21:33+00:00,La présidente du Conseil régional de Nouakchot...,La présidente du Conseil régional de Nouakchot...
2,Politique,Célébration de la journée mondiale de lutte co...,https://ami.mr/fr/archives/202417,2022-07-30 23:18:52+00:00,"Le Commissaire adjoint aux droits de l'homme, ...","Le Commissaire adjoint aux droits de l’homme, ..."
3,Politique,Le ministre de l’Equipement et des Transports ...,https://ami.mr/fr/archives/202174,2022-07-24 21:14:53+00:00,"Le ministre de l'Equipement et des Transports,...","Le ministre de l’Equipement et des Transports,..."
4,Politique,Sélibabi : Atelier pour la coordination entre ...,https://ami.mr/fr/archives/202094,2022-07-21 19:24:11+00:00,"Un atelier associant autorités locales, comité...","Un atelier associant autorités locales, comité..."
...,...,...,...,...,...,...
9037,Politique,Début des opérations du Ravel en Adrar,https://ami.mr/fr/archives/1075,2007-01-02 11:00:00+00:00,"Le Wali de l'Adrar, M. Mohamed Ould Mohamed Sa...","Le Wali de l’Adrar, M. Mohamed Ould Mohamed Sa..."
9038,Politique,Atar: Validation de 15 listes candidates aux s...,https://ami.mr/fr/archives/1074,2007-01-02 10:00:00+00:00,La commission administrative chargée de la val...,La commission administrative chargée de la val...
9039,Politique,Début du Ravel au Trarza,https://ami.mr/fr/archives/1072,2007-01-02 09:00:00+00:00,Les opérations du recensement administratif à ...,Les opérations du recensement administratif à ...
9040,Politique,Réunion du Bureau Exécutif de l’UCD,https://ami.mr/fr/archives/1070,2007-01-02 08:00:00+00:00,Le Bureau exécutif national du parti de l'Unio...,Le Bureau exécutif national du parti de l’Unio...
