## 1. Vous pouvez utiliser un autre site web de votre choix

In [20]:
import time
import csv
import requests
from bs4 import BeautifulSoup

# Static scrape of the portal home page with requests + BeautifulSoup.
# Collects (title, link, date) for every article in the posts container and
# writes them to articles_BeautifulSoup.csv.
url = "https://fstt.ac.ma/Portail2023/"

# Browser-like request headers. "br" is deliberately not advertised in
# Accept-Encoding: requests can only decode Brotli responses when an optional
# Brotli package is installed, and an undecoded body cannot be parsed.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3",
    "Accept-Language": "fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept-Encoding": "gzip, deflate",
    "Connection": "keep-alive",
}

# Class attribute value used to locate the posts container.
POSTS_CONTAINER_CLASS = "elementor-posts-container elementor-posts elementor-posts--skin-classic elementor-grid"

# Rows of (title, link, date); stays empty when the request fails.
data = []

try:
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()  # Raise on 4xx/5xx responses
except requests.exceptions.RequestException as e:
    print(f"Erreur de connexion : {e}")
else:
    soup = BeautifulSoup(response.text, "html.parser")

    for container in soup.find_all("div", class_=POSTS_CONTAINER_CLASS):
        for article in container.find_all("article"):
            # Look each level up explicitly: an article without an <h3> or
            # a <div> is skipped instead of raising AttributeError.
            heading = article.find("h3")
            title_link = heading.find("a") if heading is not None else None
            first_div = article.find("div")
            date_tag = first_div.find("span") if first_div is not None else None

            if (
                title_link is not None
                and title_link.has_attr("href")
                and date_tag is not None
            ):
                data.append(
                    (title_link.text.strip(), title_link["href"], date_tag.text.strip())
                )


# Step 2: Write to CSV (header only when nothing was scraped)
with open('articles_BeautifulSoup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Title', 'Link', 'Date'])
    writer.writerows(data)

## 2. Scraping du contenu dynamique :

In [21]:
# !pip install selenium
# !apt-get update
# !apt install chromium-chromedriver

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import time

# Imported here because only the per-article handling below needs it.
from selenium.common.exceptions import NoSuchElementException

# NOTE: this path is not passed to webdriver.Chrome below.
chromedriver_path = '/usr/bin/chromedriver'

# Chrome options: headless, no sandbox, no /dev/shm usage
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Start the WebDriver
driver = webdriver.Chrome(options=options)

# Rows of (title, link, date)
data = []

try:
    # Change the URL as needed
    driver.get('https://fstt.ac.ma/Portail2023/')

    try:
        # Wait until the posts container is present in the DOM
        wait = WebDriverWait(driver, 10)
        wait.until(EC.presence_of_element_located((By.CLASS_NAME, "elementor-posts-container")))
        time.sleep(2)
        articles = driver.find_elements(By.CLASS_NAME, "elementor-post")

        for article in articles:
            try:
                # Locate the title element once; it gives both text and link.
                title_element = article.find_element(By.CLASS_NAME, "elementor-post__title")
                title = title_element.text.strip()
                link = title_element.find_element(By.TAG_NAME, "a").get_attribute("href")
                date = article.find_element(By.CLASS_NAME, "elementor-post-date").text.strip()
            except NoSuchElementException as e:
                # One incomplete article must not discard the remaining ones.
                print(f"Article ignoré (élément manquant) : {e}")
                continue
            data.append((title, link, date))

    except Exception as e:
        # Best-effort: report the failure and keep whatever was collected.
        print(f"Erreur lors de l'attente ou de l'extraction : {e}")
finally:
    # Always close the browser, even when navigation or extraction raises.
    driver.quit()

# Step 2: Write to CSV
with open('articles_selenium.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Title', 'Link', 'Date'])
    writer.writerows(data)

## 3. Combinaison des deux :

In [22]:
import pandas as pd

# Load the CSV file produced by each scraper.
bs_df, selenium_df = (
    pd.read_csv(csv_path)
    for csv_path in ("articles_BeautifulSoup.csv", "articles_selenium.csv")
)

# Stack both tables, then keep only the first row seen for each link.
combined_df = pd.concat(
    [bs_df, selenium_df],
    ignore_index=True,
).drop_duplicates(subset="Link")

# Save the merged result to a new file.
combined_df.to_csv("articles_combines.csv", index=False)


## 4. Bonus (+) :

### 1. Gérer la pagination dynamique (boutons "Suivant").

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import time
import pandas as pd

# NOTE: this path is not passed to webdriver.Chrome below.
chromedriver_path = '/usr/bin/chromedriver'

# Chrome options: headless, no sandbox, no /dev/shm usage
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')

# Start the WebDriver
driver = webdriver.Chrome(options=options)

# Rows of (title, link, date) accumulated across all pages
articlesPagination = []

try:
    # Change the URL as needed
    driver.get('https://fstt.ac.ma/Portail2023/category/articles/actualites/')

    while True:
        time.sleep(2)  # give the page time to load

        # Select every article on the current page
        post_elements = driver.find_elements(By.CSS_SELECTOR, ".elementor-post")

        for post in post_elements:
            try:
                title = post.find_element(By.CSS_SELECTOR, "h3.elementor-post__title").text.strip()
                link = post.find_element(By.CSS_SELECTOR, "a").get_attribute("href")
                date = post.find_element(By.CSS_SELECTOR, "span.elementor-post-date").text.strip()

                articlesPagination.append((title, link, date))

            except Exception as e:
                print(f"Erreur lors de l'attente ou de l'extraction : {e}")

        # find_elements returns an empty list on the last page, so reaching
        # the end is a normal exit and not an exception.
        next_buttons = driver.find_elements(By.CSS_SELECTOR, "a.next.page-numbers")
        if not next_buttons:
            break

        try:
            next_buttons[0].click()
        except Exception as e:
            # Report the failure so a truncated scrape is not mistaken for a
            # complete one, then stop paginating.
            print(f"Pagination interrompue : {e}")
            break
finally:
    # Always close the browser, even when navigation raises.
    driver.quit()

# Save to a CSV file
with open('articles_selenium_pagination.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Title', 'Link', 'Date'])
    writer.writerows(articlesPagination)

print("✅ Scraping terminé : articles_selenium_pagination.csv créé.")


### 2. Extraire des images ou d'autres médias.

In [24]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import csv
import time

# Setup headless Chrome
options = Options()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=options)

# Rows of (media type, URL)
media_data = []

try:
    # Open the main site
    driver.get('https://fstt.ac.ma/Portail2023/')
    time.sleep(3)

    # Images
    for img in driver.find_elements(By.TAG_NAME, "img"):
        src = img.get_attribute("src")
        if src:
            media_data.append(("image", src))

    # Videos, then audio: both are handled the same way.
    for tag in ("video", "audio"):
        for element in driver.find_elements(By.TAG_NAME, tag):
            src = element.get_attribute("src")
            if src:
                media_data.append((tag, src))
                continue
            # Media elements may declare their URLs on child <source> tags
            # instead of a src attribute; collect those as a fallback.
            for source in element.find_elements(By.TAG_NAME, "source"):
                source_src = source.get_attribute("src")
                if source_src:
                    media_data.append((tag, source_src))

    # Iframes (e.g. YouTube embeds): keep only player-like URLs
    for iframe in driver.find_elements(By.TAG_NAME, "iframe"):
        src = iframe.get_attribute("src")
        if src and ("youtube.com" in src or "player" in src):
            media_data.append(("iframe", src))
finally:
    # Always close the browser, even when navigation or extraction raises.
    driver.quit()

# Save to CSV
with open("media_from_fstt.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["Type", "URL"])
    writer.writerows(media_data)