In [1]:
%pip install beautifulsoup4 

Collecting beautifulsoup4
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4
Successfully installed beautifulsoup4-4.12.3 soupsieve-2.6
Note: you may need to restart the kernel to use updated packages.


In [14]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

In [17]:
def scrape_wikipedia_page(title, lang="en"):
    """
    Scrape le texte principal d'une page Wikipédia.
    
    :param title: Titre de la page Wikipédia (ex: "Paris").
    :param lang: Langue de Wikipédia (par défaut "fr").
    :return: Le texte principal de la page ou un message d'erreur.
    """
    url = f"https://{lang}.wikipedia.org/wiki/{title.replace(' ', '_')}_arrondissement_of_Paris"
    response = requests.get(url)
    print(response.status_code)  # Vérifie le code de réponse (200 attendu)
    print(response.url)          # Vérifie si l'URL finale est correcte
    #print(soup.prettify()[:1000])  # Affiche un aperçu du HTML récupéré


    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        #content_div = soup.find("div", {"class": "mw-parser-output"})  # Contient le texte principal
        content_div = soup.find("div", {"id": "bodyContent"})  # Option alternative

        
        if content_div:
            paragraphs = content_div.find_all("p")  # Récupère les paragraphes
            text = "\n".join([para.get_text(strip=True) for para in paragraphs if para.text.strip()])
            print(soup.prettify()[:1000])  # Affiche un aperçu du HTML récupéré
            return text if text else "Pas de texte disponible sur cette page."
        else:
            return "La structure de la page n'a pas pu être trouvée."
    else:
        return f"Erreur {response.status_code}: Impossible d'accéder à la page '{title}'."


# Scraper les 20 arrondissements et les stocker dans un tableau
def scrape_all_arrondissements():
    """
    Scrape les pages Wikipédia des 20 arrondissements de Paris et retourne les résultats dans un tableau.
    
    :return: Un DataFrame contenant le numéro de l'arrondissement et le texte récupéré.
    """
    arrondissements = ['1st','2nd','3rd','4th','5th','6th','7th','8th','9rd','10th','11th','12th','13th','14th','15th','16th','17th','18th','19th','20th'] 
    data = []
    
    for arrondissement in tqdm(arrondissements, desc="Scraping Wikipédia"):
        text = scrape_wikipedia_page(arrondissement)
        data.append({"Arrondissement": arrondissement, "Texte": text})
    
    # Convertir les données en DataFrame
    df = pd.DataFrame(data)
    return df


# Scraper les 20 arrondissements et afficher le tableau
df = scrape_all_arrondissements()
print(df.head())  # Affiche les 5 premières lignes
    
# Sauvegarder les résultats dans un fichier CSV
df.to_csv("arrondissements_paris.csv", index=False, encoding="utf-8")
print("Les résultats ont été sauvegardés dans 'arrondissements_paris.csv'.")


Scraping Wikipédia:   0%|          | 0/20 [00:00<?, ?it/s]

200
https://en.wikipedia.org/wiki/1st_arrondissement_of_Paris


Scraping Wikipédia:   5%|▌         | 1/20 [00:00<00:11,  1.61it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   1st arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  10%|█         | 2/20 [00:00<00:08,  2.17it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   2nd arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  15%|█▌        | 3/20 [00:01<00:06,  2.55it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   3rd arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  20%|██        | 4/20 [00:01<00:06,  2.31it/s]

200
https://en.wikipedia.org/wiki/4th_arrondissement_of_Paris
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   4th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vect

Scraping Wikipédia:  25%|██▌       | 5/20 [00:02<00:07,  2.13it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   5th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  30%|███       | 6/20 [00:02<00:07,  1.96it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   6th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  35%|███▌      | 7/20 [00:03<00:06,  1.88it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   7th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  40%|████      | 8/20 [00:03<00:06,  1.89it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   8th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-w

Scraping Wikipédia:  45%|████▌     | 9/20 [00:04<00:06,  1.80it/s]

404
https://en.wikipedia.org/wiki/9rd_arrondissement_of_Paris
200
https://en.wikipedia.org/wiki/10th_arrondissement_of_Paris


Scraping Wikipédia:  50%|█████     | 10/20 [00:04<00:04,  2.00it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   10th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  55%|█████▌    | 11/20 [00:05<00:05,  1.73it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   11th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  60%|██████    | 12/20 [00:06<00:05,  1.41it/s]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   12th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  65%|██████▌   | 13/20 [00:07<00:05,  1.22it/s]

200
https://en.wikipedia.org/wiki/13th_arrondissement_of_Paris
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   13th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled ve

Scraping Wikipédia:  70%|███████   | 14/20 [00:09<00:06,  1.01s/it]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   14th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  75%|███████▌  | 15/20 [00:10<00:05,  1.08s/it]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   15th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  80%|████████  | 16/20 [00:11<00:04,  1.11s/it]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   16th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  85%|████████▌ | 17/20 [00:12<00:03,  1.02s/it]

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   17th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-

Scraping Wikipédia:  90%|█████████ | 18/20 [00:13<00:01,  1.10it/s]

200
https://en.wikipedia.org/wiki/18th_arrondissement_of_Paris
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   18th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled ve

Scraping Wikipédia:  95%|█████████▌| 19/20 [00:13<00:00,  1.13it/s]

200
https://en.wikipedia.org/wiki/19th_arrondissement_of_Paris
<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   19th arrondissement of Paris - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled ve

Scraping Wikipédia: 100%|██████████| 20/20 [00:14<00:00,  1.35it/s]

  Arrondissement                                              Texte
0            1st  The1st arrondissement of Paris(Ierarrondisseme...
1            2nd  The2nd arrondissement of Paris(IIearrondisseme...
2            3rd  The3rd arrondissement of Paris(IIIearrondissem...
3            4th  The4th arrondissement of Paris(IVearrondisseme...
4            5th  The5th arrondissement of Paris(Vearrondissemen...
Les résultats ont été sauvegardés dans 'arrondissements_paris.csv'.





In [None]:
def scrape_wikivoyage_section(arrondissement, section, lang="en"):
    """
    Scrape a specific section (e.g., 'Eat', 'Sleep') from a Wikivoyage page, including subsections.
    
    :param arrondissement: The arrondissement number as a string (e.g., "1st", "2nd").
    :param section: The section to scrape (e.g., "Eat", "Sleep").
    :param lang: Language version of Wikivoyage (default is "en").
    :return: The text content of the section, including all subsections.
    """
    url = f"https://{lang}.wikivoyage.org/wiki/Paris/{arrondissement}_arrondissement"
    response = requests.get(url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the section by id
        section_header = soup.find("h2", {"id": section})
        if section_header:
            # Collect all subsequent <section> tags related to this section
            content_text = []
            parent_section = section_header.find_parent("section")
            if parent_section:
                subsections = parent_section.find_all("section")  # Subsections within the main section
                for subsection in subsections:
                    # Extract text from <li> elements
                    list_items = subsection.find_all("li")
                    for item in list_items:
                        content_text.append(item.get_text(strip=True))
            
            return "\n".join(content_text) if content_text else f"No content found for section '{section}'."
        else:
            return f"Section '{section}' not found."
    else:
        return f"Error {response.status_code}: Unable to access the page for '{arrondissement}_arrondissement'."


In [22]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd
import json

def scrape_wikivoyage_section(arrondissement, section, lang="en"):
    """
    Scrape a specific section (e.g., 'Eat', 'Sleep') from a Wikivoyage page, including subsections or lists.
    
    :param arrondissement: The arrondissement number as a string (e.g., "1st", "2nd").
    :param section: The section to scrape (e.g., "Eat", "Sleep").
    :param lang: Language version of Wikivoyage (default is "en").
    :return: The text content of the section, including all subsections or direct lists.
    """
    url = f"https://{lang}.wikivoyage.org/wiki/Paris/{arrondissement}_arrondissement"
    response = requests.get(url)
    response.encoding = "utf-8"  # Ensure proper encoding
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, "html.parser")
        
        # Locate the section by id
        section_header = soup.find("h2", {"id": section})
        if section_header:
            # Collect all content either from subsections or direct list
            content_text = []
            parent_section = section_header.find_parent("section")
            if parent_section:
                # Case 1: Check for subsections within the parent <section>
                subsections = parent_section.find_all("section")  # Subsections within the main section
                for subsection in subsections:
                    list_items = subsection.find_all("li")
                    for item in list_items:
                        content_text.append(item.get_text(strip=True).replace("\n", " ").replace("\r", " "))

                # Case 2: Check for a direct <ul> within the parent <section>
                direct_list = parent_section.find("ul")
                if direct_list:
                    list_items = direct_list.find_all("li")
                    for item in list_items:
                        content_text.append(item.get_text(strip=True).replace("\n", " ").replace("\r", " "))
            
            return "\n".join(content_text) if content_text else f"No content found for section '{section}'."
        else:
            return f"Section '{section}' not found."
    else:
        return f"Error {response.status_code}: Unable to access the page for '{arrondissement}_arrondissement'."


# Scrape all arrondissements for "Eat" and "Sleep" sections
def scrape_all_arrondissements_wikivoyage():
    """
    Scrape the "Eat" and "Sleep" sections for all 20 arrondissements in Paris from Wikivoyage.
    
    :return: A DataFrame containing the arrondissement, section, and text content.
    """
    arrondissements = [f"{i}th" if i > 3 else f"{i}st" if i == 1 else f"{i}nd" if i == 2 else f"{i}rd" for i in range(1, 21)]
    sections = ["Eat", "Sleep"]  # Sections to scrape
    data = []
    
    for arrondissement in tqdm(arrondissements, desc="Scraping Wikivoyage"):
        for section in sections:
            text = scrape_wikivoyage_section(arrondissement, section)
            data.append({"Arrondissement": arrondissement, "Section": section, "Content": text})
    
    # Convert the data to a DataFrame
    df = pd.DataFrame(data)
    return df


def scrape_all_arrondissements_wikivoyage_json():
    """
    Scrape the "Eat" and "Sleep" sections for all 20 arrondissements in Paris from Wikivoyage
    and save the data as a JSON file.
    """
    arrondissements = [f"{i}th" if i > 3 else f"{i}st" if i == 1 else f"{i}nd" if i == 2 else f"{i}rd" for i in range(1, 21)]
    sections = ["Eat", "Sleep"]  # Sections to scrape
    data = {}

    for arrondissement in tqdm(arrondissements, desc="Scraping Wikivoyage"):
        data[arrondissement] = {}
        for section in sections:
            text = scrape_wikivoyage_section(arrondissement, section)
            data[arrondissement][section] = text

    # Save the data to a JSON file
    with open("wikivoyage_paris_eat_sleep.json", "w", encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False, indent=4)

    print("The data has been saved to 'wikivoyage_paris_eat_sleep.json'.")


#Scrape the data and display/save results
scrape_all_arrondissements_wikivoyage_json()


Scraping Wikivoyage: 100%|██████████| 20/20 [00:06<00:00,  3.24it/s]

The data has been saved to 'wikivoyage_paris_eat_sleep.json'.



