In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from dateutil import parser

In [None]:
# Authors to scrape, mapped to the URL of their English Wikipedia article.
# Every URL is the wiki base path followed by the author's name with spaces
# replaced by underscores, so the mapping is derived from the names alone.
authors_pages = {
    author_name: "https://en.wikipedia.org/wiki/" + author_name.replace(" ", "_")
    for author_name in (
        "Jack London",
        "Romain Gary",
        "Fyodor Dostoevsky",
        "Stefan Zweig",
        "Joseph Kessel",
        "Victor Hugo",
        "Jules Verne",
        "Edgar Allan Poe",
        "Leo Tolstoy",
        "George Orwell",
    )
}

# Accumulator for the records extracted from each page
authors_data = list()

# Extract an author's name and life dates from a Wikipedia article
def scrape_wikipedia(url, timeout=10):
    """Download a Wikipedia article and extract the name, birth and death dates.

    Args:
        url: Address of the article to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block the whole run.

    Returns:
        A dict with the keys "Name", "Birth Date" and "Death Date", or None
        when the page could not be fetched (network failure or a status code
        other than 200). Any field that cannot be located in the page is
        set to "N/A".
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report network failures the same way as a bad status code so the
        # caller can skip this page and carry on with the remaining ones.
        print(f"Erreur lors de l'accès à la page : {url}")
        return None

    if response.status_code != 200:
        print(f"Erreur lors de l'accès à la page : {url}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')

    # Full name: text of the article's title heading
    heading = soup.find('h1', {'id': 'firstHeading'})
    name = heading.text.strip() if heading is not None else "N/A"

    # Birth date: text of the first <span class="bday"> element, if present
    bday_span = soup.find('span', {'class': 'bday'})
    birth_date = bday_span.text.strip() if bday_span is not None else "N/A"

    # Death date: taken from the infobox row mentioning "Died" or "Death"
    death_date = "N/A"
    infobox = soup.find('table', {'class': 'infobox'})
    if infobox:
        for row in infobox.find_all('tr'):
            if 'Died' in row.text or 'Death' in row.text:
                death_date_cell = row.find('td')
                if death_date_cell:
                    # Keep only the text before the first "(" to drop ages
                    # and other parenthesised annotations.
                    death_date = death_date_cell.text.strip().split('(')[0].strip()
                    # Stop only once a value was found; a matching row
                    # without a data cell must not end the search.
                    break

    return {
        "Name": name,
        "Birth Date": birth_date,
        "Death Date": death_date,
    }

# Scrape each author's Wikipedia page, keeping only successful extractions
for author, url in authors_pages.items():
    print(f"Scraping data for {author}...")
    data = scrape_wikipedia(url)
    if not data:
        # Nothing usable came back for this page; move on to the next one
        continue
    authors_data.append(data)

# Build a DataFrame from the collected records
authors_date = pd.DataFrame(data=authors_data)

# Write the table to a CSV file (without the index column)
authors_date.to_csv('authors_wikipedia_corrected.csv', index=False)
print("Données corrigées sauvegardées dans 'authors_wikipedia_corrected.csv'.")

# Show the resulting table
print(authors_date)


Scraping data for Jack London...
Scraping data for Romain Gary...
Scraping data for Fyodor Dostoevsky...
Scraping data for Stefan Zweig...
Scraping data for Joseph Kessel...
Scraping data for Victor Hugo...
Scraping data for Jules Verne...
Scraping data for Edgar Allan Poe...
Scraping data for Leo Tolstoy...
Scraping data for George Orwell...
Données corrigées sauvegardées dans 'authors_wikipedia_corrected.csv'.
                Name  Birth Date         Death Date
0        Jack London  1876-01-12  November 22, 1916
1        Romain Gary  1914-05-21    2 December 1980
2  Fyodor Dostoevsky  1821-11-11    9 February 1881
3       Stefan Zweig  1881-11-28   22 February 1942
4      Joseph Kessel  1898-02-10       23 July 1979
5        Victor Hugo  1802-02-26        22 May 1885
6        Jules Verne  1828-02-08      24 March 1905
7    Edgar Allan Poe  1809-01-19    October 7, 1849
8        Leo Tolstoy  1828-09-09   20 November 1910
9      George Orwell  1903-06-25    21 January 1950


In [None]:

# Reload the records written to CSV by the scraping step
corrected_csv_path = 'authors_wikipedia_corrected.csv'
authors_date = pd.read_csv(corrected_csv_path)

# Convert a date string to the standard YYYY-MM-DD format
def standardize_date(date_str):
    """Normalise a free-form date string to the "YYYY-MM-DD" format.

    Args:
        date_str: Date text in any layout understood by
            ``dateutil.parser.parse`` (e.g. "November 22, 1916").

    Returns:
        The date formatted as "YYYY-MM-DD", or the string "N/A" when the
        value cannot be interpreted as a date. This includes non-string
        values such as the NaN that pandas produces for missing CSV fields.

    Note:
        ``dateutil`` fills date components that are absent from the input
        from a default date (today, when none is given), so a partial date
        such as a bare year still yields a full date.
    """
    try:
        # Parse the text, then render it in the target format
        parsed_date = parser.parse(date_str)
        return parsed_date.strftime('%Y-%m-%d')
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparseable text; TypeError: non-string input;
        # OverflowError: date outside the representable range.
        # Anything else (e.g. KeyboardInterrupt, NameError) is a real
        # problem and is deliberately allowed to propagate.
        return "N/A"

# Standardise both date columns with the same conversion
for date_column in ('Birth Date', 'Death Date'):
    authors_date[date_column] = authors_date[date_column].apply(standardize_date)

# Write the standardised table to a new CSV file
authors_date.to_csv('authors_wikipedia_standardized.csv', index=False)

# Show the standardised table
print(authors_date)


                Name  Birth Date  Death Date
0        Jack London  1876-01-12  1916-11-22
1        Romain Gary  1914-05-21  1980-12-02
2  Fyodor Dostoevsky  1821-11-11  1881-02-09
3       Stefan Zweig  1881-11-28  1942-02-22
4      Joseph Kessel  1898-02-10  1979-07-23
5        Victor Hugo  1802-02-26  1885-05-22
6        Jules Verne  1828-02-08  1905-03-24
7    Edgar Allan Poe  1809-01-19  1849-10-07
8        Leo Tolstoy  1828-09-09  1910-11-20
9      George Orwell  1903-06-25  1950-01-21


In [15]:
# Convert both date columns to datetime; values that cannot be parsed
# become NaT because of errors='coerce'
for date_column in ('Birth Date', 'Death Date'):
    authors_date[date_column] = pd.to_datetime(authors_date[date_column], errors='coerce')


In [16]:
# Rename the date columns to match the MySQL table
mysql_column_names = {'Birth Date': 'BirthDate', 'Death Date': 'DeathDate'}
authors_date.rename(columns=mysql_column_names, inplace=True)

# Check the resulting column names
print(authors_date.columns)


Index(['Name', 'BirthDate', 'DeathDate'], dtype='object')


In [17]:
# Export the final table as a UTF-8 encoded CSV file without the index column
authors_date.to_csv(
    'authors_date_table.csv',
    encoding='utf-8',
    index=False,
)

In [18]:
# Display the final table as the cell's output
authors_date

Unnamed: 0,Name,BirthDate,DeathDate
0,Jack London,1876-01-12,1916-11-22
1,Romain Gary,1914-05-21,1980-12-02
2,Fyodor Dostoevsky,1821-11-11,1881-02-09
3,Stefan Zweig,1881-11-28,1942-02-22
4,Joseph Kessel,1898-02-10,1979-07-23
5,Victor Hugo,1802-02-26,1885-05-22
6,Jules Verne,1828-02-08,1905-03-24
7,Edgar Allan Poe,1809-01-19,1849-10-07
8,Leo Tolstoy,1828-09-09,1910-11-20
9,George Orwell,1903-06-25,1950-01-21
