Test récupération data

In [11]:
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re

# ... previous code ...


def fetch_episode_data(url):
    # Récupération de la page web
    response = requests.get(url, headers=headers)
    response.raise_for_status()  # s'assure que la requête a réussi
    soup = BeautifulSoup(response.content, 'html.parser')

    # Récupérer le mois courant pour filtrer les épisodes
    current_month = datetime.now().strftime("%B").lower()

    data = []

    # Find all the 'td' elements with the class 'floatleftmobile td_jour'
    days = soup.find_all('td', class_='floatleftmobile td_jour')

    for day in days:
        class_regex = re.compile("div_jour(courant)?")
        # Find the date within the 'div' with class 'div_jour'
        date_div = day.find('div', class_=class_regex)
        if date_div:
            date_text = date_div.get('id')

            # Find all the series entries within the 'span' with class 'calendrier_episodes'
            episodes = day.find_all('span', class_='calendrier_episodes')
            for episode in episodes:
                # Extracting information using the tags and structure provided
                series_info = episode.find('a', style=True)
                episode_info = episode.find('a', class_='liens')
                country_img = episode.find_previous_sibling('img')
                network_img = country_img.find_next_sibling('img')

                name = series_info.get('title')
                episode_detail = episode_info.get('alt')
                origin_country = country_img.get(
                    'alt') if country_img else None
                network = network_img.get('alt') if network_img else None
                episode_url = episode_info.get('href')

                # Parse out the season and episode numbers from the text
                season_episode_match = re.search(
                    r'saison (\d+) episode (\d+)', episode_detail)
                if season_episode_match:
                    season_num = int(season_episode_match.group(1))
                    episode_num = int(season_episode_match.group(2))

                    # Add to the data list
                    episode_data = {
                        'name': name,
                        'season_num': season_num,
                        'episode_num': episode_num,
                        # Format the date as required
                        'date': date_text,
                        'origin_country': origin_country,
                        'network': network,
                        'episode_url': episode_url
                    }
                    data.append(episode_data)

    return data


# URL du site
url = 'https://www.spin-off.fr/calendrier_des_series.html'
episode_data = fetch_episode_data(url)

# Afficher les données récupérées
for episode in episode_data:
    print(episode)


{'name': '4 Estrellas', 'season_num': 1, 'episode_num': 110, 'date': 'jour_01-11-2023', 'origin_country': 'TVE', 'network': 'Etats-Unis', 'episode_url': 'episode110-411772-01112023-saison1-4-Estrellas.html'}
{'name': 'Black Cake', 'season_num': 1, 'episode_num': 1, 'date': 'jour_01-11-2023', 'origin_country': 'Hulu', 'network': 'Etats-Unis', 'episode_url': 'episode01-410147-01112023-saison1-Black-Cake.html'}
{'name': 'Black Cake', 'season_num': 1, 'episode_num': 2, 'date': 'jour_01-11-2023', 'origin_country': 'Hulu', 'network': 'Etats-Unis', 'episode_url': 'episode02-410148-01112023-saison1-Black-Cake.html'}
{'name': 'Black Cake', 'season_num': 1, 'episode_num': 3, 'date': 'jour_01-11-2023', 'origin_country': 'Hulu', 'network': 'Italie', 'episode_url': 'episode03-410149-01112023-saison1-Black-Cake.html'}
{'name': 'Blanca', 'season_num': 2, 'episode_num': 5, 'date': 'jour_01-11-2023', 'origin_country': 'Rai 1', 'network': 'Canada', 'episode_url': 'episode05-411693-01112023-saison2-Blanc