## Scraping [1/2]

In [7]:
import requests
from bs4 import BeautifulSoup


def strToInt(str):
  try:
    return int(str)
  except:
    return 0


def elementToObject(element, date):
  episodeNameElements = element.findAll('a')
  seasonAndEpisode = episodeNameElements[1].text.split('.')
  channelElement = element.find_previous_sibling('img')
  contryElements = channelElement.find_previous_sibling('img')

  object = {
    'name': episodeNameElements[0].text,
    'episode': strToInt(seasonAndEpisode[1]),
    'season': strToInt(seasonAndEpisode[0]),
    'date': date,
    'country': contryElements.get('alt'),
    'channel': channelElement.get('alt'),
    'url': episodeNameElements[0].get('href')
  }
  return object


url = 'https://www.spin-off.fr/calendrier_des_series.html'
response = requests.get(url)
content = response.content
page = BeautifulSoup(content, 'html')
currentMonth = page.find_all('td', class_=['floatleftmobile td_jour', 'floatleftmobile td_jour td_jourcourant'])


episodes = []
for day in currentMonth:
  dateElement = day.find('div', class_=['div_jour', 'div_jourcourant fond_degrade_v4'])
  if dateElement:
    date = dateElement.get('id').split('_')[1]
    episodesElements = day.find_all('span', class_=['calendrier_episodes'])
    for episodeElement in episodesElements:
      episodes.append(elementToObject(episodeElement, date))

print(episodes)

[{'name': '4 Estrellas', 'episode': 110, 'season': 1, 'date': '01-11-2023', 'country': 'Espagne', 'channel': 'TVE', 'url': 'serie-15345-4-Estrellas.html'}, {'name': 'Black Cake', 'episode': 1, 'season': 1, 'date': '01-11-2023', 'country': 'Etats-Unis', 'channel': 'Hulu', 'url': 'serie-15614-Black-Cake.html'}, {'name': 'Black Cake', 'episode': 2, 'season': 1, 'date': '01-11-2023', 'country': 'Etats-Unis', 'channel': 'Hulu', 'url': 'serie-15614-Black-Cake.html'}, {'name': 'Black Cake', 'episode': 3, 'season': 1, 'date': '01-11-2023', 'country': 'Etats-Unis', 'channel': 'Hulu', 'url': 'serie-15614-Black-Cake.html'}, {'name': 'Blanca', 'episode': 5, 'season': 2, 'date': '01-11-2023', 'country': 'Italie', 'channel': 'Rai 1', 'url': 'serie-14708-Blanca.html'}, {'name': 'Children Ruin Everything', 'episode': 6, 'season': 3, 'date': '01-11-2023', 'country': 'Canada', 'channel': 'CTV', 'url': 'serie-13789-Children-Ruin-Everything.html'}, {'name': 'Chucky', 'episode': 5, 'season': 3, 'date': '01

## Fichiers

In [None]:
def objectToStringforCsv(episode):
  return f'{episode['name']},{episode['episode']},{episode['season']},{episode['date']},{episode['country']},{episode['channel']},{episode['url']}\n'

strEpisodes = [
  objectToStringforCsv(episode)
  for episode in episodes
]

with open('./data/files/episodes.csv', mode='w+', encoding='utf-8') as file:
  file.write(''.join(strEpisodes))

In [32]:
def lineToTuples(line):
  values = line.split(',')
  print(values)

# la method "arrayToEpisode" permet de récupérer les données depuis le fichier CSV plutôt que de refaire le scrapping du site.
def arrayToEpisode(values):
  return {
    'name': values[0],
    'episode': int(values[1]),
    'season': int(values[2]),
    'date': values[3],
    'country': values[4],
    'channel': values[5],
    'url': values[6]
  }

typeTuples = []
episodes = []
with open('./data/files/episodes.csv', 'r', encoding='utf-8') as file:
  for line in file:
    types = []
    values = line.strip().split(',')
    episodes.append(arrayToEpisode(values))
    for value in values:
      try:
        value = int(value)
      except:
        value = value
      types.append(type(value).__name__)
    typeTuples.append(tuple(types))

print(typeTuples[0])

('str', 'int', 'int', 'str', 'str', 'str', 'str')


## SQL [1/2]

In [11]:
import sqlite3

db = sqlite3.connect('./data/databases/database.db')
cursor = db.cursor()

tuples = [
  tuple(episode.values())
  for episode in episodes
]

req = 'insert into Episode (name, episode, season, date, country, channel, url) values (?, ?, ?, ?, ?, ?, ?)'
cursor.executemany(req, tuples) # /!\ Ne pas relancer cette partie du code si les données sont déjà présente dans la base
db.commit()
db.close()

## Algorithmie [1/2]

In [6]:
import sqlite3

def countBy(key):
  db = sqlite3.connect('./data/databases/database.db')
  cursor = db.cursor()

  req = f'''
    select {key}, count(*) as length
    from Episode
    group by {key}
    order by length desc
    limit 3
  '''

  res = cursor.execute(req).fetchall()
  db.commit()
  db.close()

  return res



print(countBy('channel'))
print(countBy('country'))

[('Netflix', 75), ('Disney+', 33), ('Apple TV+', 18)]
[('Etats-Unis', 264), ('Canada', 37), ('France', 21)]


In [33]:
import sqlite3

db = sqlite3.connect('./data/databases/database.db')
cursor = db.cursor()

req = '''
  select name
  from Episode
  group by name
'''

res = cursor.execute(req).fetchall()
db.commit()
db.close()

counter = {}

for nameInArray in res:
  name = nameInArray[0]
  for word in name.split(' '):
    word = word.lower()
    try:
      counter[word] += 1
    except:
      counter[word] = 1

sortedCounter = dict(sorted(counter.items(), key=lambda item: item[1], reverse=True))

print(sortedCounter)

{'the': 19, 'of': 4, 'all': 4, '(2023)': 4, 'de': 3, 'murder': 2, 'at': 2, 'and': 2, 'legacy': 2, 'family': 2, 'tout': 2, '(2022)': 2, 'la': 2, 'un': 2, '4': 1, 'estrellas': 1, 'a': 1, 'end': 1, 'world': 1, 'creatures': 1, 'great': 1, 'small': 1, '(2020)': 1, 'rise': 1, 'light': 1, 'we': 1, 'cannot': 1, 'see': 1, 'anderson': 1, 'spider': 1, 'silva': 1, 'moment': 1, '(ci': 1, 'shi': 1, 'ci': 1, 'ke)': 1, 'beacon': 1, '23': 1, 'billy': 1, 'kid': 1, 'black': 1, 'cake': 1, 'blackberry': 1, 'blanca': 1, 'blue': 1, 'eye': 1, 'samurai': 1, "bob's": 1, 'burgers': 1, 'boomer': 1, 'bosch:': 1, 'bros': 1, 'children': 1, 'ruin': 1, 'everything': 1, 'chucky': 1, "cooper's": 1, 'bar': 1, 'culprits': 1, 'dna': 1, 'do': 1, 'crime': 1, 'daily': 1, 'dose': 1, 'sunshine': 1, 'demain': 1, 'nous': 1, 'appartient': 1, 'deutsches': 1, 'haus': 1, 'doctor': 1, 'who': 1, '(2005)': 1, 'doom': 1, 'patrol': 1, 'en': 1, 'helt': 1, 'vanlig': 1, 'familj': 1, 'entre': 1, 'tierras': 1, 'guy': 1, 'law': 1, '(can)': 1, '

## Scraping [2/2]

In [31]:
import time
import requests
from bs4 import BeautifulSoup

# for episode in episodes:
for index in range(10):
  episode = episodes[index]
  episodeName = episode
  url = f'https://www.spin-off.fr/{episode['url']}'

response = requests.get(url)
content = response.content
page = BeautifulSoup(content, 'html')
currentMonth = page.find_all('table', id_='series_liste_saisons')
print(currentMonth)

[]


## SQL [2/2]

## Algorithmie [2/2]

## Orchestration