# Scraping du site de la SNEP : Top Albums hebdomadaire

Import de BeautifulSoup, requests, pandas, numpy, des modules standard (re, time) et de la classe Album

In [1]:
import re
import time

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from album import Album

Récupération des données du site (top albums) et initialisation de BeautifulSoup

In [2]:
# Download the weekly top-albums page and parse it.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of silently
# parsing an error page as if it were the chart.
request_text = requests.get(
    'https://snepmusique.com/les-tops/le-top-de-la-semaine/top-albums/',
    timeout=30,
)
request_text.raise_for_status()
soup = BeautifulSoup(request_text.content, 'html.parser')

Création d'un DataFrame avec les colonnes relatives à un album

In [3]:
# Empty frame with one column per album attribute; rows are added later.
album_fields = [
    'rank',
    'trend',
    'title',
    'artist',
    'editor',
    'last_week_rank',
    'week_in',
    'best_rank',
    'certification',
    'certification_date',
]
df = pd.DataFrame(columns=album_fields)

Parcours de la liste d'albums du site et création des objets Album (le DataFrame est rempli à l'étape suivante, avec les certifications)

In [4]:
def _leading_int(text):
    """Return the integer at the start of ``text`` (e.g. '1er' -> 1, '12e' -> 12).

    Parsing the leading digits avoids guessing the length of the ordinal
    suffix, and tolerates leading whitespace.

    Raises:
        ValueError: if ``text`` does not start with a number.
    """
    match = re.match(r'\s*(\d+)', text)
    if match is None:
        raise ValueError(f'no leading number in {text!r}')
    return int(match.group(1))


def _strong_text(tag):
    """Return the text of the first <strong> element found inside ``tag``."""
    return tag.find('strong').get_text()


top_albums = []
limit = 5  # maximum number of chart entries to read from the page

for element in soup.find_all('div', class_='item', limit=limit):

    rank = element.find('div', class_='rang')
    trend_up = element.find('div', class_='rang_up icon-bigarrowup')
    trend_down = element.find('div', class_='rang_down icon-bigarrowdown')
    title = element.find('div', class_='titre')
    artist = element.find('div', class_='artiste')
    editor = element.find('div', class_='editeur')
    last_week_rank = element.find('div', class_='rang_precedent')
    week_in = element.find('div', class_='week_in')
    best_rank = element.find('div', class_='best_pos')

    # The trend is encoded by the presence of an up or down arrow element.
    if trend_up is not None:
        trend = 'Up'
    elif trend_down is not None:
        trend = 'Down'
    else:
        trend = 'Neutral'

    # Entries without a previous-rank element get 0 as their last-week rank.
    previous_rank = 0 if last_week_rank is None else _leading_int(_strong_text(last_week_rank))

    # New entries carry a label instead of a week count; they are stored as 0.
    week_in_text = _strong_text(week_in)
    weeks_in_chart = 0 if week_in_text == 'Nouvelle entrée' else _leading_int(week_in_text)

    top_albums.append(Album(
        rank.get_text(),
        trend,
        title.get_text(),
        artist.get_text(),
        editor.get_text(),
        previous_rank,
        weeks_in_chart,
        _leading_int(_strong_text(best_rank)),
        None,  # certification, filled in by the certification step
        None   # certification_date, filled in by the certification step
    ))

Récupération des certifications

In [5]:
# Look up the certification of every scraped album, then build the DataFrame.
# Rows are collected in a list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0, and rebuilding the frame makes
# this cell give the same result when it is re-run.
records = []

for album in top_albums:
    # Three-step progress animation; together with the final sleep it also
    # spaces out the requests sent to the site.
    print()
    for dots in ('.', '..', '...'):
        print(f'Scraping certification (album {album.rank}/{limit}){dots}', end='\r')
        time.sleep(1)

    # Passing the filters through `params` lets requests URL-encode them, so
    # titles or artists containing '=', '&' or accents cannot corrupt the query.
    # Distinct names are used so the chart page held in `soup` is not overwritten.
    certif_response = requests.get(
        'https://snepmusique.com/les-certifications/',
        params={'categorie': 'Albums', 'interprete': album.artist, 'titre': album.title},
        timeout=30,
    )
    certif_soup = BeautifulSoup(certif_response.content, 'html.parser')

    # Fields are read only when the page contains a certification entry; each
    # lookup is guarded so a missing element yields '' instead of an exception.
    has_certification = certif_soup.find('div', class_='certification') is not None
    certif_tag = certif_soup.find('div', class_='certif') if has_certification else None
    date_tags = certif_soup.find_all('div', class_='date') if has_certification else []

    album.certification = certif_tag.get_text() if certif_tag is not None else ''
    # The second div.date is used and its first 15 characters are dropped —
    # presumably a fixed label preceding the date; TODO confirm against the page.
    album.certification_date = date_tags[1].get_text()[15:] if len(date_tags) > 1 else ''

    records.append({
        'rank': album.rank,
        'trend': album.trend,
        'title': album.title,
        'artist': album.artist,
        'editor': album.editor,
        'last_week_rank': album.last_week_rank,
        'week_in': album.week_in,
        'best_rank': album.best_rank,
        'certification': album.certification,
        'certification_date': album.certification_date
    })

    time.sleep(3)

# Reuse the column layout of the existing frame so the schema stays defined in one place.
df = pd.DataFrame(records, columns=df.columns)

print()
print("Done.")

scraping album 1/200 ...
scraping album 2/200 ...
scraping album 3/200 ...
scraping album 4/200 ...
scraping album 5/200 ...
finished !


Conversion des types de colonnes et aperçu du DataFrame

In [6]:
# Cast the columns to explicit dtypes in one pass, then show the result.
# 'certification_date' is not cast and keeps its current dtype.
column_dtypes = {
    'rank': 'int',
    'trend': 'string',
    'title': 'string',
    'artist': 'string',
    'editor': 'string',
    'last_week_rank': 'int',
    'week_in': 'int',
    'best_rank': 'int',
    'certification': 'string',
}
df = df.astype(column_dtypes)

df.dtypes

rank                   int64
trend                 string
title                 string
artist                string
editor                string
last_week_rank         int64
week_in                int64
best_rank              int64
certification         string
certification_date    object
dtype: object

In [7]:
# Display the DataFrame as the cell output.
df

Unnamed: 0,rank,trend,title,artist,editor,last_week_rank,week_in,best_rank,certification,certification_date
0,1,Neutral,JEFE,NINHO,REC. 118 / MAL LUNE MUSIC,0,0,1,Or,09/12/2021
1,2,Down,CIVILISATION,ORELSAN,3EME BUREAU,1,2,1,Double Platine,09/12/2021
2,3,Down,30,ADELE,COLUMBIA,2,2,2,Platine,09/12/2021
3,4,Neutral,NONANTE-CINQ,ANGÈLE,ROMANCE MUSIQUE / ANGELE VL RECORDS,0,0,4,,
4,5,Down,=,ED SHEERAN,WEA / EAST WEST UK,4,5,1,Or,18/11/2021
