# Projet python pour le Data Scientist 

### THÉLOT Léonard
### TITA Nadir

Importation des modules :

In [52]:
import pandas as pd 
import numpy as np
from bs4 import BeautifulSoup
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

# I. Création de la base de données 

## A . Scrapping du top 100 chaque année en France

In [561]:
def get_top_fr(year, size = 100):
    page = requests.get('https://snepmusique.com/les-tops/le-top-de-lannee/top-singles-annee/?annee='+str(year))
    soup = BeautifulSoup(page.content, 'html.parser')
    artistes = []
    titres = []
    
    n = min(size, len(soup.find_all('div', class_='artiste')))
    
    for i in range(n):
        artiste = soup.find_all('div', class_='artiste')[i].get_text()
        titre = soup.find_all('div', class_='titre')[i].get_text()
        
        artistes.append(artiste)
        titres.append(titre)
        
    df = pd.DataFrame({'Artiste' : artistes, 'Titre' : titres, 'Année' : [year]*n}, index=range(1,n+1))
    
    return(df)   

In [562]:
def get_tops_fr(year_inf, year_sup, size = 100):
    if year_inf == year_sup:
        return(get_top_fr(year_inf, size))
    else: 
        return(pd.concat([get_tops_fr(year_inf,year_sup -1,size), get_top_fr(year_sup,size)]))

## B . Top 100 aux USA 

In [6]:
def get_top_us(year, size = 100):
    page = requests.get('https://www.billboard.com/charts/year-end/'+str(year)+'/hot-100-songs')
    soup = BeautifulSoup(page.content, 'html.parser')
    artistes = []
    titres = []
    
    n = min(size, len(soup.find_all('div', class_= "ye-chart-item__title" )))
    
    for i in range(n):
        artiste = soup.find_all('div', class_="ye-chart-item__artist")[i].get_text()[1:-1]
        titre = soup.find_all('div', class_="ye-chart-item__title")[i].get_text()[1:-1]
        
        artistes.append(artiste)
        titres.append(titre)
        
    df = pd.DataFrame({'Artiste' : artistes, 'Titre' : titres, 'Année' : [year]*n}, index=range(1,n+1))
    
    return(df)

In [7]:
def get_tops_us(year_inf, year_sup, size = 100):
    if year_inf == year_sup:
        return(get_top_us(year_inf, size))
    else: 
        return(pd.concat([get_tops_us(year_inf,year_sup -1,size), get_top_us(year_sup,size)]))

## C. Changement de la forme 

In [8]:
def clean(data):
    df = data.copy()
    df['Rang']=data.index
    df.reset_index(inplace=True)
    df.drop(['index'], axis = 1, inplace = True)
    df[['Artiste', 'Titre']] = df[['Artiste', 'Titre']].apply(lambda x: x.str.lower().str.replace(',','').str.replace('feat.', '').str.replace('featuring','').str.replace('&','').str.replace('.',' ').str.strip())
    return(df)
    

In [12]:
us = clean(get_tops_us(2006,2019))

In [13]:
us

Unnamed: 0,Artiste,Titre,Année,Rang
0,daniel powter,bad day,2006,1
1,sean paul,temperature,2006,2
2,nelly furtado ring timbaland,promiscuous,2006,3
3,james blunt,you're beautiful,2006,4
4,shakira ring wyclef jean,hips don't lie,2006,5
...,...,...,...,...
1393,chase rice,eyes on you,2019,96
1394,dan + shay,all to myself,2019,97
1395,ariana grande social house,boyfriend,2019,98
1396,p!nk,walk me home,2019,99


## D. Ajout des features issus de Spotify

In [53]:
cid ="d6440636c8de42f9a31db8c168ecfd01" 
secret = "ad9b78a3088b4acbbd61540f90405692"

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [17]:
def search_id_track(df, market):
    
    d = {}
    dff = df.copy()

    for x in list(dff.index):

        artiste = dff['Artiste'][x]
        titre = dff['Titre'][x]
        year = dff['Année'][x]
        
        track_results = sp.search(q=f'track:{titre}', type='track', market = str(market))
    
        for i in range(min(5,len(track_results['tracks']['items']))):
            if x in d.keys():
                break
            for word in artiste.split():
                if word in track_results['tracks']['items'][i]['artists'][0]['name'].lower():
                    d[x] = track_results['tracks']['items'][i]['id'], track_results['tracks']['items'][i]['popularity']
                    break
                 
    dff = df.drop(d.keys(),axis=0)

    
    for x in list(dff.index):
        
    
        artiste = dff['Artiste'][x]
        titre = dff['Titre'][x]
        year = dff['Année'][x]
        
        track_results = sp.search(q=f'artist:{artiste}' , type='track', market = str(market), limit = 30)
        
        for i in range(min(30,len(track_results['tracks']['items']))):
            
            if x in d.keys():
                break
            
            else:
                
                n = min(3, len(titre.split()))
                titre_reduce = ' '.join(titre.split()[:n])
                if titre_reduce in track_results['tracks']['items'][i]['name'].lower() or track_results['tracks']['items'][i]['name'].lower() in titre_reduce:
                    d[x] = track_results['tracks']['items'][i]['id'], track_results['tracks']['items'][i]['popularity']
    
    dff = df.drop(d.keys(),axis=0)
                    
    print(f"pourcentage de données manquantes : {100*dff.shape[0]/df.shape[0]} %")
    
    ids = pd.DataFrame(d.values(), index = d.keys(), columns =['id', 'popularity'])
    
    df_id = df.drop(list(dff.index),axis=0)
    
    return(pd.concat([df_id,ids],axis=1))
        
    

In [20]:
us_ids = search_id_track(us, market = 'US')

pourcentage de données manquantes : 6.509298998569385 %


In [22]:
us_ids.groupby('Année').size()/us.groupby('Année').size()*100

Année
2006    90.000000
2007    95.000000
2008    94.000000
2009    90.000000
2010    87.000000
2011    91.919192
2012    95.000000
2013    89.000000
2014    93.000000
2015    94.000000
2016    97.979798
2017    98.000000
2018    97.000000
2019    97.000000
dtype: float64

In [23]:
def get_features(df):
    rows = []
    batchsize = 100
    None_counter = 0
    for i in range(0,df.shape[0],batchsize):
        batch = df['id'][i:i+batchsize]
        feature_results = sp.audio_features(batch)
        
        for i, t in enumerate(feature_results):
            if t == None:
                None_counter = None_counter + 1
            else:
                rows.append(t)
    print(None_counter)
    features = pd.DataFrame.from_dict(rows,orient='columns')
    columns_to_drop = ['analysis_url','track_href','type','uri']
    features.drop(columns_to_drop, axis=1,inplace=True)
    musics = pd.merge(df,features,on='id')
    
    return(musics.drop_duplicates(keep='first'))
    
    

In [24]:
us_features = get_features(us_ids)

0


In [559]:
musics.to_csv('/Users/Nadir/Documents/Data Science/Projets/spotify/musics.csv')

In [27]:
us_features.to_csv('/Users/Nadir/Documents/Data Science/Projets/spotify/top100us.csv')

##  E. Sons en vrac labélisés

In [86]:
dico = {'jazz' : ['37i9dQZF1DXbITWG1ZJKYt', '37i9dQZF1DX4wta20PHgwo'], 
        'rock' : ['37i9dQZF1DWXRqgorJj26U', '37i9dQZF1DX2xKCIjknUoQ'], 
        'classique' : ['7IFEOepzcIUmsD5aUTivyX', '62MgIbD9qUykkaD8dbRNK4'],
        'rap_fr' : ['37i9dQZF1DWSrqNVMcxGKc', '6cZmC8TNyvDc7FQR9JIaH0'],
        'rnb' : ['37i9dQZF1DX9UuQbl12Nmb', '37i9dQZF1DX6VDO8a6cQME'],
       'country' : ['37i9dQZF1DXaiEFNvQPZrM', '37i9dQZF1DWWnpcjfCqaW0'],
       'funk' : ['37i9dQZF1DWUS3jbm4YExP', '37i9dQZF1DX4WgZiuR77Ef'],
       'drill' : ['0SukvgmzHzCjVCgkvSLjbW', '5GKWfQHnF31SrDj2iFB0e6']}

In [89]:
def create_df(dico):
    categorie = []
    artiste = []
    ids = []
    titre = []
    for j in dico.keys():
        for idd in dico[j]:
            playlist = sp.user_playlist(cid, idd, 'tracks')
            data = playlist['tracks']
            for track in data['items']:
                ids.append(track['track']['id'])
                titre.append(track['track']['name'])
                b=''
                for artist in track['track']['artists']:
                    b+=str(' '+artist['name'])
                artiste.append(b)
                categorie.append(j)
            
    df = pd.DataFrame(list(zip(artiste,titre,categorie,ids)),columns = ['Artiste', 'Titre', 'Catégorie', 'id'])
    
    return(get_features(df))     

In [91]:
vrac = create_df(dico)

0


In [92]:
vrac.to_csv('/Users/Nadir/Documents/Data Science/Projets/spotify/vrac.csv')

## F. Recommendation non supervisé

In [84]:
def get_taratata():
    
    artistes = []
    for letter in list(map(chr, range(97, 123))):
        page = requests.get('https://mytaratata.com/artistes/index/'+letter)
        soup = BeautifulSoup(page.content, 'html.parser')
        for i in range(len(soup.find_all('p',class_= "artist ellipsis"))):
            artiste = soup.find_all('p',class_= "artist ellipsis")[i].get_text()
            artistes.append(artiste)
    
    artist_name= []
    track_name = []
    track_id = []
    for artiste in artistes:
        
        track_results = sp.search(q=f'artist:{artiste}', type='track')
        t = track_results['tracks']['items']
        
        for i in range(min(3, len(t))):
            
            artist_name.append(t[i]['artists'][0]['name'])
            track_name.append(t[i]['name']) 
            track_id.append(t[i]['id'])
    
    df =  pd.DataFrame({'Artiste':artist_name,'Titre':track_name,'id':track_id})
    
    
    rows = []
    batchsize = 100
    None_counter = 0
    for i in range(0,len(df['id']),batchsize):
        batch = df['id'][i:i+batchsize]
        feature_results = sp.audio_features(batch)
        for i, t in enumerate(feature_results):
            if t == None:
                None_counter = None_counter + 1
            else:
                rows.append(t)
    
    df_audio_features = pd.DataFrame.from_dict(rows,orient='columns')
    columns_to_drop = ['analysis_url','track_href','type','uri']
    df_audio_features.drop(columns_to_drop, axis=1,inplace=True)
    df = pd.merge(df,df_audio_features,on='id',how='inner')
    
    return(df)
        


In [85]:
df_taratata = get_taratata()

In [86]:
df_taratata

Unnamed: 0,Artiste,Titre,id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Bob Marley & The Wailers,Three Little Birds,6A9mKXlFRPMPem6ygQSt7z,0.814,0.482,9,-10.493,1,0.0588,0.011100,0.000002,0.0476,0.6150,148.404,180267,4
1,Bob Marley & The Wailers,Three Little Birds,6A9mKXlFRPMPem6ygQSt7z,0.814,0.482,9,-10.493,1,0.0588,0.011100,0.000002,0.0476,0.6150,148.404,180267,4
2,Bob Marley & The Wailers,Three Little Birds,6A9mKXlFRPMPem6ygQSt7z,0.814,0.482,9,-10.493,1,0.0588,0.011100,0.000002,0.0476,0.6150,148.404,180267,4
3,Bob Marley & The Wailers,Three Little Birds,6A9mKXlFRPMPem6ygQSt7z,0.814,0.482,9,-10.493,1,0.0588,0.011100,0.000002,0.0476,0.6150,148.404,180267,4
4,Katrina & The Waves,Walking On Sunshine,05wIrZSwuaVWhcv5FfqeH0,0.596,0.869,10,-11.970,1,0.0370,0.011600,0.173000,0.0678,0.9440,109.902,238733,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,Zoe Wees,Control,7Lnivdhb8OxqruPiAGXAjC,0.600,0.465,4,-4.951,1,0.0314,0.643000,0.000001,0.1490,0.0682,113.810,230880,4
3825,Zoe Wees,Control - NOTD Remix,3RWep52LPJvB43tMLpd1eh,0.654,0.816,4,-5.170,1,0.0327,0.107000,0.000009,0.0770,0.4860,117.876,201710,4
3826,Zucchero,Senza Una Donna - English Version,4VPcqPaOwnVcxqy9q3b0r7,0.610,0.436,9,-10.520,1,0.0304,0.235000,0.000000,0.1880,0.5080,78.631,269067,4
3827,Zucchero,Baila Morena - Spanish Version,3YAVUsXqn7SPLsp8gVkjP5,0.713,0.769,0,-5.120,1,0.0379,0.000812,0.013700,0.1160,0.7730,119.894,245213,4


In [87]:
df_taratata.to_csv('/Users/Nadir/Documents/Data Science/Projets/spotify/taratata.csv')