In [50]:
import json
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.metrics.pairwise import cosine_similarity
import yt_dlp
from essentia.standard import MusicExtractor
import requests
from fuzzywuzzy import process
from requests.auth import HTTPBasicAuth
from collections import Counter
import os
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import spacy

# Dataset

In [51]:
df = pd.read_csv("hf://datasets/maharshipandya/spotify-tracks-dataset/dataset.csv")
# df = pd.read_csv('RAW_DATASET.csv')

In [52]:
df.to_csv('RAW_DATASET.csv')

In [53]:
df.head

<bound method NDFrame.head of         Unnamed: 0                track_id                 artists  \
0                0  5SuOikwiRyPMVoIQDJUgSV             Gen Hoshino   
1                1  4qPNDBW1i3p13qLCt0Ki3A            Ben Woodward   
2                2  1iJBSr7s7jYXzM8EGcbK5b  Ingrid Michaelson;ZAYN   
3                3  6lfxq3CG4xtTiEg7opyCyx            Kina Grannis   
4                4  5vjLSffimiIP26QG5WcN2K        Chord Overstreet   
...            ...                     ...                     ...   
113995      113995  2C3TZjDRiAzdyViavDJ217           Rainy Lullaby   
113996      113996  1hIz5L4IB9hN3WRYPOCGPw           Rainy Lullaby   
113997      113997  6x8ZfSoqDjuNa5SVP5QjvX           Cesária Evora   
113998      113998  2e6sXL2bYv4bSz6VTdnfLs        Michael W. Smith   
113999      113999  2hETkH7cOfqmz3LqZDHZf5           Cesária Evora   

                                               album_name  \
0                                                  Comedy   
1      

In [54]:
df['track_genre'].unique()

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie-pop', 'indie', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop-film', 'pop',
       'pow

In [55]:
print("Df len: ", len(df.index))
print("Duplicates: ", len(df[df.duplicated(subset='track_id', keep=False)]))

df = df.drop_duplicates(subset='track_id', keep='first')
print("Df with no track_id duplicates: ", len(df.index))

Df len:  114000
Duplicates:  40900
Df with no track_id duplicates:  89741


In [56]:
df = df.drop_duplicates(subset='track_name', keep='first')
print("Df with no track_name duplicates: ", len(df.index))

Df with no track_name duplicates:  73609


In [57]:
df = df.rename(columns={'track_genre': 'genre'})
feat_vec = df.drop(columns=['Unnamed: 0', 'artists', 'album_name', 'track_name', 'duration_ms', 'explicit', 'key', 'mode', 'time_signature'])

In [58]:
common_genres = [
    'acoustic', 'alt-rock', 'alternative', 'ambient', 'blues', 'classical', 'country', 'dance', 
    'disco', 'electro', 'electronic', 'folk', 'funk', 'gospel', 'hip-hop', 'house', 'indie', 'jazz', 
    'latin', 'metal', 'pop', 'rock', 'soul', 'synth-pop', 'techno', 'trance'
]

feat_vec = feat_vec[feat_vec['genre'].isin(common_genres)]

feat_vec

Unnamed: 0,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre
0,5SuOikwiRyPMVoIQDJUgSV,73,0.676,0.4610,-6.746,0.1430,0.032200,0.000001,0.3580,0.7150,87.917,acoustic
1,4qPNDBW1i3p13qLCt0Ki3A,55,0.420,0.1660,-17.235,0.0763,0.924000,0.000006,0.1010,0.2670,77.489,acoustic
2,1iJBSr7s7jYXzM8EGcbK5b,57,0.438,0.3590,-9.734,0.0557,0.210000,0.000000,0.1170,0.1200,76.332,acoustic
3,6lfxq3CG4xtTiEg7opyCyx,71,0.266,0.0596,-18.515,0.0363,0.905000,0.000071,0.1320,0.1430,181.740,acoustic
4,5vjLSffimiIP26QG5WcN2K,82,0.618,0.4430,-9.681,0.0526,0.469000,0.000000,0.0829,0.1670,119.949,acoustic
...,...,...,...,...,...,...,...,...,...,...,...,...
110995,7sLknEg8aVr0m5ZuCja7b3,28,0.148,0.9930,-7.696,0.0922,0.009700,0.937000,0.0376,0.0928,140.001,trance
110996,6veycwSGozeHSFQ6fbr5dC,28,0.504,0.9850,-7.305,0.0504,0.000810,0.922000,0.1250,0.3830,139.978,trance
110997,0MLEzWJQcRkc5IMAqucPbV,28,0.474,0.9950,-4.265,0.0979,0.000166,0.369000,0.1500,0.0634,150.002,trance
110998,0cRNPYxzXLNLQd1g4kKYS6,28,0.416,0.9810,-3.653,0.0943,0.000079,0.928000,0.1870,0.0662,150.054,trance


In [59]:
genre_list = feat_vec['genre'].unique().tolist()
print(genre_list)

['acoustic', 'alt-rock', 'alternative', 'ambient', 'blues', 'classical', 'country', 'dance', 'disco', 'electro', 'electronic', 'folk', 'funk', 'gospel', 'hip-hop', 'house', 'indie', 'jazz', 'latin', 'metal', 'pop', 'rock', 'soul', 'synth-pop', 'techno', 'trance']


In [61]:
for item in genre_list:
    feat_vec['genre_'+item] = feat_vec['genre'].apply(lambda genre: 1 if genre == item else 0)

feat_vec

Unnamed: 0,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance
0,5SuOikwiRyPMVoIQDJUgSV,73,0.676,0.4610,-6.746,0.1430,0.032200,0.000001,0.3580,0.7150,87.917,acoustic,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4qPNDBW1i3p13qLCt0Ki3A,55,0.420,0.1660,-17.235,0.0763,0.924000,0.000006,0.1010,0.2670,77.489,acoustic,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1iJBSr7s7jYXzM8EGcbK5b,57,0.438,0.3590,-9.734,0.0557,0.210000,0.000000,0.1170,0.1200,76.332,acoustic,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6lfxq3CG4xtTiEg7opyCyx,71,0.266,0.0596,-18.515,0.0363,0.905000,0.000071,0.1320,0.1430,181.740,acoustic,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5vjLSffimiIP26QG5WcN2K,82,0.618,0.4430,-9.681,0.0526,0.469000,0.000000,0.0829,0.1670,119.949,acoustic,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110995,7sLknEg8aVr0m5ZuCja7b3,28,0.148,0.9930,-7.696,0.0922,0.009700,0.937000,0.0376,0.0928,140.001,trance,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110996,6veycwSGozeHSFQ6fbr5dC,28,0.504,0.9850,-7.305,0.0504,0.000810,0.922000,0.1250,0.3830,139.978,trance,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110997,0MLEzWJQcRkc5IMAqucPbV,28,0.474,0.9950,-4.265,0.0979,0.000166,0.369000,0.1500,0.0634,150.002,trance,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110998,0cRNPYxzXLNLQd1g4kKYS6,28,0.416,0.9810,-3.653,0.0943,0.000079,0.928000,0.1870,0.0662,150.054,trance,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [62]:
feat_vec.drop('genre', axis=1, inplace=True)
feat_vec

Unnamed: 0,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance
0,5SuOikwiRyPMVoIQDJUgSV,73,0.676,0.4610,-6.746,0.1430,0.032200,0.000001,0.3580,0.7150,87.917,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,4qPNDBW1i3p13qLCt0Ki3A,55,0.420,0.1660,-17.235,0.0763,0.924000,0.000006,0.1010,0.2670,77.489,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1iJBSr7s7jYXzM8EGcbK5b,57,0.438,0.3590,-9.734,0.0557,0.210000,0.000000,0.1170,0.1200,76.332,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,6lfxq3CG4xtTiEg7opyCyx,71,0.266,0.0596,-18.515,0.0363,0.905000,0.000071,0.1320,0.1430,181.740,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,5vjLSffimiIP26QG5WcN2K,82,0.618,0.4430,-9.681,0.0526,0.469000,0.000000,0.0829,0.1670,119.949,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110995,7sLknEg8aVr0m5ZuCja7b3,28,0.148,0.9930,-7.696,0.0922,0.009700,0.937000,0.0376,0.0928,140.001,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110996,6veycwSGozeHSFQ6fbr5dC,28,0.504,0.9850,-7.305,0.0504,0.000810,0.922000,0.1250,0.3830,139.978,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110997,0MLEzWJQcRkc5IMAqucPbV,28,0.474,0.9950,-4.265,0.0979,0.000166,0.369000,0.1500,0.0634,150.002,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
110998,0cRNPYxzXLNLQd1g4kKYS6,28,0.416,0.9810,-3.653,0.0943,0.000079,0.928000,0.1870,0.0662,150.054,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [63]:
min_row = {'popularity': '0', 'loudness': '-60', 'tempo': '0'}
max_row = {'popularity': '100', 'loudness': '0', 'tempo': '250'}

min_row_df = pd.DataFrame([min_row])
max_row_df = pd.DataFrame([max_row])

feat_vec = pd.concat([feat_vec, min_row_df], ignore_index=True)
feat_vec = pd.concat([feat_vec, max_row_df], ignore_index=True)

# scale popularity, loudness, and tempo features to 0-1
scale = ['popularity', 'loudness', 'tempo']
scaler = MinMaxScaler()
feat_vec[scale] = scaler.fit_transform(feat_vec[scale])

feat_vec = feat_vec.iloc[:-2]

feat_vec

Unnamed: 0,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance
0,5SuOikwiRyPMVoIQDJUgSV,0.73,0.676,0.4610,0.861422,0.1430,0.032200,0.000001,0.3580,0.7150,0.351668,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4qPNDBW1i3p13qLCt0Ki3A,0.55,0.420,0.1660,0.691755,0.0763,0.924000,0.000006,0.1010,0.2670,0.309956,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1iJBSr7s7jYXzM8EGcbK5b,0.57,0.438,0.3590,0.813089,0.0557,0.210000,0.000000,0.1170,0.1200,0.305328,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6lfxq3CG4xtTiEg7opyCyx,0.71,0.266,0.0596,0.671050,0.0363,0.905000,0.000071,0.1320,0.1430,0.726960,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5vjLSffimiIP26QG5WcN2K,0.82,0.618,0.4430,0.813947,0.0526,0.469000,0.000000,0.0829,0.1670,0.479796,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12303,7sLknEg8aVr0m5ZuCja7b3,0.28,0.148,0.9930,0.846056,0.0922,0.009700,0.937000,0.0376,0.0928,0.560004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12304,6veycwSGozeHSFQ6fbr5dC,0.28,0.504,0.9850,0.852380,0.0504,0.000810,0.922000,0.1250,0.3830,0.559912,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12305,0MLEzWJQcRkc5IMAqucPbV,0.28,0.474,0.9950,0.901554,0.0979,0.000166,0.369000,0.1500,0.0634,0.600008,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
12306,0cRNPYxzXLNLQd1g4kKYS6,0.28,0.416,0.9810,0.911454,0.0943,0.000079,0.928000,0.1870,0.0662,0.600216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [64]:
feat_vec.to_csv('DATASET_TO_IMPORT.csv', sep='\t')

In [65]:
feat_vec.columns

Index(['track_id', 'popularity', 'danceability', 'energy', 'loudness',
       'speechiness', 'acousticness', 'instrumentalness', 'liveness',
       'valence', 'tempo', 'genre_acoustic', 'genre_alt-rock',
       'genre_alternative', 'genre_ambient', 'genre_blues', 'genre_classical',
       'genre_country', 'genre_dance', 'genre_disco', 'genre_electro',
       'genre_electronic', 'genre_folk', 'genre_funk', 'genre_gospel',
       'genre_hip-hop', 'genre_house', 'genre_indie', 'genre_jazz',
       'genre_latin', 'genre_metal', 'genre_pop', 'genre_rock', 'genre_soul',
       'genre_synth-pop', 'genre_techno', 'genre_trance'],
      dtype='object')

In [66]:
song_dataset_cosine_sim = feat_vec.drop('track_id', axis=1)

In [67]:
song_dataset_cosine_sim.columns

Index(['popularity', 'danceability', 'energy', 'loudness', 'speechiness',
       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
       'genre_acoustic', 'genre_alt-rock', 'genre_alternative',
       'genre_ambient', 'genre_blues', 'genre_classical', 'genre_country',
       'genre_dance', 'genre_disco', 'genre_electro', 'genre_electronic',
       'genre_folk', 'genre_funk', 'genre_gospel', 'genre_hip-hop',
       'genre_house', 'genre_indie', 'genre_jazz', 'genre_latin',
       'genre_metal', 'genre_pop', 'genre_rock', 'genre_soul',
       'genre_synth-pop', 'genre_techno', 'genre_trance'],
      dtype='object')

# Recommendation System

### functions

#### Video datas

In [17]:
def get_video_data(query: str = None, video_id: str = None):
    YT_KEY = os.getenv('GOOGLE_TOKEN')

    if not YT_KEY:
        raise ValueError("Google API key is missing. Please set 'GOOGLE_TOKEN' in the environment variables.")

    base_url = 'https://www.googleapis.com/youtube/v3'

    if video_id:
        url = f'{base_url}/videos'
        params = {
            'key': YT_KEY,
            'id': video_id,
            'part': 'snippet,statistics,contentDetails',
        }
    elif query:
        url = f'{base_url}/search'
        params = {
            'key': YT_KEY,
            'q': query,
            'part': 'snippet',
            'type': 'video',
            'maxResults': 1
        }
    else:
        raise ValueError("Devi fornire un video_id o una query!")

    response = requests.get(url, params=params)

    if response.status_code == 200:
        data = response.json()
        if video_id:
            if 'items' in data and len(data['items']) > 0:
                video_info = data['items'][0]
                return {
                    'video_id': video_id,
                    'title': video_info['snippet']['title'],
                    'description': video_info['snippet']['description'],
                    'tags': video_info['snippet'].get('tags', []),
                    'view_count': video_info['statistics']['viewCount'],
                    'like_count': video_info['statistics'].get('likeCount', 'N/A'),
                    'comment_count': video_info['statistics'].get('commentCount', 'N/A'),
                    'duration': video_info['contentDetails']['duration']
                }
        elif query:
            if 'items' in data and len(data['items']) > 0:
                first_video = data['items'][0]
                video_id = first_video['id']['videoId']
                return {
                    'video_id': video_id,
                    'title': first_video['snippet']['title'],
                    'description': first_video['snippet']['description'],
                    'channel_title': first_video['snippet']['channelTitle'],
                    'video_link': f'https://www.youtube.com/watch?v={video_id}'
                }
    else:
        print(f"Errore durante la richiesta: {response.status_code}")
    return None


def get_video_by_id(video_id):
    return get_video_data(video_id=video_id)


def get_video_by_name(query):
    return get_video_data(query=query)

####  Estrazione genere dalle canzoni

In [18]:
def get_artist_genre(artist_name, dataset_genres, spotify):
    if not artist_name or artist_name.isspace():
        return []
    
    result = spotify.search(q=artist_name, type='artist', limit=1)
    genres = []
    if result['artists']['items']:
        genres = result['artists']['items'][0].get('genres', [])
    
    return genres

def get_song_info(song_id, spotify):
    try:
        track_info = spotify.track(song_id)
        
        song_data = {
            "name": track_info["name"],
            "artists": [artist["name"] for artist in track_info["artists"]],
            "album": track_info["album"]["name"],
            "link": track_info["external_urls"]["spotify"],
        }
        return song_data

    except Exception as e:
        print(f"Errore durante il recupero delle informazioni della canzone: {e}")
        return None

def get_client():
    CLIENT_ID = os.getenv('SPOTIFY_CLIENT_ID')
    CLIENT_SECRET = os.getenv('SPOTIFY_CLIENT_SECRET')

    # Autenticazione con Spotify
    client_credentials_manager = SpotifyClientCredentials(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET
    )

    return spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [19]:
def extract_person_from_video_data(video_data):
    nlp = spacy.load('en_core_web_sm')
    text_to_search = video_data['title'] + " " + video_data['description'] + " " + " ".join(video_data['tags'])
    doc = nlp(text_to_search)

    person_entities = [ent.text for ent in doc.ents if ent.label_ == 'PERSON']

    return person_entities if person_entities else None

def dataset_mapping(genres):
    most_similar_genre = None
    highest_similarity = 0

    for genre in genres:
        best_match = process.extractOne(genre, genre_list)
        if best_match:
            similarity_score = best_match[1]
            if similarity_score > highest_similarity:
                highest_similarity = similarity_score
                most_similar_genre = best_match[0]

    return most_similar_genre

def get_genre_df(artists, video_data):

    def get_from_artist(artists):
        if not artists:
            return None
        
        genres = []
        for artist in artists:
            raw_genre = get_artist_genre(artist, genre_list, spotify=get_client())
            if raw_genre:
                artist_genre = dataset_mapping(raw_genre)
                
                if artist_genre:
                    genres.append(artist_genre)
            
        genre_counts = Counter(genres)
        return genre_counts.most_common(1)[0][0] if genre_counts else None

    genre = get_from_artist(artists)
    if not genre:
        genre = get_from_artist(extract_person_from_video_data(video_data))

    genre_df = pd.DataFrame() 

    print("\tGenre: ", genre)

    # encoding
    genre_encoding = {f'genre_{g}': [1 if g == genre else 0] for g in genre_list}
    genre_df = pd.DataFrame(genre_encoding)

    return genre_df

####  Estrazione song stats

In [20]:
def get_popularity_score(video_info):
    view_count = int(video_info['view_count'])
    like_count = int(video_info['like_count'])
    comment_count = int(video_info['comment_count'])

    max_view_count = 100000  
    max_like_count = 1000    
    max_comment_count = 100 

    
    normalized_view_count = view_count / max_view_count
    normalized_like_count = like_count / max_like_count
    normalized_comment_count = comment_count / max_comment_count

    
    normalized_view_count = min(normalized_view_count, 1)
    normalized_like_count = min(normalized_like_count, 1)
    normalized_comment_count = min(normalized_comment_count, 1)

    
    weight_view_count = 0.2
    weight_like_count = 0.3
    weight_comment_count = 0.25

    
    popularity_score = (
        weight_view_count * normalized_view_count +
        weight_like_count * normalized_like_count +
        weight_comment_count * normalized_comment_count
    )

    
    return round(popularity_score, 2)

def get_song_stats(song_path, video_data):
    music_extractor = MusicExtractor(
        lowlevelStats=['mean', 'stdev'],
        rhythmStats=['mean', 'stdev'],
        tonalStats=['mean', 'stdev']
    )
    
	
    features, _ = music_extractor(song_path)
    
    stats = {
        "danceability": features["rhythm.danceability"], #0 to 3 
        "energy": features["lowlevel.spectral_energy.mean"],
        "loudness": features["lowlevel.average_loudness"], # 0 to 1
        "speechiness": features["lowlevel.spectral_entropy.mean"],
        "acousticness": features["lowlevel.melbands_flatness_db.mean"],
        "instrumentalness": features["lowlevel.pitch_salience.mean"],
        "liveness": features["lowlevel.spectral_flux.mean"],
        "valence": features["tonal.chords_strength.mean"],
        "tempo": features["rhythm.bpm"],
        "popularity": get_popularity_score(video_data)
    }
    
    return pd.DataFrame([stats])

#### Download youtube mp3

In [21]:
def convert_to_optimal_codec(song_path, output_path):
    print("\t convertin codec")
    os.system(f"ffmpeg -i {song_path} -acodec libmp3lame {output_path} -y")

def download_mp3(song):
	id = song['Yt_Id']
	link = song['Yt_Link']
      
	raw_path = f'./songs/{id}/raw_song.mp3'
	output_path = f'./songs/{id}/song.mp3'
	ydl_opts = {
		'format': 'bestaudio/best',
		'extractaudio': True,
		'outtmpl': raw_path,
		'noplaylist': True
	}

	try:
		with yt_dlp.YoutubeDL(ydl_opts) as ydl:
			info_dict = ydl.extract_info(link, download=True)
			song_name = info_dict['title']

			convert_to_optimal_codec(raw_path, output_path)

			return output_path
	except Exception as e:
		raise e

#### Df creation

In [29]:
from concurrent.futures import ThreadPoolExecutor

def process_song(song):
	video_id = song['Yt_Id']
	song_path = song['Song_Path']
	artists = song['Artists']

	print(f"{song_path}")

	video_data = get_video_by_id(video_id)
	genre_df = get_genre_df(artists, video_data)
	song_stat_df = get_song_stats(song_path, video_data)

	song_data_df = pd.concat([song_stat_df, genre_df], axis=1)

	if not os.path.exists('./df'):
		os.makedirs('./df')

	song_data_df.to_csv(f'./df/{video_id}_df.csv', index=False)


def extract_songs_data(songs):
    with ThreadPoolExecutor() as executor:
        list(executor.map(process_song, songs))


In [23]:
def get_recommendation(dataset, song_df, genres, recommendation_count):
		similarity_scores = cosine_similarity(dataset, song_df)

		dataset['similarity_score'] = similarity_scores

		# fai il coso dei generi
		return dataset \
			.sort_values(by='similarity_score', ascending=False) \
			.head(recommendation_count) \
			['track_id']

### Get Recommendations

In [24]:
songs = [
    {
        'Yt_Link': 'https://www.youtube.com/watch?v=unRjK82bDLw',
        'Yt_Id': 'unRjK82bDLw',
        'Artists': [
            'Marco Mengoni'
        ]
    },
    {
        'Yt_Link': 'https://www.youtube.com/watch?v=E3sP6AXKuYg',
        'Yt_Id': 'E3sP6AXKuYg',
        'Artists': [
            'David Gilmour'
        ]
    },
    {
        'Yt_Link': 'https://www.youtube.com/watch?v=nVa0GPeuVk4',
        'Yt_Id': 'nVa0GPeuVk4',
        'Artists': [
            'Olly', 
            'Angelina Mango', 
            'JVLI'
        ]
    },
    {
        'Yt_Link': 'https://www.youtube.com/watch?v=u-cx3TN7Sho',
        'Yt_Id': 'u-cx3TN7Sho',
        'Artists': [
            'Rose Villai',
            'Guè'
        ]
    },
    {
        'Yt_Link': 'https://www.youtube.com/watch?v=XBwGUhdA7m4',
        'Yt_Id': 'XBwGUhdA7m4',
        'Artists': [
            'Tamango'
        ]
    }
]

In [25]:
def download_songs_mp3(songs):
    with ThreadPoolExecutor() as executor:
        list(executor.map(download_mp3, songs))


download_songs_mp3(songs)

[youtube] Extracting URL: https://www.youtube.com/watch?v=u-cx3TN7Sho
[youtube] Extracting URL: https://www.youtube.com/watch?v=nVa0GPeuVk4
[youtube] Extracting URL: https://www.youtube.com/watch?v=unRjK82bDLw
[youtube] Extracting URL: https://www.youtube.com/watch?v=E3sP6AXKuYg
[youtube] nVa0GPeuVk4: Downloading webpage
[youtube] u-cx3TN7Sho: Downloading webpage
[youtube] Extracting URL: https://www.youtube.com/watch?v=XBwGUhdA7m4
[youtube] unRjK82bDLw: Downloading webpage
[youtube] E3sP6AXKuYg: Downloading webpage
[youtube] XBwGUhdA7m4: Downloading webpage
[youtube] XBwGUhdA7m4: Downloading tv player API JSON
[youtube] E3sP6AXKuYg: Downloading tv player API JSON
[youtube] u-cx3TN7Sho: Downloading tv player API JSON
[youtube] unRjK82bDLw: Downloading tv player API JSON
[youtube] XBwGUhdA7m4: Downloading ios player API JSON
[youtube] nVa0GPeuVk4: Downloading tv player API JSON
[youtube] E3sP6AXKuYg: Downloading ios player API JSON
[youtube] u-cx3TN7Sho: Downloading ios player API JSON


ffmpeg version 5.1.6-0+deb12u1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

[download] 100% of    9.67MiB in 00:00:01 at 5.00MiB/s   
[download] 100% of    3.26MiB in 00:00:02 at 1.33MiB/s   	 convertin codec
[download] 100.0% of    3.91MiB at    2.61MiB/s ETA 00:00
[download] 100% of    3.91MiB in 00:00:01 at 2.49MiB/s   
	 convertin codec
	 convertin codec


ffmpeg version 5.1.6-0+deb12u1 Copyright (c) 2000-2024 the FFmpeg developers
  built with gcc 12 (Debian 12.2.0-14)
  configuration: --prefix=/usr --extra-version=0+deb12u1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libglslang --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librist --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtheora --enable-libtwolame --enable-libvidstab --enab

In [26]:
for song in songs:
    song['Song_Path'] = f'./songs/{song["Yt_Id"]}/song.mp3'
    print(song['Song_Path'])

./songs/unRjK82bDLw/song.mp3
./songs/E3sP6AXKuYg/song.mp3
./songs/nVa0GPeuVk4/song.mp3
./songs/u-cx3TN7Sho/song.mp3
./songs/XBwGUhdA7m4/song.mp3


In [35]:
extract_songs_data(songs)

./songs/unRjK82bDLw/song.mp3
./songs/E3sP6AXKuYg/song.mp3
./songs/nVa0GPeuVk4/song.mp3
./songs/u-cx3TN7Sho/song.mp3
./songs/XBwGUhdA7m4/song.mp3
	Genre:  dance
	Genre:  rock


[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done
[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done


	Genre:  latin


[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done


	Genre:  None


[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done


	Genre:  latin


[   INFO   ] MusicExtractor: Read metadata
[   INFO   ] MusicExtractor: Compute md5 audio hash, codec, length, and EBU 128 loudness
[   INFO   ] MusicExtractor: Replay gain
[   INFO   ] MusicExtractor: Compute audio features
[   INFO   ] MusicExtractor: Compute aggregation
[   INFO   ] All done


In [36]:
song_df = pd.concat(
    [pd.read_csv(f"./df/{song['Yt_Id']}_df.csv") for song in songs],
    ignore_index=True,
)

In [82]:
song_df

Unnamed: 0.1,Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance
0,0,1.029173,0.023112,0.638778,7.545165,0.202663,0.542799,0.066513,0.544834,125.858482,0.75,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,1.183636,0.03197,0.577944,7.14276,0.268808,0.521356,0.076659,0.525631,123.47258,0.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1.16821,0.03479,0.764446,7.40827,0.230343,0.538284,0.077632,0.527097,96.839523,0.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,1.193732,0.039334,0.867407,7.236877,0.259844,0.470456,0.09694,0.51417,140.002106,0.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,0,0.943274,0.052897,0.514586,7.031957,0.30645,0.455911,0.089969,0.497854,163.967667,0.57,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [37]:
column_averages = song_df.mean()

# Create a new DataFrame for the averages and totals
averages_cosine_sim = pd.DataFrame([column_averages], index=['Average'])

averages_cosine_sim = averages_cosine_sim.loc[:, ~averages_cosine_sim.columns.str.contains('^Unnamed')]

In [38]:
averages_cosine_sim

Unnamed: 0,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,popularity,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance
Average,1.103605,0.036421,0.672632,7.273006,0.253622,0.505763,0.081543,0.521917,130.028072,0.714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.0,0.0,0.2,0.0,0.0,0.0,0.0


In [68]:
similarity_scores = cosine_similarity(song_dataset_cosine_sim, averages_cosine_sim)
 
feat_vec['similarity_score'] = similarity_scores
 
top_similarities = feat_vec.sort_values(by='similarity_score', ascending=False).head(5)

top_similarities

Unnamed: 0,track_id,popularity,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,genre_acoustic,genre_alt-rock,genre_alternative,genre_ambient,genre_blues,genre_classical,genre_country,genre_dance,genre_disco,genre_electro,genre_electronic,genre_folk,genre_funk,genre_gospel,genre_hip-hop,genre_house,genre_indie,genre_jazz,genre_latin,genre_metal,genre_pop,genre_rock,genre_soul,genre_synth-pop,genre_techno,genre_trance,similarity_score
1152,19zJggaQrCRG3AibCbjXcJ,0.0,0.732,0.348,0.694295,0.0476,0.0484,0.185,0.197,0.963,0.504568,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.545768
764,76kBXoobhmMlzN5mS1CD3b,0.27,0.389,0.494,0.79617,0.131,0.186,0.000138,0.0557,0.909,0.347488,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.544897
3562,367ORcqkmIh0KCnb6KqREY,0.0,0.523,0.224,0.647579,0.0403,0.771,0.00984,0.0518,0.961,0.440628,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.539024
904,73Zqgqez3FbGjeGRhSkw0O,0.26,0.555,0.508,0.76267,0.108,0.361,0.000968,0.12,0.961,0.488148,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.538232
5370,4v2K1ZcRXUc87CLtEXI3Pf,0.24,0.617,0.568,0.739959,0.0335,0.0429,0.000254,0.0422,0.949,0.490248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.536436


In [69]:
recommended_songs = [get_song_info(track_id, get_client()) for track_id in top_similarities['track_id']]

recommended_songs

[{'name': 'Si Tu No Estas Aqui',
  'artists': ['La Ley'],
  'album': 'Rock un sentimiento vol. I',
  'link': 'https://open.spotify.com/track/19zJggaQrCRG3AibCbjXcJ'},
 {'name': 'Torches Are Calling',
  'artists': ['Days N Daze'],
  'album': 'Ward Off the Vultures',
  'link': 'https://open.spotify.com/track/76kBXoobhmMlzN5mS1CD3b'},
 {'name': 'Nutcracker Suite, Op. 71a: Chinese Dance (Tea)',
  'artists': ['Pyotr Ilyich Tchaikovsky',
   'Paris Conservatoire Orchestra',
   'Anatole Fistoulari'],
  'album': 'Clásica para los niños de hoy',
  'link': 'https://open.spotify.com/track/367ORcqkmIh0KCnb6KqREY'},
 {'name': 'Kick Your Lawyer in the Face',
  'artists': ['Days N Daze'],
  'album': 'Ward Off the Vultures',
  'link': 'https://open.spotify.com/track/73Zqgqez3FbGjeGRhSkw0O'},
 {'name': 'Love Is in the Air',
  'artists': ['John Paul Young'],
  'album': 'Classic Hits',
  'link': 'https://open.spotify.com/track/4v2K1ZcRXUc87CLtEXI3Pf'}]