## MUSIC RECOMMENDATION SYSTEM USING HYBRID RECOMMENDATION TECHNIQUE

In [2]:
import requests

# The base64 library 
# in Python provides functions for encoding binary data into printable ASCII characters and decoding such encodings back to binary data
import base64
import pandas as pd

# spotipy library is responsible for collecting music data from any playlist on spotify
import spotipy
from spotipy.oauth2 import SpotifyOAuth

In [2]:
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_row', 10000)

### CLIENT_ID AND CLIENT_SECRET

In [3]:
# Store the clientId and clientSecret in variables 'CLIENT_ID' and 'CLIENT_SECRET'

In [3]:
CLIENT_ID = '198da2205f494f7bb7788e9f927b7ee3'

CLIENT_SECRET = '511e99423d3846e7bedf70fd7caacc4b'

### ENCODE CLIENT_ID AND CLIENT_SECRET

In [4]:
client_credentials = f"{CLIENT_ID}:{CLIENT_SECRET}"

encoded_client_credentials = base64.b64encode(client_credentials.encode())

In [5]:
client_credentials

'198da2205f494f7bb7788e9f927b7ee3:511e99423d3846e7bedf70fd7caacc4b'

In [6]:
encoded_client_credentials

b'MTk4ZGEyMjA1ZjQ5NGY3YmI3Nzg4ZTlmOTI3YjdlZTM6NTExZTk5NDIzZDM4NDZlN2JlZGY3MGZkN2NhYWNjNGI='

### REQUESTING THE ACCESS TOKEN

In [8]:
# The access token serves as a temporary authorization credential, allowing the code to make authenticated requests to the Spotify API on behalf of the application.

In [5]:
token_URL = 'https://accounts.spotify.com/api/token'

In [6]:
headers = {
    'Authorization' : f'Basic {encoded_client_credentials.decode()}'
}

In [7]:
data = {
    'grant_type' : 'client_credentials'
}

In [14]:
# sending a post request

In [8]:
response = requests.post(token_URL, data = data, headers = headers)

In [20]:
response

<Response [200]>

In [9]:
response.json()

{'access_token': 'BQB5eFDgN92Ayq68D7NTYOa0S8-UQs516iiQNE6ZgSjOvpDO9qdbMpyVT0sbyBZrwHq3wzAukKzo_h0Aq-8UXgu4yVGb7HFczqvu9-jyq43syqQtvJA',
 'token_type': 'Bearer',
 'expires_in': 3600}

In [22]:
response.status_code

200

str

In [23]:
if response.status_code == 200:
    access_token = response.json()['access_token']
    print(access_token)
    print('Access token obtained successfully...!')

else:
    print('Error obtaining access token...!')
    exit()

BQAO5bLtvWEAHH9sW4cjgD3Eg_FDSzBO0cLK8B9cplDl7pI9sb2hf7fgcSuK8IIFH0onT2H9BvqBgQOR-MIKUMue6v1xQJvth0BFYQw6My6ZnLrlaE0
Access token obtained successfully...!


### BUILDING A FUNCTION TO EXTRACT THE DATA FROM THE SPOTIFY PLATFROM

In [18]:
# defining a function responsible for collecting music data from any playlist on Spotify

In [24]:
def get_trending_playlist_data(playlist_id, access_token):

    # setting up Spotipy with access token
    sp = spotipy.Spotify(auth = access_token)

    # fetching the tracks from the playlist
    playlist_tracks = sp.playlist_tracks(playlist_id, fields = 'items(track(id, name, artists, album(id, name)))')


    # extracting relevant information and storing them in a list of dictionaries

    music_data = []

    for track_info in playlist_tracks['items']:
        
        track = track_info['track']

        track_name = track['name']

        artists = ', '.join([artist['name'] for artist in track['artists']])

        album_name = track['album']['name']

        album_id = track['album']['id']

        track_id = track['id']


    # fetching audio features for the track

        audio_features = sp.audio_features(track_id)[0] if track_id != 'Not available' else None

    # fetching release date of the album

        try:
            album_info = sp.album(album_id) if album_id != 'Not available' else None
            release_date = album_info['release_date'] if album_info else None

        except:
            release_date = None


    # fetching popularity of the track

        try:
            track_info = sp.track(track_id) if track_id != 'Not available' else None
            popularity = track_info['popularity'] if track_info else None

        except:
            popularity = None

    # fetching additional track information

        track_data = {
            'Track_Name' : track_name,
            'Artists' : artists,
            'Album_Name' : album_name,
            'Album_ID' : album_id,
            'Track_ID' : track_id,
            'Popularity' : popularity,
            'Release_date' : release_date,
            'Duration(ms)' : audio_features['duration_ms'] if audio_features else None,
            'Explicit' : track_info.get('explicit', None),
            'External_URLs' : track_info.get('external_urls', {}).get('spotify', None),
            'Danceability' : audio_features['danceability'] if audio_features else None,
            'Energy' : audio_features['energy'] if audio_features else None,
            'Key' : audio_features['key'] if audio_features else None,
            'Loudness' : audio_features['loudness'] if audio_features else None,
            'Mode' : audio_features['mode'] if audio_features else None,
            'Speechiness' : audio_features['speechiness'] if audio_features else None,
            'Acousticness' : audio_features['acousticness'] if audio_features else None,
            'Instrumentalness' : audio_features['instrumentalness'] if audio_features else None,
            'Liveness' : audio_features['liveness'] if audio_features else None,
            'Valence' : audio_features['valence'] if audio_features else None,
            'Tempo' : audio_features['tempo'] if audio_features else None  
        }


        music_data.append(track_data)


# creating dataframe from the above dictionary

    df =  pd.DataFrame(music_data)

    return df

In [25]:
access_token

'BQAO5bLtvWEAHH9sW4cjgD3Eg_FDSzBO0cLK8B9cplDl7pI9sb2hf7fgcSuK8IIFH0onT2H9BvqBgQOR-MIKUMue6v1xQJvth0BFYQw6My6ZnLrlaE0'

In [26]:
playlist_id = '37i9dQZF1DX76Wlfdnj7AP'

In [27]:
music_df = get_trending_playlist_data(playlist_id , access_token)

In [29]:
music_df.head()

Unnamed: 0,Track_Name,Artists,Album_Name,Album_ID,Track_ID,Popularity,Release_date,Duration(ms),Explicit,External_URLs,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo
0,CARNIVAL,"¥$, Kanye West, Ty Dolla $ign, Rich The Kid, P...",VULTURES 1,30zwjSQEodaUXCn11nmiVF,3w0w2T288dec0mgeZZqoNN,94,2024-02-09,264324,True,https://open.spotify.com/track/3w0w2T288dec0mg...,0.594,0.811,1,-5.746,1,0.159,0.189,0.0,0.339,0.311,148.144
1,redrum,21 Savage,american dream,2RRYaYHY7fIIdvFlvgb5vq,52eIcoLUM25zbQupAZYoFh,91,2024-01-12,270698,True,https://open.spotify.com/track/52eIcoLUM25zbQu...,0.625,0.733,2,-8.757,1,0.0488,0.00598,0.000983,0.375,0.244,172.09
2,Type Shit,"Future, Metro Boomin, Travis Scott, Playboi Carti",WE DON'T TRUST YOU,4iqbFIdGOTzXeDtt9owjQn,28drn6tQo95MRvO0jQEo5C,91,2024-03-22,228267,True,https://open.spotify.com/track/28drn6tQo95MRvO...,0.64,0.552,2,-5.679,1,0.129,0.0215,0.0,0.119,0.112,144.941
3,Lovin On Me,Jack Harlow,Lovin On Me,6VCO0fDBGbRW8mCEvV95af,4xhsWYTOGcal8zt0J161CU,91,2023-11-10,138411,True,https://open.spotify.com/track/4xhsWYTOGcal8zt...,0.943,0.558,2,-4.911,1,0.0568,0.0026,2e-06,0.0937,0.606,104.983
4,Tell Ur Girlfriend,Lay Bankz,Tell Ur Girlfriend,0S8DGX9LmBkRSVi3ywcCOT,3lMzT16MjAKKXF7pSZn13B,91,2024-02-07,124444,True,https://open.spotify.com/track/3lMzT16MjAKKXF7...,0.866,0.741,7,-4.66,1,0.245,0.0933,0.0,0.0297,0.614,135.07


### EXPLORING THE DATA

In [30]:
music_df.shape

(100, 21)

In [31]:
music_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Track_Name        100 non-null    object 
 1   Artists           100 non-null    object 
 2   Album_Name        100 non-null    object 
 3   Album_ID          100 non-null    object 
 4   Track_ID          100 non-null    object 
 5   Popularity        100 non-null    int64  
 6   Release_date      100 non-null    object 
 7   Duration(ms)      100 non-null    int64  
 8   Explicit          100 non-null    bool   
 9   External_URLs     100 non-null    object 
 10  Danceability      100 non-null    float64
 11  Energy            100 non-null    float64
 12  Key               100 non-null    int64  
 13  Loudness          100 non-null    float64
 14  Mode              100 non-null    int64  
 15  Speechiness       100 non-null    float64
 16  Acousticness      100 non-null    float64
 17

In [32]:
music_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Popularity,100.0,78.86,5.868973,70.0,75.0,78.0,83.0,94.0
Duration(ms),100.0,180562.54,44790.407022,97393.0,148430.0,171777.0,203162.75,319192.0
Danceability,100.0,0.72954,0.119274,0.454,0.6355,0.738,0.81925,0.948
Energy,100.0,0.74325,0.140636,0.388,0.6565,0.76,0.86125,0.972
Key,100.0,5.43,3.685228,0.0,1.75,6.0,8.25,11.0
Loudness,100.0,-5.53605,1.961216,-12.727,-6.40625,-5.3955,-4.498,-0.424
Mode,100.0,0.58,0.496045,0.0,0.0,1.0,1.0,1.0
Speechiness,100.0,0.125231,0.108772,0.03,0.04735,0.07795,0.17525,0.491
Acousticness,100.0,0.089471,0.122845,0.000307,0.0171,0.04485,0.11275,0.848
Instrumentalness,100.0,0.044521,0.172892,0.0,0.0,1e-05,0.00128,0.901


In [35]:
music_df.duplicated().sum()

0

In [36]:
music_df.isnull().sum()

Track_Name          0
Artists             0
Album_Name          0
Album_ID            0
Track_ID            0
Popularity          0
Release_date        0
Duration(ms)        0
Explicit            0
External_URLs       0
Danceability        0
Energy              0
Key                 0
Loudness            0
Mode                0
Speechiness         0
Acousticness        0
Instrumentalness    0
Liveness            0
Valence             0
Tempo               0
dtype: int64

In [37]:
music_df.dtypes

Track_Name           object
Artists              object
Album_Name           object
Album_ID             object
Track_ID             object
Popularity            int64
Release_date         object
Duration(ms)          int64
Explicit               bool
External_URLs        object
Danceability        float64
Energy              float64
Key                   int64
Loudness            float64
Mode                  int64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
dtype: object

In [38]:
music_df.columns

Index(['Track_Name', 'Artists', 'Album_Name', 'Album_ID', 'Track_ID',
       'Popularity', 'Release_date', 'Duration(ms)', 'Explicit',
       'External_URLs', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode',
       'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness',
       'Valence', 'Tempo'],
      dtype='object')

In [39]:
data = music_df.copy()

In [40]:
data.shape

(100, 21)

### CREATING A FUNCTION TO CALCULATE WEIGHTED POPULARITY SCORES BASED ON RELEASE DATE

In [41]:
from datetime import datetime

In [42]:
def calculate_weighted_popularity(release_date):

    # converting the release date string to datetime object so that we can perform arithmetic operations on the dates

    release_date = datetime.strptime(release_date, '%Y-%m-%d')

    # calculating the time span between release date and today's date

    time_span = datetime.now() - release_date

    # calculating the weighted popularity score based on time span
    # recent released songs will be having higher weights

    # Adding 1 to the number of days ensures that the weight is never zero, even for very recent releases--
    # as this would lead to a division by zero error.
    weight = 1/(time_span.days + 1)

    return weight

### NORMALIZING THE MUSIC FEATURES

In [44]:
from sklearn.preprocessing import MinMaxScaler

In [45]:
scaler = MinMaxScaler()

In [46]:
music_features = data[['Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo']].values

In [47]:
music_features_scaled = scaler.fit_transform(music_features)

In [127]:
music_features_scaled = pd.DataFrame(music_features_scaled)

In [129]:
music_features_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0.283401,0.724315,0.090909,0.567423,1.0,0.279826,0.222596,0.0,0.406813,0.291912,0.731533
1,0.346154,0.590753,0.181818,0.322686,1.0,0.040781,0.006692,0.001091,0.454163,0.220139,0.933269
2,0.376518,0.280822,0.181818,0.572868,1.0,0.214751,0.025001,0.0,0.117454,0.078736,0.704549
3,0.989879,0.291096,0.181818,0.635292,1.0,0.058134,0.002705,2e-06,0.084177,0.607927,0.367919
4,0.834008,0.604452,0.636364,0.655694,1.0,0.466377,0.109701,0.0,0.0,0.616497,0.62139


###  RECOMMENDING MUSIC BASED ON AUDIO FEATURES

In [49]:
from sklearn.metrics.pairwise import cosine_similarity

In [40]:
# Creating a function to get content-based recommendations based on music audio features

In [88]:
def content_base_recommendations(input_song_name, num_recommendations = 5):

    if input_song_name not in music_df['Track_Name'].values:
        print(f"'{input_song_name}' not found in the dataset. Please enter a valid song name.")
        return

    # fetching the index of the input song in the music dataframe

    input_song_index = music_df[music_df['Track_Name'] == input_song_name].index[0]

    # calculating the similarity score based on music_features using cosine_similarity

    similarity_scores = cosine_similarity([music_features_scaled[input_song_index]], music_features_scaled)

    # fetching the indices of the most similar songs
    # sorting the indices of an array and reversing the array and slicing the array from first index excluding the input song itself
    similar_song_indices = similarity_scores.argsort()[0][::-1][1:num_recommendations + 1]

    # fetching the names of the most similar songs based on content-based filtering

    content_based_recommendations = music_df.iloc[similar_song_indices][['Track_Name', 'Artists', 'Album_Name', 'Release_date', 'Popularity']]

    return content_based_recommendations

In [90]:
output = content_base_recommendations('Lovin On Me')
output

Unnamed: 0,Track_Name,Artists,Album_Name,Release_date,Popularity
91,Sunshine (My Girl),Wuki,Sunshine (My Girl),2023-09-01,72
89,Rhyme Dust,"MK, Dom Dolla",Rhyme Dust,2023-02-24,72
93,Pink Venom,BLACKPINK,Pink Venom,2022-08-19,71
33,Belly Dancer,"Imanbek, BYOR",Belly Dancer,2022-02-18,81
85,Marianela (Que Pasa),"HUGEL, Merk & Kremont, Lirico En La Casa",Marianela (Que Pasa),2022-11-25,72


### RECOMMENDING MUSIC BASED ON POPULARITY

In [124]:
def hybrid_recommendation_fun(input_song_name, num_recommendations = 5, alpha = 0.5):

    if input_song_name not in music_df['Track_Name'].values:
        print(f"'{input_song_name}' not found in the dataset. Please enter a valid song name.")
        return

    # get the content-based-recommendations

    content_based_rec = content_base_recommendations(input_song_name, num_recommendations)

    # fetcing the popularity score of the input song

    popularity_score = music_df.loc[music_df['Track_Name'] == input_song_name, 'Popularity'].values[0]

    # calculating the weighted popularity score

    weighted_popularity_score = popularity_score * calculate_weighted_popularity(music_df.loc[music_df['Track_Name'] == input_song_name, 'Release_date'].values[0])

    content_based_df = content_based_rec
    
    # combining content-based and popularity-based recommendation based on weighted popularity
    

    hybrid_recommendations_df = pd.DataFrame({
        'Track_Name': [input_song_name],
        'Artists': [music_df.loc[music_df['Track_Name'] == input_song_name, 'Artists'].values[0]],
        'Album_Name': [music_df.loc[music_df['Track_Name'] == input_song_name, 'Album_Name'].values[0]],
        'Release_date': [music_df.loc[music_df['Track_Name'] == input_song_name, 'Release_date'].values[0]],
        'Popularity': [weighted_popularity_score]})

    # final_df = content_based_df.append(hybrid_recommendations_df)

    final_df = pd.concat([content_based_df, hybrid_recommendations_df])

    # print(hybrid_recommendations_df)
    # sorting the hybrid recommendations based on weighted-popularity-score
    final_df = final_df.sort_values(by = 'Popularity', ascending = False)

    # removing the input song from the recommendations

    final_df = final_df[final_df['Track_Name'] != input_song_name]

    return final_df

### FETCHING THE RECOMMENDATIONS

In [126]:
input_song_name = "Belly Dancer"

recommended_music = hybrid_recommendation_fun(input_song_name, num_recommendations = 5)

recommended_music

Unnamed: 0,Track_Name,Artists,Album_Name,Release_date,Popularity
39,Thunder,"Gabry Ponte, LUM!X, Prezioso",Thunder,2021-05-07,80.0
91,Sunshine (My Girl),Wuki,Sunshine (My Girl),2023-09-01,72.0
85,Marianela (Que Pasa),"HUGEL, Merk & Kremont, Lirico En La Casa",Marianela (Que Pasa),2022-11-25,72.0
93,Pink Venom,BLACKPINK,Pink Venom,2022-08-19,71.0
98,Murder On The Dancefloor - David Guetta Remix,"Sophie Ellis-Bextor, David Guetta",Murder On The Dancefloor (David Guetta Remix),2024-03-08,70.0


### SAVING THE MUSIC_FEATURES_DATAFRAME AND MUSIC_DATAFRAME AS A PICKLE FILES

In [130]:
import pickle

pickle.dump(music_features_scaled, open('artifacts/music_features_scaled.pkl', 'wb'))
pickle.dump(music_df, open('artifacts/music_df.pkl', 'wb'))