In [1]:
import requests
from dotenv import load_dotenv
import os
import json
import base64   
import pandas as pd
import librosa
import numpy as np
import lyricsgenius
import langdetect
import re
from joblib import Parallel, delayed
import string
from tqdm import tqdm
import tempfile
from datetime import datetime, timedelta
import dask
import dask.dataframe as dd
from dask.diagnostics import ProgressBar
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
from difflib import SequenceMatcher



# Helper Functions for testing other functions

In [3]:
# Root endpoint of the Last.fm REST API. HTTPS is used because lastfm_get()
# sends the API key as a query parameter, which plain HTTP would expose.
base_url = 'https://ws.audioscrobbler.com/2.0/'

def lastfm_get(payload, timeout=30):
    """Send a GET request to the Last.fm API and return the decoded JSON body.

    Args:
        payload: dict of method-specific query parameters, e.g.
            ``{'method': 'user.getinfo', 'user': 'someone'}``. It is left
            unmodified.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot block the caller indefinitely.

    Returns:
        Whatever ``response.json()`` yields for the request.

    Note:
        Relies on the module-level names ``base_url`` and ``lastfm_api_key``.
        NOTE(review): ``lastfm_api_key`` is not defined in the cells visible
        here -- confirm it is set (e.g. from the environment) before this runs.
    """
    headers = {'user-agent': 'DataCollectorBot'}
    # Build a fresh dict so the API key does not leak into the caller's payload.
    params = {**payload, 'api_key': lastfm_api_key, 'format': 'json'}
    response = requests.get(base_url, headers=headers, params=params, timeout=timeout)
    return response.json()

def get_user_info(user):
    """Call the Last.fm ``user.getinfo`` method for ``user`` and return the decoded response."""
    return lastfm_get({'method': 'user.getinfo', 'user': user})

def get_top_albums(user):
    """Call the Last.fm ``user.gettopalbums`` method for ``user`` and return the decoded response."""
    return lastfm_get({'method': 'user.gettopalbums', 'user': user})

def get_top_artists(user):
    """Call the Last.fm ``user.gettopartists`` method for ``user`` and return the decoded response."""
    return lastfm_get({'method': 'user.gettopartists', 'user': user})

def get_top_tracks(user):
    """Call the Last.fm ``user.gettoptracks`` method for ``user`` and return the decoded response."""
    return lastfm_get({'method': 'user.gettoptracks', 'user': user})



In [4]:
def list_to_df(data_list, columns):
    """Build a DataFrame with the given ``columns`` from ``data_list``.

    A falsy ``data_list`` (empty list, ``None``) yields an empty DataFrame
    that still carries the requested column labels.
    """
    if not data_list:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(data_list, columns=columns)
    
def get_top_albums_df(user):
    """Return the top albums of ``user`` as a DataFrame.

    Columns: 'Rank', 'Album Name', 'Artist', 'Play Count', taken from each
    entry of ``result['topalbums']['album']`` in the get_top_albums() response.
    """
    columns = ['Rank', 'Album Name', 'Artist', 'Play Count']
    response = get_top_albums(user)
    rows = [
        {
            'Rank': album['@attr']['rank'],
            'Album Name': album['name'],
            'Artist': album['artist']['name'],
            'Play Count': album['playcount'],
        }
        for album in response['topalbums']['album']
    ]
    return list_to_df(rows, columns)

def get_top_artists_df(user):
    """Return the top artists of ``user`` as a DataFrame.

    Columns: 'Rank', 'Artist', 'Play Count', taken from each entry of
    ``result['topartists']['artist']`` in the get_top_artists() response.
    """
    columns = ['Rank', 'Artist', 'Play Count']
    response = get_top_artists(user)
    rows = [
        {
            'Rank': artist['@attr']['rank'],
            'Artist': artist['name'],
            'Play Count': artist['playcount'],
        }
        for artist in response['topartists']['artist']
    ]
    return list_to_df(rows, columns)


def get_top_tracks_df(user):
    """Return the top tracks of ``user`` as a DataFrame.

    Columns: 'Rank', 'Track Name', 'Artist', 'Play Count', taken from each
    entry of ``result['toptracks']['track']`` in the get_top_tracks() response.
    """
    columns = ['Rank', 'Track Name', 'Artist', 'Play Count']
    response = get_top_tracks(user)
    rows = [
        {
            'Rank': track['@attr']['rank'],
            'Track Name': track['name'],
            'Artist': track['artist']['name'],
            'Play Count': track['playcount'],
        }
        for track in response['toptracks']['track']
    ]
    return list_to_df(rows, columns)

In [1]:
def get_combined_top_data(user, top_n=10):
    """Combine a user's top albums, artists and tracks into one table keyed by rank.

    Args:
        user: Last.fm user name forwarded to the get_top_*_df helpers.
        top_n: number of leading rows kept from each of the three tables.

    Returns:
        DataFrame whose columns are grouped under the top-level keys
        'Albums', 'Artists' and 'Tracks', aligned on the integer 'Rank',
        with 'Rank' restored as a regular column.
    """
    def _top_by_rank(frame):
        # .head() returns a slice of `frame`; writing a column into it can raise
        # SettingWithCopyWarning. .assign() builds a new frame instead.
        top = frame.head(top_n)
        return top.assign(Rank=top['Rank'].astype(int)).set_index('Rank')

    combined_df = pd.concat(
        [
            _top_by_rank(get_top_albums_df(user)),
            _top_by_rank(get_top_artists_df(user)),
            _top_by_rank(get_top_tracks_df(user)),
        ],
        axis=1,
        keys=['Albums', 'Artists', 'Tracks'],
    )

    return combined_df.reset_index()



# Audio 

Get the unique songs first so that we don't run the audio processing on duplicate songs, as it is a costly step.


In [3]:
import pandas as pd
from ast import literal_eval


def safe_eval(x):
    """Parse ``x`` as a Python literal, returning ``x`` unchanged if it cannot be parsed.

    Args:
        x: usually the string form of a literal (e.g. ``"['a', 'b']"``).
            Values that are not valid literal source (plain text, NaN,
            already-parsed lists, ...) are handed back as-is.

    Returns:
        The evaluated literal, or ``x`` itself when evaluation fails.
    """
    try:
        return literal_eval(x)
    # Only the failures literal_eval documents for malformed input are treated
    # as "not a literal"; KeyboardInterrupt/SystemExit are no longer swallowed.
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        return x
    
def preprocess_column(df, column_name):
    """Normalise one column of stringified lists into real Python objects.

    Missing values and empty strings become ``[]``; other strings are parsed
    with safe_eval(); anything else (including cells that already hold a
    list) is kept as-is, so running this twice on the same frame is safe.

    Args:
        df: DataFrame to process. The column is replaced in place.
        column_name: label of the column to normalise.

    Returns:
        The same ``df`` object, for chaining.
    """
    def _normalise(value):
        if isinstance(value, str):
            return [] if value == "" else safe_eval(value)
        # pd.isna() on a list returns an array whose truth value is ambiguous,
        # so only scalars are tested for missingness.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return []
        return value

    df[column_name] = df[column_name].apply(_normalise)
    return df

def extract_unique_songs(df, output_path="Unique_songs.xlsx"):
    """Collect the distinct songs found in 'Top 50 Songs' and 'Liked Songs'.

    Both columns are first normalised with preprocess_column(). A row
    contributes only when BOTH of its cells are lists after parsing.
    NOTE(review): that means a row with one unparseable cell is skipped
    entirely -- confirm this is intended.

    Args:
        df: DataFrame with 'Top 50 Songs' and 'Liked Songs' columns; the two
            columns are rewritten by preprocess_column().
        output_path: Excel file the result is written to.

    Returns:
        DataFrame with a single 'Unique Songs' column, in first-seen order.
    """
    df = preprocess_column(df, 'Top 50 Songs')
    df = preprocess_column(df, 'Liked Songs')

    all_songs = []
    for top_songs, liked_songs in zip(df['Top 50 Songs'], df['Liked Songs']):
        if isinstance(top_songs, list) and isinstance(liked_songs, list):
            all_songs.extend(top_songs)
            all_songs.extend(liked_songs)

    # dict.fromkeys de-duplicates while keeping first-seen order; set() ordering
    # of strings varies between interpreter runs, making the output file
    # non-reproducible.
    unique_songs = list(dict.fromkeys(all_songs))
    unique_songs_df = pd.DataFrame(unique_songs, columns=['Unique Songs'])

    unique_songs_df.to_excel(output_path, index=False)

    return unique_songs_df

To get the 30s audio files of the songs, we take two key inputs, the track name and the artist name, and then pick the ID of the most similar track
from the results that Deezer returns.

In [3]:
def similar(a, b):
    """Return the difflib ``SequenceMatcher`` similarity ratio of ``a`` and ``b`` (0.0 to 1.0)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

def get_highest_similarity_track(tracks, track_name, min_similarity=0.4):
    """Return the id of the track whose title is most similar to ``track_name``.

    Args:
        tracks: iterable of track dicts carrying 'title' and 'id' keys.
            ``None`` or an empty iterable yields ``None``.
        track_name: title to compare each candidate against with similar().
        min_similarity: the best score must be strictly greater than this
            value for a match to be accepted (default 0.4, the value that was
            previously hard-coded).

    Returns:
        The 'id' of the best-scoring track (the first one on ties), or
        ``None`` when nothing scores above ``min_similarity``.
    """
    scored = (
        # A missing or null title is compared as an empty string instead of
        # raising KeyError.
        (similar(track_name, track.get('title') or ''), track)
        for track in (tracks or [])
    )
    best_score, best_track = max(scored, key=lambda pair: pair[0], default=(0, None))

    if best_track is None or best_score <= min_similarity:
        return None
    return best_track.get('id')

def search_deezer_track(track_name, artist_name, timeout=10):
    """Search Deezer for a track and return the id of the best title match.

    Args:
        track_name: track title; also used to rank the search results.
        artist_name: artist name, sent as an ``artist:"..."`` filter.
        timeout: seconds passed to ``requests.get`` so a stalled connection
            cannot tie up a worker indefinitely.

    Returns:
        The id chosen by get_highest_similarity_track(), or ``None`` when the
        request fails, the response is unusable, or nothing matches.
    """
    search_url = "https://api.deezer.com/search/track"
    query = f"{track_name} artist:\"{artist_name}\""
    try:
        response = requests.get(search_url, params={"q": query}, timeout=timeout)
        response.raise_for_status()
        payload = response.json()
    # ValueError covers a body that is not valid JSON.
    except (requests.RequestException, ValueError) as e:
        print(f"Request failed: {e}")
        return None

    if not isinstance(payload, dict):
        return None

    # NOTE(review): Deezer appears to report API-level failures (e.g. quota
    # exceeded) as an "error" object in the body -- surface it instead of
    # silently treating it as "no match".
    if 'error' in payload:
        print(f"Request failed: {payload['error']}")
        return None

    tracks = payload.get('data', [])
    if not tracks:
        return None
    return get_highest_similarity_track(tracks, track_name)
    
def get_deezer_track_info(track_id, timeout=10):
    """Fetch metadata for one Deezer track.

    Args:
        track_id: Deezer track id; a falsy value short-circuits to ``{}``.
        timeout: seconds passed to ``requests.get``.

    Returns:
        dict with keys 'title', 'artist', 'featured_artists' (comma-separated
        contributor names other than the main artist, or ``None``),
        'duration', 'album', 'preview_url' and 'link'. An empty dict is
        returned when the request fails or the response is unusable.
    """
    if not track_id:
        return {}
    # Local name chosen so it does not shadow the module-level Last.fm base_url.
    track_url = f"https://api.deezer.com/track/{track_id}"
    try:
        response = requests.get(track_url, timeout=timeout)
        response.raise_for_status()
        data = response.json()
    # ValueError covers a body that is not valid JSON.
    except (requests.RequestException, ValueError):
        return {}

    # An "error" object in the body means there is no track data to read.
    if not isinstance(data, dict) or 'error' in data:
        return {}

    # `or {}` / `or []` guard against keys that are present but null.
    main_artist = (data.get("artist") or {}).get("name", "")
    contributors = [
        contributor.get('name')
        for contributor in (data.get("contributors") or [])
        if contributor.get('name') and contributor.get('name') != main_artist
    ]
    featured_artists = ", ".join(contributors) if contributors else None

    return {
        "title": data.get("title"),
        "artist": main_artist,
        "featured_artists": featured_artists,
        "duration": data.get("duration"),
        "album": (data.get("album") or {}).get("title"),
        "preview_url": data.get("preview"),
        "link": data.get("link")
    }

After getting the results from Deezer, we run librosa on the audio files to extract the features below.

In [4]:
def extract_librosa_features_from_url(url, timeout=30):
    """Download an audio file and compute summary librosa features for it.

    Args:
        url: address of the audio file (written to a temporary ``.mp3`` file).
        timeout: seconds passed to ``requests.get`` so a stalled download
            cannot tie up a worker indefinitely.

    Returns:
        dict with:
          'mfcc'              - MFCCs averaged over frames (one value per coefficient)
          'chroma'            - chroma STFT averaged over frames
          'rms'               - mean RMS energy
          'spectral_centroid' - mean spectral centroid
          'zcr'               - mean zero-crossing rate
          'tempo'             - tempo estimate from ``librosa.beat.beat_track``

    Raises:
        Exception: if the download does not return HTTP 200.
        Any exception raised by requests or librosa is propagated.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to download audio file from {url}")

    # delete=False: the file is closed before librosa reopens it by path and
    # is removed explicitly in the finally block.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".mp3")
    temp_file_path = temp_file.name

    try:
        # The context manager closes the handle even if the write fails, so
        # the removal below never runs against a still-open file.
        with temp_file:
            temp_file.write(response.content)

        # sr=None keeps the file's native sampling rate.
        y, sr = librosa.load(temp_file_path, sr=None)

        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)

        features = {
            'mfcc': np.mean(librosa.feature.mfcc(y=y, sr=sr).T, axis=0),
            'chroma': np.mean(librosa.feature.chroma_stft(y=y, sr=sr).T, axis=0),
            'rms': np.mean(librosa.feature.rms(y=y)),
            'spectral_centroid': np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
            'zcr': np.mean(librosa.feature.zero_crossing_rate(y)),
            'tempo': tempo
        }
    finally:
        os.remove(temp_file_path)

    return features

process_song() puts all the steps above together.

In [5]:
def process_song(track_name, artist_name):
    """Run the full pipeline for one song: Deezer lookup, metadata, audio features.

    Args:
        track_name: track title to search for.
        artist_name: artist name to search for.

    Returns:
        dict that always contains 'Song' and 'Artist'. When a Deezer preview
        is available it also holds the get_deezer_track_info() fields and,
        if feature extraction succeeds, the librosa features. A failed
        extraction is printed and the metadata is returned without features.
    """
    base_record = {'Song': track_name, 'Artist': artist_name}

    track_id = search_deezer_track(track_name, artist_name)
    track_info = get_deezer_track_info(track_id)
    # Nothing to analyse without a preview URL.
    if not track_info or not track_info.get('preview_url'):
        return base_record

    try:
        features = extract_librosa_features_from_url(track_info['preview_url'])
    except Exception as e:
        print(f"Error processing {track_name}: {e}")
        features = {}

    return {**base_record, **track_info, **features}


main() was run on all our unique songs. It was very resource-heavy, since we had to call the APIs and run librosa, and the whole process took close to 30 hours.

In [8]:
def main(df_songs, output_path="../../Downloads/Impute3.xlsx", max_workers=20):
    """Process every song in ``df_songs`` concurrently and save the results to Excel.

    Args:
        df_songs: DataFrame with 'track_name_original' and
            'artist_name_original' columns.
        output_path: Excel file the combined results are written to.
        max_workers: size of the thread pool running process_song().

    Returns:
        DataFrame with one row per input song, in completion order (not input
        order). The same frame is written to ``output_path``.
    """
    tracks_and_artists = list(
        zip(df_songs['track_name_original'], df_songs['artist_name_original'])
    )

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_song = {
            executor.submit(process_song, track, artist): (track, artist)
            for track, artist in tracks_and_artists
        }

        for future in tqdm(as_completed(future_to_song), total=len(tracks_and_artists), desc="Processing Songs"):
            track, artist = future_to_song[future]
            try:
                results.append(future.result())
            except Exception as e:
                print(f"Error processing {track} by {artist}: {e}")
                # Keep a placeholder row (same shape process_song() returns when
                # no preview exists) so failed songs stay visible in the output.
                results.append({'Song': track, 'Artist': artist})

    results_df = pd.DataFrame(results)
    results_df.to_excel(output_path, index=False)
    return results_df


# try_df = pd.read_excel("../../Downloads/BT4222ProjectExcel/unique_songs_audio_hasnull.xlsx")
# try_df = try_df[['track_name_original', 'artist_name_original']]
# main(try_df[100000:150000])


Processing Songs:  11%|█▏        | 2605/23126 [1:32:08<4:13:33,  1.35it/s] 

Error processing Riding With Death: Failed to download audio file from https://cdns-preview-f.dzcdn.net/stream/c-f8b74a33c6c64c29a2c4e9abfdda5bbd-0.mp3


Processing Songs:  18%|█▊        | 4238/23126 [2:25:06<7:49:48,  1.49s/it] 

Error processing Witch Hunt: Failed to download audio file from https://cdns-preview-f.dzcdn.net/stream/c-f8b74a33c6c64c29a2c4e9abfdda5bbd-0.mp3


Processing Songs:  20%|██        | 4649/23126 [2:38:04<13:10:59,  2.57s/it]

Error processing Forgotten: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))


Processing Songs:  22%|██▏       | 5118/23126 [2:52:51<6:59:50,  1.40s/it] 

Error processing Darkeve Duet: HTTPSConnectionPool(host='cdns-preview-0.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-056f82fd98a073229f4c682716704c34-3.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207D59ACEE0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  22%|██▏       | 5122/23126 [2:52:55<4:58:41,  1.00it/s]

Error processing Come With Me (Unreleased): HTTPSConnectionPool(host='cdns-preview-c.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-c810999483a2bd14bc7b37417fa643ef-7.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207D45FC580>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
Error processing F.T.W.: HTTPSConnectionPool(host='cdns-preview-2.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-26f6a971e73b1354e87aaa021050696a-2.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207DAD8C9D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because conne

Processing Songs:  22%|██▏       | 5123/23126 [2:52:57<6:41:29,  1.34s/it]

Error processing Higher: HTTPSConnectionPool(host='cdns-preview-3.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-35993c83c7915821a27c9aadeaf446c5-6.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207CDFB7040>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  22%|██▏       | 5126/23126 [2:52:58<3:51:29,  1.30it/s]

Error processing Everything is broken (withoute feature): HTTPSConnectionPool(host='cdns-preview-6.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-62a2b69b87860827a9368e1967c99b68-6.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207CDF1E910>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  22%|██▏       | 5127/23126 [2:52:59<3:33:08,  1.41it/s]

Error processing Redford (for Yia Yia & Pappou): ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))


Processing Songs:  22%|██▏       | 5132/23126 [2:53:09<8:06:12,  1.62s/it] 

Error processing Don't Play (ft. Big Sean & The 1975): HTTPSConnectionPool(host='cdns-preview-5.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-5f61637d08195b36cf8ef36fb971a9c2-1.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207EBEA8760>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  22%|██▏       | 5134/23126 [2:53:16<11:53:30,  2.38s/it]

Error processing With A Thousand Words To Say But One: HTTPSConnectionPool(host='cdns-preview-3.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-365b8cce46c0f75cc5fb848f52924cde-3.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207D466FEE0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  22%|██▏       | 5137/23126 [2:53:21<11:27:08,  2.29s/it]

Error processing Palm Tree: HTTPSConnectionPool(host='cdns-preview-4.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-40c416902851bf108b34a079d7d5203f-2.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207EBB46820>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


  return pitch_tuning(
Processing Songs:  35%|███▌      | 8167/23126 [4:28:13<2:47:04,  1.49it/s] 

Error processing Skid: Failed to download audio file from https://cdns-preview-4.dzcdn.net/stream/c-44056fe5e9f222a84eec47dcb4b844ee-0.mp3


Processing Songs:  43%|████▎     | 9851/23126 [5:20:45<3:13:35,  1.14it/s] 

Error processing Our Song: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))


Processing Songs:  86%|████████▌ | 19833/23126 [10:39:33<2:06:40,  2.31s/it]

Error processing Peso (Prod. By ASAP Ty Beats): ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))


Processing Songs:  86%|████████▌ | 19836/23126 [10:39:39<2:08:00,  2.33s/it]

Error processing So Gangsta: HTTPSConnectionPool(host='cdns-preview-9.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-9c1c7505edff77211ab8909bdf6d58b0-7.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x00000207E6784490>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  86%|████████▌ | 19837/23126 [10:39:45<3:10:26,  3.47s/it]

Error processing Barter 3: HTTPSConnectionPool(host='cdns-preview-6.dzcdn.net', port=443): Max retries exceeded with url: /stream/c-6ff170148c48a6e154458a4736cf6c5e-2.mp3 (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000002080FD813D0>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))


Processing Songs:  94%|█████████▍| 21726/23126 [11:40:00<44:18,  1.90s/it]  

Request failed: 403 Client Error: Forbidden for url: https://api.deezer.com/search/track?q=Orange+Alert+%28DFA+remix%29+artist%3A%22Metro+Area%22


Processing Songs: 100%|██████████| 23126/23126 [12:25:14<00:00,  1.93s/it]  
