In [1]:
import requests
import json
import pandas as pd
# import librosa
import numpy as np
# import lyricsgenius
# import langdetect
import re
import tempfile
import sys,os,os.path
import requests
from bs4 import BeautifulSoup
from collections import deque
from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

LASTFM_API_KEY = os.environ['lastfm_api_key']
LASTFM_SECRET = os.environ['lastfm_secret']

# Dataset (uncleaned user w song)  
# users - top - liked
# 10k users - 100k unique songs?

### LastFM API to get Song info

In [3]:
base_url = 'http://ws.audioscrobbler.com/2.0/'

def lastfm_get(payload):
    headers = {'user-agent': 'DataCollectorBot'}
    payload['api_key'] = LASTFM_API_KEY
    payload['format'] = 'json'
    response = requests.get(base_url, headers=headers, params=payload)
    return response.json()

def get_song_details(artist_name, track_name):
    payload = {
        'method': 'track.getInfo',
        'artist': artist_name,
        'track': track_name,
    }
    json_response = lastfm_get(payload)
    return json_response.get('track', {})

def get_artist_details(artist_name):
    payload = {
        'method': 'artist.getInfo',
        'artist': artist_name,
    }
    json_response = lastfm_get(payload)
    return json_response.get('artist', {})

def get_recommendations(artist_name, track_name, limit=20):
    payload = {
        'method': 'track.getSimilar',
        'artist': artist_name,
        'track': track_name,
        'limit': limit,
    }
    json_response = lastfm_get(payload)
    return json_response.get('similartracks', {}).get('track', [])


### LastFM API to get top / trending tracks (Global and by country)
def get_global_top_tracks(limit=50):
    payload = {
        'method': 'chart.getTopTracks',
        'limit': limit,
    }
    json_response = lastfm_get(payload)
    return json_response.get('tracks', {}).get('track', [])

def get_top_tracks_by_country(country, limit=50):
    payload = {
        'method': 'geo.getTopTracks',
        'country': country,
        'limit': limit,
    }
    json_response = lastfm_get(payload)
    return json_response.get('tracks', {}).get('track', [])

### Main function for song info (Non spotify)

In [None]:
# def combine_song_data(song_ids):
#     # Placeholder for the token retrieval and authentication methods
#     # Since the actual implementation depends on the specific API being used
#     # You would replace this with Deezer authentication if necessary
#     token = "YOUR_DEEZER_TOKEN_HERE"  # Placeholder token

#     songs_data = []
#     for song_id in song_ids:
#         # Get track details from Deezer
#         track_info = get_deezer_track_info(song_id)

#         # Since Deezer does not provide all the desired audio features directly,
#         # we need to use `librosa` for analyzing the audio and extracting features
#         preview_url = track_info.get("preview_url")
#         if preview_url:
#             librosa_features = extract_librosa_features_from_url(preview_url)
#         else:
#             librosa_features = {}

#         # Lyrics and language detection
#         artist_name = track_info["artist"]
#         song_name = track_info["title"]
#         lyrics = fetch_and_clean_lyrics(artist_name, song_name)
#         language = langdetect.detect_langs(lyrics) if lyrics != "Lyrics not found" else None
#         # language detection here depends on the actual content and method used for detection
        
#         # Combining data into a dictionary
#         song_data = {
#             "song_id": song_id,
#             "song_title": track_info.get("title"),
#             "song_album": track_info.get("album"),
#             "artist": track_info.get("artist"),
#             "featured_artists": track_info.get("featured_artists"),
#             "duration": track_info.get("duration"),
#             # "genre": Not directly provided; might use LastFM or manual mapping
#             # "available_markets", "release_date", "artist_popularity": Not available from Deezer directly
#             "preview_path": preview_url,
#             "lyrics": lyrics,
#             'language': language,
#             **librosa_features  # This unpacks the mfcc, chroma, beat_times directly into the dictionary
#         }
        
#         # Additional details like genre, popularity, etc., would need to be fetched from other APIs or manually filled
#         songs_data.append(song_data)
    
#     # Creating a DataFrame from the combined data
#     df = pd.DataFrame(songs_data)
#     return df


# # Example song IDs for Deezer; replace these with actual Deezer track IDs you want to analyze
# song_ids = ["1109731"]  # Example Deezer track IDs
# df = combine_song_data(song_ids)
# print(df)

### friends friends_loved_tracks

In [None]:

def get_lastfm_friends_loved_tracks(start_username, api_key, min_users=100, tracks_per_user=50):
    discovered = set([start_username])
    queue = deque([start_username])
    users_loved_tracks = {}  # Dictionary to hold users and their loved tracks

    while queue and len(users_loved_tracks) < min_users:
        current_user = queue.popleft()
        # Fetch friends
        friends_url = f"http://ws.audioscrobbler.com/2.0/?method=user.getfriends&user={current_user}&api_key={api_key}&format=json"
        
        try:
            response = requests.get(friends_url)
            data = response.json()
            
            if 'error' in data:
                print(f"Error fetching data for user {current_user}: {data['message']}")
                continue
            
            users = data.get('friends', {}).get('user', [])
            for user in users:
                friend_name = user['name']
                if friend_name not in discovered:
                    discovered.add(friend_name)
                    queue.append(friend_name)
                    # Fetch loved tracks for this friend
                    tracks_url = f"http://ws.audioscrobbler.com/2.0/?method=user.getlovedtracks&user={friend_name}&api_key={api_key}&format=json&limit={tracks_per_user}"
                    tracks_response = requests.get(tracks_url)
                    tracks_data = tracks_response.json()
                    
                    if 'error' not in tracks_data:
                        loved_tracks = tracks_data.get('lovedtracks', {}).get('track', [])
                        users_loved_tracks[friend_name] = [track['name'] for track in loved_tracks]

            print(f"Collected {len(users_loved_tracks)} users' loved tracks so far...")
        except Exception as e:
            print(f"An error occurred while processing user {current_user}: {e}")
            continue
    
    return users_loved_tracks


api_key = LASTFM_API_KEY
start_username = 'Bans77'
users_loved_tracks = get_lastfm_friends_loved_tracks(start_username, api_key)
print(f"Collected loved tracks for {len(users_loved_tracks)} users.")

### scraping

In [None]:
def get_lastfm_friends_bfs(start_username, api_key, min_users=5000):
    discovered = set([start_username])  # Users that have been discovered
    queue = deque([start_username])     # Users to be explored
    collected_friends = []              # Collected friends

    while queue and len(collected_friends) < min_users:
        current_user = queue.popleft()
        url = f"http://ws.audioscrobbler.com/2.0/?method=user.getfriends&user={current_user}&api_key={api_key}&format=json"
        
        try:
            response = requests.get(url)
            data = response.json()
            
            if 'error' in data:
                print(f"Error fetching data for user {current_user}: {data['message']}")
                continue
            
            users = data.get('friends', {}).get('user', [])
            for user in users:
                friend_name = user['name']
                if friend_name not in discovered:
                    discovered.add(friend_name)
                    queue.append(friend_name)
                    collected_friends.append(friend_name)
                    if len(collected_friends) >= min_users:
                        break  # Stop if we have collected enough friends

            print(f"Collected {len(collected_friends)} friends so far...")
        except Exception as e:
            print(f"An error occurred while processing user {current_user}: {e}")
    
    return collected_friends[:min_users]

# Example usage
# api_key = LASTFM_API_KEY
# start_username = 'Bans77'
# friends = get_lastfm_friends_bfs(start_username, api_key)
# print(f"Collected {len(friends)} unique friends.")


In [None]:

LAST_FM_URL = "https://www.last.fm/user/"


def get_following_user_names(curr_user_name):
  curr_user_following_URL = LAST_FM_URL + curr_user_name + "/following"
  text = BeautifulSoup(requests.get(curr_user_following_URL).text)

  return text



def get_followers_user_names(curr_user_name):
  curr_user_followers_URL = LAST_FM_URL + curr_user_name + "/followers"
  text = BeautifulSoup(requests.get(curr_user_followers_URL).text)

  return text


def get_users(): # will cause request timeout
  users = set()
  queue = deque(["bans77", "bouquet-of-sun", "chippy_boi", "joehoots", "czmiles-gb", "bunnycakex", "jakeledoux"])

  while len(queue) != 0:
    curr_size = len(queue)
    for i in range(0, curr_size):
      curr_user = queue.popleft()
      curr_user_follower_text = get_followers_user_names(curr_user)
      for h4 in curr_user_follower_text.find_all(class_="user-list-name"):
        new_user = h4.a.text
        if new_user not in users:
          queue.append(new_user)
      curr_user_follower_text = get_following_user_names(curr_user)
      for h4 in curr_user_follower_text.find_all(class_="user-list-name"):
        new_user = h4.a.text
        if new_user not in users:
          queue.append(new_user)
      users.add(curr_user)

  return users

def get_loved_tracks(username):
  loved_tracks_URL = LAST_FM_URL + username + "/loved"
  text = BeautifulSoup(requests.get(loved_tracks_URL).text)

  track_tag = text.find_all('td', class_='chartlist-name')
  tracks = [tag.a.text for tag in track_tag]

  return tracks


def get_data():
  users = get_users()
  data = {}
  for user in users:
    data[user] = get_loved_tracks(user)


  return data

### Last.FM API for user data

In [2]:
base_url = 'http://ws.audioscrobbler.com/2.0/'

def lastfm_get(payload):
    headers = {'user-agent': 'DataCollectorBot'}
    payload['api_key'] = LASTFM_API_KEY
    payload['format'] = 'json'
    response = requests.get(base_url, headers=headers, params=payload)
    return response.json()

# Function for each endpoint
def get_user_info(user):
    payload = {'method': 'user.getinfo', 'user': user}
    return lastfm_get(payload)

def get_loved_tracks(user):
    payload = {'method': 'user.getlovedtracks', 'user': user}
    return lastfm_get(payload)

def get_personal_tags(user):
    """user (Required) : The user who performed the taggings.
    tag (Required) : The tag you're interested in.
    taggingtype[artist|album|track] (Required) : The type of items which have been tagged
    ...
    """
    payload = {'method': 'user.getPersonalTags', 'user': user}
    return lastfm_get(payload)

def get_recent_tracks(user):
    payload = {'method': 'user.getrecenttracks', 'user': user}
    return lastfm_get(payload)

def get_top_albums(user):
    payload = {'method': 'user.gettopalbums', 'user': user}
    return lastfm_get(payload)

def get_top_artists(user):
    payload = {'method': 'user.gettopartists', 'user': user}
    return lastfm_get(payload)

def get_top_tags(user):
    payload = {'method': 'user.gettoptags', 'user': user}
    return lastfm_get(payload)

def get_top_tracks(user):
    payload = {'method': 'user.gettoptracks', 'user': user}
    return lastfm_get(payload)

def get_weekly_album_chart(user):
    payload = {'method': 'user.getweeklyalbumchart', 'user': user}
    return lastfm_get(payload)

def get_weekly_artist_chart(user):
    payload = {'method': 'user.getweeklyartistchart', 'user': user}
    return lastfm_get(payload)

def get_weekly_chart_list(user):
    payload = {'method': 'user.getweeklychartlist', 'user': user}
    return lastfm_get(payload)

def get_weekly_track_chart(user):
    payload = {'method': 'user.getweeklytrackchart', 'user': user}
    return lastfm_get(payload)

# username = 'joehoots'
# user_data = {
#     'info': get_user_info(username),
#     'loved_tracks': get_loved_tracks(username),
#     # 'personal_tags': get_personal_tags(username),
#     'recent_tracks': get_recent_tracks(username),
#     'top_albums': get_top_albums(username),
#     'top_artists': get_top_artists(username),
#     'top_tags': get_top_tags(username),
#     'top_tracks': get_top_tracks(username),
#     # 'weekly_album_chart': get_weekly_album_chart(username),
#     # 'weekly_artist_chart': get_weekly_artist_chart(username),
#     # 'weekly_chart_list': get_weekly_chart_list(username),
#     # 'weekly_track_chart': get_weekly_track_chart(username)
# }


In [3]:
user_data

{'info': {'user': {'name': 'joehoots',
   'age': '0',
   'subscriber': '0',
   'realname': 'Hoots',
   'bootstrap': '0',
   'playcount': '130988',
   'artist_count': '1184',
   'playlists': '0',
   'track_count': '16889',
   'album_count': '2628',
   'image': [{'size': 'small',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/34s/41c9518e2213898c127494f6746ec3b1.png'},
    {'size': 'medium',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/64s/41c9518e2213898c127494f6746ec3b1.png'},
    {'size': 'large',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/174s/41c9518e2213898c127494f6746ec3b1.png'},
    {'size': 'extralarge',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/41c9518e2213898c127494f6746ec3b1.png'}],
   'registered': {'unixtime': '1484237303', '#text': 1484237303},
   'country': 'United Kingdom',
   'gender': 'n',
   'url': 'https://www.last.fm/user/joehoots',
   'type': 'user'}},
 'loved_tracks': {'lovedtracks': {'track': [{'artist': {'url': 'https:/

In [20]:
# TESTING
infokeys = []
for k in user_data.keys():
    infokeys.extend(user_data[k].keys())
infokeys

# {'message': 'User not found', 'error': 6},
# 'JekabsBalodis', 'Bans77', 'BloodOnLeaves', 'chippy_boi', 'joehoots', ....

['user',
 'lovedtracks',
 'recenttracks',
 'topalbums',
 'topartists',
 'toptags',
 'toptracks']

In [None]:
recent_tracks = user_data['recent_tracks']

# Print each track with its listen timestamp
for track in recent_tracks['recenttracks']['track']:
    track_name = track['name']
    artist_name = track['artist']['#text']
    # Check if the 'date' key exists
    if 'date' in track:
        listen_time = track['date']['#text']  # Human-readable timestamp
        listen_timestamp = track['date']['uts']  # Unix timestamp
        print(f"{artist_name} - {track_name} listened to at {listen_time} (timestamp: {listen_timestamp})")
    else:
        # If 'date' key does not exist, it might be currently playing
        print(f"{artist_name} - {track_name} is currently playing or missing timestamp data.")

### func for main dataset

In [3]:
min_users = 10000
min_songs = 100000
users_songs = {}  # This will store user: {"top_50": [], "loved": []}
all_songs = set()

def get_lastfm_data(start_username, api_key):
    discovered = set([start_username])
    queue = deque([start_username])

    while queue and (len(users_songs) < min_users or len(all_songs) < min_songs):
        current_user = queue.popleft()
        
        friends_url = f"http://ws.audioscrobbler.com/2.0/?method=user.getfriends&user={current_user}&api_key={api_key}&format=json"
        try:
            friends_response = requests.get(friends_url)
            friends_data = friends_response.json()
            
            if 'error' in friends_data:
                print(f"Error fetching friends for user {current_user}: {friends_data['message']}")
                continue
            
            for user in friends_data.get('friends', {}).get('user', []):
                friend_name = user['name']
                if friend_name not in discovered:
                    discovered.add(friend_name)
                    queue.append(friend_name)

                    loved_tracks_url = f"http://ws.audioscrobbler.com/2.0/?method=user.getlovedtracks&user={friend_name}&api_key={api_key}&format=json"
                    top_tracks_url = f"http://ws.audioscrobbler.com/2.0/?method=user.gettoptracks&user={friend_name}&api_key={api_key}&format=json&limit=50"
                    # recent_tracks_url = f"http://ws.audioscrobbler.com/2.0/?method=user.getrecenttracks&user={friend_name}&api_key={api_key}&format=json&limit=50"
                    
                    loved_tracks_response = requests.get(loved_tracks_url)
                    top_tracks_response = requests.get(top_tracks_url)
                    # recent_tracks_response = requests.get(recent_tracks_url)

                    user_info = get_user_info(friend_name)
                    loved_tracks_data = loved_tracks_response.json()
                    top_tracks_data = top_tracks_response.json()
                    # recent_tracks_data = recent_tracks_response.json()
                    
                    # user exists
                    if (len(user_info) == 1) and ('error' not in loved_tracks_data) and ('error' not in top_tracks_data):
                        # and ('error' not in recent_tracks_data)

                        # 'name', 'artist' for loved_tracks, may be empty
                        loved_tracks = [(track['name'], track['artist']['name']) for track in loved_tracks_data.get('lovedtracks', {}).get('track', [])]

                        # ['name', 'artist', 'rank', 'playcount'] for top_track
                        top_tracks = [(track['name'], track['artist']['name'], track['@attr']['rank'], track['playcount']) for track in top_tracks_data.get('toptracks', {}).get('track', [])]
                        update_top = [(item[0], item[1]) for item in top_tracks]
                       
                        #    """taking too long, TODO it seperately"""
                        # ['name', 'artist', 'date']  for recent_tracks, date could be empty
                        # recent_tracks = [(track['name'], track['artist']['#text'], track['date']['#text']) for track in recent_tracks_data.get('recenttracks', {}).get('track', [])]
                        # update_recent = [(item[0], item[1]) for item in recent_tracks]

                        users_songs[friend_name] = {"user_info": user_info.get('user', []), "top_50": top_tracks, "loved": loved_tracks, 
                                                    # "recent50": recent_tracks
                                                    }

                        all_songs.update(loved_tracks, update_top, 
                                        #  update_recent
                                         )

            print(f"Collected {len(users_songs)} users and {len(all_songs)} unique songs so far...")

        except Exception as e:
            print(f"An error occurred while processing user {current_user}: {e}")
            continue
        
    df = pd.DataFrame([(user, data['user_info'], data['top_50'], data['loved'], 
                        # data['recent50']
                        ) for user, data in users_songs.items()], columns=['Username', 'User Info', 'Top 50 Songs', 'Liked Songs', 
                                                                        #    'Recent Songs'
                                                                           ])
    
    # print(len(users_songs))
        
    return users_songs, df

In [4]:
%%time
# 313m 20.5s

user_songs, df = get_lastfm_data("chippy_boi", LASTFM_API_KEY)

# start_username = ['JekabsBalodis', 'Bans77', 'BloodOnLeaves', 'chippy_boi', 'joehoots', "bouquet-of-sun"]


Collected 33 users and 2733 unique songs so far...
Collected 40 users and 3141 unique songs so far...
Collected 74 users and 5402 unique songs so far...
Error fetching friends for user riddy_pr: no such page
Error fetching friends for user TheGirlNamedSig: no such page
Collected 124 users and 8816 unique songs so far...
Collected 125 users and 8864 unique songs so far...
Collected 136 users and 9466 unique songs so far...
Error fetching friends for user Brotendo: no such page
Collected 184 users and 12504 unique songs so far...
Collected 233 users and 15399 unique songs so far...
Collected 281 users and 18535 unique songs so far...
Collected 331 users and 21259 unique songs so far...
Collected 381 users and 24310 unique songs so far...
Collected 416 users and 26250 unique songs so far...
Collected 459 users and 28483 unique songs so far...
Collected 507 users and 31591 unique songs so far...
Collected 518 users and 32009 unique songs so far...
Collected 568 users and 34308 unique songs

In [None]:
df.to_csv('lastfm_user_raw.csv', index=False)
print("Data saved to lastfm_user_raw.csv")

In [6]:
df

Unnamed: 0,Username,User Info,Top 50 Songs,Liked Songs
0,emosoup,"{'name': 'emosoup', 'age': '0', 'subscriber': ...","[(Higher, Sleep Token, 1, 1321), (The Love You...","[(Obsidian, Viscera), (yes, and?, Ariana Grand..."
1,gibelotte,"{'name': 'gibelotte', 'age': '0', 'subscriber'...","[(DAYWALKER! (feat. Corpse), Machine Gun Kelly...","[(Fragile, Young Cub), (Sarabeth's Song, Noah ..."
2,riddy_pr,"{'name': 'riddy_pr', 'age': '0', 'subscriber':...","[(Will We Talk?, Sam Fender, 1, 291), (Play, F...",[]
3,TheGirlNamedSig,"{'name': 'TheGirlNamedSig', 'age': '0', 'subsc...","[(Be Quiet and Drive (Far Away), Deftones, 1, ...","[(Beaver O'Lindy, Sparks), (Age of Consent - 2..."
4,eventually_,"{'name': 'eventually_', 'age': '0', 'subscribe...","[(Light, Tag Shai, 1, 6142), (Old You, New Me ...","[(Clock Hands, Harlow Road), (Waterslides (Ext..."
...,...,...,...,...
10032,Noodle703,"{'name': 'Noodle703', 'age': '0', 'subscriber'...","[(Hybrid, Siouxsie and the Banshees, 1, 247), ...","[(Fisherman's Blues, The Waterboys), (Plastic ..."
10033,fourohclok,"{'name': 'fourohclok', 'age': '0', 'subscriber...","[(VOID, Melanie Martinez, 1, 121), (Class Figh...","[(Me Pongo Loca, Kali Uchis), (Hey, IC3PEAK), ..."
10034,dedhedchemistry,"{'name': 'dedhedchemistry', 'age': '0', 'subsc...","[(Love Me Too Much, Car Seat Headrest, 1, 141)...","[(Sunny Day, Abandoned Pools), (Some Things La..."
10035,Txxinn,"{'name': 'Txxinn', 'age': '0', 'subscriber': '...","[(Pomkin Song, Vink2, 1, 1491), (Pomkin Song -...","[(yellow is the color of her eyes, Soccer Momm..."


In [7]:
users_songs

{'emosoup': {'user_info': {'name': 'emosoup',
   'age': '0',
   'subscriber': '1',
   'realname': 'Eli ☔',
   'bootstrap': '0',
   'playcount': '187276',
   'artist_count': '3386',
   'playlists': '0',
   'track_count': '13520',
   'album_count': '6171',
   'image': [{'size': 'small',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/34s/00e3122ba649ec9e9d50224bef33b25c.png'},
    {'size': 'medium',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/64s/00e3122ba649ec9e9d50224bef33b25c.png'},
    {'size': 'large',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/174s/00e3122ba649ec9e9d50224bef33b25c.png'},
    {'size': 'extralarge',
     '#text': 'https://lastfm.freetls.fastly.net/i/u/300x300/00e3122ba649ec9e9d50224bef33b25c.png'}],
   'registered': {'unixtime': '1440951545', '#text': 1440951545},
   'country': 'United States',
   'gender': 'n',
   'url': 'https://www.last.fm/user/emosoup',
   'type': 'subscriber'},
  'top_50': [('Higher', 'Sleep Token', '1', '1321'),
   ('The

In [8]:
%%time
min_users = 10000
min_songs = 100000
users_songs = {}  # This will store user: {"top_50": [], "loved": []}
all_songs = set()
user_songs2, df2 = get_lastfm_data("bouquet-of-sun", LASTFM_API_KEY)
df2.to_csv('lastfm_main_user2.csv', index = False)

Collected 32 users and 2475 unique songs so far...
Collected 82 users and 6148 unique songs so far...
Collected 130 users and 9646 unique songs so far...
An error occurred while processing user aria-amethyst: HTTPConnectionPool(host='ws.audioscrobbler.com', port=80): Max retries exceeded with url: /2.0/?method=user.getinfo&user=gabb0es&api_key=97d5a64d5ba4a8bc580b752ceff3b87f&format=json (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000002DC101F8A90>: Failed to establish a new connection: [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond'))
Collected 173 users and 12094 unique songs so far...
Collected 223 users and 14560 unique songs so far...
Collected 272 users and 16780 unique songs so far...
Collected 283 users and 17142 unique songs so far...
Collected 330 users and 19122 unique songs so far...
Collec