In [2]:
import pandas as pd

def extract_primary_artist(artist_string: str) -> str:
    """Return the leading entry of a comma-separated artist credit.

    Everything from the first comma onwards is discarded and the remaining
    text is trimmed of surrounding whitespace. A string without a comma is
    returned trimmed but otherwise unchanged.
    """
    leading_entry, _, _ = artist_string.partition(",")
    return leading_entry.strip()

def normalize_artist_name(artist: str) -> str:
    """Produce a comparison key for an artist name.

    Surrounding whitespace is removed first, then the result is lower-cased,
    so names differing only in case or padding map to the same key.
    """
    trimmed = artist.strip()
    return trimmed.lower()

def main():
    """Build data/unique_artists.csv from the liked-tracks and liked-albums exports.

    Reads the "Artist Name(s)" column of data/liked.csv and
    data/liked_albums.csv, reduces every entry to its primary (first listed)
    artist, and writes one row per distinct artist under the header "Artist".

    Artists are considered duplicates when their normalized names (see
    normalize_artist_name) match; the first spelling encountered is the one
    written to the file. Rows with a missing artist value are skipped.
    """
    # Load all datasets
    df_liked = pd.read_csv("data/liked.csv")
    df_fav_albums = pd.read_csv("data/liked_albums.csv")

    # Stack both artist columns (liked tracks first, so their spelling wins on
    # ties) and drop missing values: pandas represents them as float NaN, which
    # has no .split() and would crash extract_primary_artist.
    artist_credits = pd.concat(
        [df_liked["Artist Name(s)"], df_fav_albums["Artist Name(s)"]],
        ignore_index=True,
    ).dropna()

    # Extract primary artists; discard entries that are empty after trimming
    primary_artists = artist_credits.astype(str).apply(extract_primary_artist)
    primary_artists = primary_artists[primary_artists != ""]

    # Deduplicate on the normalized key so "Artist" and "artist " collapse into
    # one row, while the saved value keeps its original capitalisation.
    normalized_keys = primary_artists.apply(normalize_artist_name)
    all_artists = primary_artists[~normalized_keys.duplicated()]

    # Save to a new CSV
    all_artists.to_csv("data/unique_artists.csv", index=False, header=["Artist"])
    print(f"Saved {len(all_artists)} unique artists to data/unique_artists.csv")

# Run the export only when this code is executed directly (as a script or as a
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Saved 2126 unique artists to data/unique_artists.csv


In [3]:
import concurrent.futures
import os

import pandas as pd
import requests
from tqdm import tqdm  # For a nice progress bar

# Last.fm API credentials are taken from the environment so that they never
# end up in the notebook source or its version history.
# NOTE(review): the values that were previously hard-coded here should be
# treated as leaked and rotated in the Last.fm account settings.
API_KEY = os.environ.get("LASTFM_API_KEY")
if not API_KEY:
    raise RuntimeError("Set the LASTFM_API_KEY environment variable before running this cell.")
# The shared secret is not referenced by the code in this cell; it is kept as
# an optional setting so any later use of the name still resolves.
SHARED_SECRET = os.environ.get("LASTFM_SHARED_SECRET", "")

# Read in the list of unique artists
unique_artists = pd.read_csv('data/unique_artists.csv')

# Function to get similar artists from Last.fm
def get_similar_artists(artist_name, timeout=10):
    """Look up the artists Last.fm lists as similar to ``artist_name``.

    Args:
        artist_name: Artist to query, passed to the ``artist.getsimilar`` method.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A ``(artist_name, similar_names)`` tuple. ``similar_names`` is a list
        of name strings and is empty when the response carries no similar
        artists or when the request or JSON decoding fails (the failure is
        printed, not raised).
    """
    # Let requests build the query string: names containing "&", "#", "+" or
    # "?" would otherwise be cut off or split into bogus parameters.
    params = {
        "method": "artist.getsimilar",
        "artist": artist_name,
        "api_key": API_KEY,
        "format": "json",
    }
    try:
        # HTTPS keeps the API key out of clear text; the timeout stops one
        # stalled connection from blocking its worker thread indefinitely.
        response = requests.get(
            "https://ws.audioscrobbler.com/2.0/", params=params, timeout=timeout
        )
        data = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"Error fetching data for {artist_name}: {e}")
        return artist_name, []

    # Handle case where no similar artists are found (or an error payload
    # without a "similarartists" section was returned).
    similar_section = data.get("similarartists") if isinstance(data, dict) else None
    entries = similar_section.get("artist", []) if isinstance(similar_section, dict) else []

    # NOTE(review): Last.fm's JSON has been reported to return a lone object
    # rather than a one-element list for single results; confirm for this
    # method. Wrapping it keeps that result instead of losing it.
    if isinstance(entries, dict):
        entries = [entries]

    similar_artists = [
        entry["name"]
        for entry in entries
        if isinstance(entry, dict) and "name" in entry
    ]
    return artist_name, similar_artists

# Accumulator for the output rows: the fetch loop below appends one
# {"Artist": ..., "SimilarArtist": ...} dict per (artist, similar artist) pair.
similar_artists_data = []

# Worker run once per artist by the thread pool
def process_artist(artist):
    """Return ``(artist, similar_artist)`` tuples for one artist.

    Delegates the lookup to get_similar_artists and pairs the name it reports
    back with every similar artist it found; the list is empty when there are
    none.
    """
    source_name, neighbours = get_similar_artists(artist)
    pairs = []
    for neighbour in neighbours:
        pairs.append((source_name, neighbour))
    return pairs

# Using ThreadPoolExecutor for concurrent requests: the work is network-bound,
# so up to 10 lookups are kept in flight at once.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    # Submit tasks to the executor for each artist
    futures = {executor.submit(process_artist, artist): artist for artist in unique_artists['Artist']}

    # Set up a progress bar; results are consumed as they finish, so rows are
    # appended in completion order rather than input order.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc="Fetching similar artists"):
        for artist_name, similar_artist in future.result():
            similar_artists_data.append({"Artist": artist_name, "SimilarArtist": similar_artist})

# Convert to DataFrame and save as CSV. The columns are pinned explicitly so
# that a run which collected no pairs still writes a header row instead of an
# empty file that downstream read_csv calls cannot parse.
similar_artists_df = pd.DataFrame(similar_artists_data, columns=["Artist", "SimilarArtist"])
similar_artists_df.to_csv('data/liked_artists_only_similar.csv', index=False)

print("Saved similar artists data to data/liked_artists_only_similar.csv")


Fetching similar artists: 100%|██████████| 2126/2126 [07:40<00:00,  4.62it/s]


Saved similar artists data to data/liked_artists_only_similar.csv
