In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD

## Loading of Data 

In [2]:
# Do Text based Representation TF - IDF (We have less context, not so resrouce_intensive)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity


df = pd.read_csv("../../Downloads/songs_feature_eng_pca.csv")
df = df[['track_name', "artist_name", "PC1", "PC2", "PC3", "PC4", "PC5", "PC6"]]


In [3]:
from annoy import AnnoyIndex
import pandas as pd
import numpy as np

pcs = df[['PC1', "PC2", "PC3", "PC4", "PC5", "PC6"]]

f = pcs.shape[1]

t = AnnoyIndex(f, 'angular')  # 'angular' is equivalent to cosine similarity

for i, vector in enumerate(pcs.to_numpy()):
    t.add_item(i, vector)

t.build(10) # Adjust more - more precise but take longer

top_5_similar = {i: [] for i in range(pcs.shape[0])}

for i in range(pcs.shape[0]):
    nearest = t.get_nns_by_item(i, 6, include_distances=True)
    
    indices, distances = nearest[0][1:], nearest[1][1:]
    
    similarities = [1 - d for d in distances]
    
    top_5_similar[i] = list(zip(indices, similarities))

for i in range(1, 6):
    df[f'Track_Name_{i}'] = np.nan
    df[f'Artist_Name_{i}'] = np.nan
    df[f'Similarity_{i}'] = np.nan

for idx, sims in top_5_similar.items():
    for i, (sim_idx, sim_score) in enumerate(sims, start=1):
        df.at[idx, f'Track_Name_{i}'] = df.at[sim_idx, 'track_name']
        df.at[idx, f'Artist_Name_{i}'] = df.at[sim_idx, 'artist_name']
        df.at[idx, f'Similarity_{i}'] = sim_score

df.to_csv("../../Downloads/BT4222ProjectExcel/songs_with_similarities_final.csv", index=False)



## Implicit

In [None]:
# from lightfm.data import Dataset
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset


lastfm_api_key = "97d5a64d5ba4a8bc580b752ceff3b87f"
lastfm_secret = "35175090bd61f6f16ac607bd26e5b1de"



In [4]:
import pandas as pd
import requests
from collections import deque
def get_lastfm_friends_bfs(start_username, api_key, min_users=5000):
    discovered = set([start_username]) 
    queue = deque([start_username])     
    collected_friends = []              

    while queue and len(collected_friends) < min_users:
        current_user = queue.popleft()
        url = f"http://ws.audioscrobbler.com/2.0/?method=user.getfriends&user={current_user}&api_key={api_key}&format=json"
        
        try:
            response = requests.get(url)
            data = response.json()
            
            if 'error' in data:
                print(f"Error fetching data for user {current_user}: {data['message']}")
                continue
            
            users = data.get('friends', {}).get('user', [])
            for user in users:
                friend_name = user['name']
                if friend_name not in discovered:
                    discovered.add(friend_name)
                    queue.append(friend_name)
                    collected_friends.append(friend_name)
                    if len(collected_friends) >= min_users:
                        break  

            print(f"Collected {len(collected_friends)} friends so far...")
        except Exception as e:
            print(f"An error occurred while processing user {current_user}: {e}")
    
    return collected_friends[:min_users]

def get_top_tracks_for_users(users, api_key):
    user_tracks = []
    user_artists = []
    user_playcounts = []
    user_ids = []
    
    for user in users:
        result = get_top_tracks(user, api_key)  
        tracks, artists, playcounts = [], [], []
        
        for item in result['toptracks']['track'][:10]: 
            tracks.append(item['name'])
            artists.append(item['artist']['name'])
            playcounts.append(item['playcount'])
        
        user_tracks.append(tracks)
        user_artists.append(artists)
        user_playcounts.append(playcounts)
        user_ids.append(user)
    
    df = pd.DataFrame({
        'User': user_ids,
        'Tracks': user_tracks,
        'Artists': user_artists,
        'Playcounts': user_playcounts
    })
    
    return df

def get_top_tracks(user, api_key):
    url = f"http://ws.audioscrobbler.com/2.0/?method=user.gettoptracks&user={user}&api_key={api_key}&format=json"
    response = requests.get(url)
    result = response.json()
    return result

start_username = "Bans77" 

users = get_lastfm_friends_bfs(start_username, lastfm_api_key, min_users=100)

df_top_tracks = get_top_tracks_for_users(users, lastfm_api_key)

print(df_top_tracks.head())


Collected 27 friends so far...
Error fetching data for user astonbrown: no such page
Collected 77 friends so far...
Error fetching data for user latenightcryout: no such page
Collected 100 friends so far...
              User                                             Tracks  \
0       astonbrown  [Bring Me Your Loves, Rebound, Digital Witness...   
1         liliwer7  [Fuck The Industry Pt. 2, Calling My Phone, Gl...   
2  latenightcryout  [I H3ART Y0U, Jealous, Romantic Homicide, Touc...   
3         cabnfver  [Dionysus, IDOL, Maneater, Mad World, PUMPED U...   
4  no_eyes_no_ears  [6 Five Heartbeats (feat. Vince Staples), Free...   

                                             Artists  \
0  [St. Vincent, Jennifer Lopez, St. Vincent, Ari...   
1  [YoungBoy Never Broke Again, Lil Tjay, 6lack, ...   
2  [BOY FANTASY, Eyedress, d4vd, Cigarettes After...   
3  [BTS, BTS, Nelly Furtado, Tears for Fears, 3TE...   
4  [The Alchemist, IceWear Vezzo, Gogetter, Veeze...   

                 

In [5]:
records = []
for i, row in df_top_tracks.iterrows():
    user = row['User']
    for track, artist, playcount in zip(row['Tracks'], row['Artists'], row['Playcounts']):
        track_artist = f"{track} - {artist}"
        records.append((user, track_artist, playcount))

df_flat = pd.DataFrame(records, columns=['User', 'Track_Artist', 'Playcount'])

dataset = Dataset()
dataset.fit(users=df_flat['User'].unique(),
            items=df_flat['Track_Artist'].unique())

(interactions, weights) = dataset.build_interactions([(x['User'], x['Track_Artist'], float(x['Playcount'])) for index, x in df_flat.iterrows()])

In [6]:
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares

df_flat['user_id_code'] = df_flat['User'].astype("category").cat.codes
df_flat['item_id_code'] = df_flat['Track_Artist'].astype("category").cat.codes

user_item_matrix = coo_matrix((df_flat['Playcount'].astype(np.float32),
                                (df_flat['user_id_code'], df_flat['item_id_code'])))

user_item_matrix_csr = user_item_matrix.tocsr()

item_user_matrix = user_item_matrix.T.tocsr()

In [7]:

model_implicit = AlternatingLeastSquares(factors=10, iterations=5, calculate_training_loss=True)
model_implicit.fit(user_item_matrix_csr)

user_id_map = dict(zip(df_flat['User'].astype("category"), df_flat['user_id_code']))
item_id_map = dict(zip(df_flat['Track_Artist'].astype("category"), df_flat['item_id_code']))

user_code_to_id_map = {v: k for k, v in user_id_map.items()}
item_code_to_id_map = {v: k for k, v in item_id_map.items()}

def recommend_implicit(user_id, model, user_item_matrix_csr, user_id_map, item_code_to_id_map, n_items=10):

    user_code = user_id_map.get(user_id)
    if user_code is None:
        raise ValueError(f"User ID {user_id} not found.")

    recommended, _ = model.recommend(user_code, user_item_matrix_csr[user_code], N=n_items)

    return [item_code_to_id_map.get(item_index, 'Unknown Item') for item_index in recommended]



recommended_tracks = recommend_implicit("SolarSerenity", model_implicit, user_item_matrix_csr, user_id_map, item_code_to_id_map)
print(f"Recommended tracks for user Bans77 using Implicit: {recommended_tracks}")


  check_blas_config()


  0%|          | 0/5 [00:00<?, ?it/s]

Recommended tracks for user Bans77 using Implicit: ['How to disappear - Lana Del Rey', 'A&W - Lana Del Rey', 'Pistol - Cigarettes After Sex', 'Fuck it I love you - Lana Del Rey', 'Sweet - Lana Del Rey', 'Paris, Texas (feat. SYML) - Lana Del Rey', 'Kintsugi - Lana Del Rey', 'If You Lie Down With Me - Lana Del Rey', 'Sunset - Caroline Polachek', 'Happiness is a butterfly - Lana Del Rey']
