In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD

  from .autonotebook import tqdm as notebook_tqdm


## Loading of Data 

In [11]:
# Do Text based Representation TF - IDF (We have less context, not so resrouce_intensive)

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix, hstack
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

def parse_array(s):
    try:
        if pd.isna(s) or not isinstance(s, str):
            return np.array([])
        numbers = s.strip("[]").split()
        return np.array([float(num) for num in numbers])
    except ValueError:
        return np.array([])

df = pd.read_excel("../../Downloads/ReRun7.xlsx")
df = df[['Song', "Artist", "featured_artists", "duration", "album", "mfcc", 'chroma', 'rms', 'spectral_centroid', 'zcr', 'tempo']]
df = df.dropna(subset=['mfcc', 'chroma', 'rms', 'spectral_centroid', 'zcr', 'tempo'])
df['combined_text_features'] = df['Song'] + ' ' + df['Artist'] + " " + df['featured_artists'] + ' ' + df['album']
# Determine the size of non-text features by looking at the first non-empty row
# This assumes that all non-empty rows have features of the same size
non_text_features = ['mfcc', 'chroma', 'rms', 'spectral_centroid', 'zcr', 'tempo']
feature_sizes = {}
for feature in non_text_features:
    df[feature] = df[feature].apply(parse_array)
    
    for array in df[feature]:
        if array.size > 0:  
            feature_sizes[feature] = array.size
            break  

for feature, size in feature_sizes.items():
    df[feature] = df[feature].apply(lambda x: np.pad(x, (0, max(0, size - x.size)), mode='constant')[:size] if x.size > 0 else np.zeros(size))

df['combined_non_text_features'] = df.apply(lambda row: np.concatenate([row[feat] for feat in non_text_features]), axis=1)

scaler = StandardScaler()
scaled_non_text_features = scaler.fit_transform(np.stack(df['combined_non_text_features'].values))

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text_features'].values.astype('U'))

scaled_non_text_features_sparse = csr_matrix(scaled_non_text_features)
combined_features = hstack([tfidf_matrix, scaled_non_text_features_sparse])

svd = TruncatedSVD(n_components=1000) 
reduced_features = svd.fit_transform(combined_features)
cosine_sim_matrix = cosine_similarity(reduced_features)

In [12]:
def recommend_song(song_name, data, similarity_matrix, top_k=10):

    normalized_song_names = data['Song'].str.lower().str.strip()
    song_name_normalized = song_name.lower().strip()
    
    if song_name_normalized not in normalized_song_names.values:
        print(f"Song named '{song_name}' does not exist in the dataset.")
        return None
    
    song_idx = data.index[data['Song'] == song_name].tolist()[0]
    
    song_similarities = similarity_matrix[song_idx]
    similar_idxs = np.argsort(song_similarities)[::-1]
    
    top_k_idxs = similar_idxs[1:top_k + 1]
    
    top_similar_songs = data.iloc[top_k_idxs].copy()
    top_similar_songs['Similarity'] = song_similarities[top_k_idxs]
    
    return top_similar_songs

song_name = "I Don't Care"  
similar_songs = recommend_song(song_name, df, cosine_sim_matrix, top_k=5)
print(similar_songs[['Song', 'Artist', 'Similarity']])


                               Song          Artist  Similarity
44870              You're Beautiful     James Blunt    0.805199
8324   Wicked Game - Acoustic; Live      Stone Sour    0.758321
22668                     Sometimes  Britney Spears    0.753746
30463                        Circle        Greywind    0.744073
15597           Another Man's Woman      Supertramp    0.741546


## LightFM

In [3]:
# from lightfm.data import Dataset
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from lightfm import LightFM
from lightfm.data import Dataset


lastfm_api_key = "97d5a64d5ba4a8bc580b752ceff3b87f"
lastfm_secret = "35175090bd61f6f16ac607bd26e5b1de"



In [4]:
import pandas as pd
import requests
from collections import deque
def get_lastfm_friends_bfs(start_username, api_key, min_users=5000):
    discovered = set([start_username]) 
    queue = deque([start_username])     
    collected_friends = []              

    while queue and len(collected_friends) < min_users:
        current_user = queue.popleft()
        url = f"http://ws.audioscrobbler.com/2.0/?method=user.getfriends&user={current_user}&api_key={api_key}&format=json"
        
        try:
            response = requests.get(url)
            data = response.json()
            
            if 'error' in data:
                print(f"Error fetching data for user {current_user}: {data['message']}")
                continue
            
            users = data.get('friends', {}).get('user', [])
            for user in users:
                friend_name = user['name']
                if friend_name not in discovered:
                    discovered.add(friend_name)
                    queue.append(friend_name)
                    collected_friends.append(friend_name)
                    if len(collected_friends) >= min_users:
                        break  

            print(f"Collected {len(collected_friends)} friends so far...")
        except Exception as e:
            print(f"An error occurred while processing user {current_user}: {e}")
    
    return collected_friends[:min_users]

def get_top_tracks_for_users(users, api_key):
    user_tracks = []
    user_artists = []
    user_playcounts = []
    user_ids = []
    
    for user in users:
        result = get_top_tracks(user, api_key)  
        tracks, artists, playcounts = [], [], []
        
        for item in result['toptracks']['track'][:10]: 
            tracks.append(item['name'])
            artists.append(item['artist']['name'])
            playcounts.append(item['playcount'])
        
        user_tracks.append(tracks)
        user_artists.append(artists)
        user_playcounts.append(playcounts)
        user_ids.append(user)
    
    df = pd.DataFrame({
        'User': user_ids,
        'Tracks': user_tracks,
        'Artists': user_artists,
        'Playcounts': user_playcounts
    })
    
    return df

def get_top_tracks(user, api_key):
    url = f"http://ws.audioscrobbler.com/2.0/?method=user.gettoptracks&user={user}&api_key={api_key}&format=json"
    response = requests.get(url)
    result = response.json()
    return result

start_username = "Bans77" 

users = get_lastfm_friends_bfs(start_username, lastfm_api_key, min_users=100)

df_top_tracks = get_top_tracks_for_users(users, lastfm_api_key)

print(df_top_tracks.head())


Collected 27 friends so far...
Collected 33 friends so far...
Collected 83 friends so far...
Error fetching data for user latenightcryout: no such page
Collected 100 friends so far...
              User                                             Tracks  \
0       astonbrown  [Bring Me Your Loves, Rebound, Digital Witness...   
1         liliwer7  [Fuck The Industry Pt. 2, Calling My Phone, Gl...   
2  latenightcryout  [I H3ART Y0U, Jealous, Romantic Homicide, Touc...   
3         cabnfver  [Dionysus, IDOL, Maneater, Mad World, PUMPED U...   
4  no_eyes_no_ears  [6 Five Heartbeats (feat. Vince Staples), Free...   

                                             Artists  \
0  [St. Vincent, Jennifer Lopez, St. Vincent, Ari...   
1  [YoungBoy Never Broke Again, Lil Tjay, 6lack, ...   
2  [BOY FANTASY, Eyedress, d4vd, Cigarettes After...   
3  [BTS, BTS, Nelly Furtado, Tears for Fears, 3TE...   
4  [The Alchemist, IceWear Vezzo, Gogetter, Veeze...   

                                        

In [5]:
records = []
for i, row in df_top_tracks.iterrows():
    user = row['User']
    for track, artist, playcount in zip(row['Tracks'], row['Artists'], row['Playcounts']):
        track_artist = f"{track} - {artist}"
        records.append((user, track_artist, playcount))

df_flat = pd.DataFrame(records, columns=['User', 'Track_Artist', 'Playcount'])

dataset = Dataset()
dataset.fit(users=df_flat['User'].unique(),
            items=df_flat['Track_Artist'].unique())

(interactions, weights) = dataset.build_interactions([(x['User'], x['Track_Artist'], float(x['Playcount'])) for index, x in df_flat.iterrows()])

In [5]:

model = LightFM(no_components=10, loss='warp')

model.fit(interactions, sample_weight=weights, epochs=5)


: 

In [1]:
def recommend(user_id, model, data, interactions, n_items=10):
    user_index = data.mapping()[0][user_id]
    
    scores = model.predict(user_index, np.arange(interactions.shape[1]))
    
    item_indices = np.argsort(-scores)[:n_items]  
    
    # Convert item indices back to item IDs
    item_ids = [list(data.mapping()[2].keys())[i] for i in item_indices]
    
    return item_ids

user_id = 'Bans77'  
recommended_tracks = recommend(user_id, model, dataset, interactions, n_items=10)
print(f"Recommended tracks for user {user_id}: {recommended_tracks}")


NameError: name 'model' is not defined

## K Means Clustering 

In [9]:
import pandas as pd
from sklearn.cluster import KMeans
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

pivot_table = df_flat.pivot_table(index='User', columns='Track_Artist', values='Playcount', fill_value=0)
user_item_matrix = csr_matrix(pivot_table.values)  

user_item_matrix_normalized = normalize(user_item_matrix, axis=1)

k = 5  
kmeans = KMeans(n_clusters=k, random_state=42)
user_clusters = kmeans.fit_predict(user_item_matrix_normalized)

pivot_table['Cluster'] = user_clusters

def recommend_songs_for_user(user_id, pivot_table):
    user_cluster = pivot_table.loc[user_id, 'Cluster']
    cluster_table = pivot_table[pivot_table['Cluster'] == user_cluster]
    
    song_popularity = cluster_table.drop('Cluster', axis=1).mean().sort_values(ascending=False)
    top_recommendations = song_popularity.head(10).index.tolist()
    return top_recommendations


recommendations = recommend_songs_for_user('astonbrown', pivot_table)
print(recommendations)


['MANIAC - Stray Kids', 'Charmer - Stray Kids', 'VENOM - Stray Kids', 'New Tank - Playboi Carti', 'FREEZE - Stray Kids', 'Lonely St. - Stray Kids', 'Muddy Water (Changbin, Hyunjin, HAN, Felix) - Stray Kids', "[EN-TER key] ENHYPEN's Imaginarium - ENHYPEN (엔하이픈) (ENG/JPN) - ENHYPEN", 'Waiting For Us (Bang Chan, Lee Know, Seungmin, I.N) - Stray Kids', 'Tinnitus (Wanna Be a Rock) - TOMORROW X TOGETHER']




In [None]:
# To check user's original songs
df_flat[df_flat['User'] == 'astonbrown'][['Track_Artist', 'Playcount']]
