In [4]:
pip install lightfm



# LightFM


In [5]:
import numpy as np
import pandas as pd
from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, auc_score
from lightfm.cross_validation import random_train_test_split
import ast
from collections import deque
from google.colab import drive
drive.mount('/content/drive')
df = pd.read_csv('/content/drive/MyDrive/Googlecolab/user_songs_filtered.csv')
df['toptags'] = df['toptags'].apply(ast.literal_eval)

# Assuming `df` is your DataFrame with the structure provided.
df_copy = df

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
df['emotion1'].unique()
emotion_mapping = {
    'joy': 1,
    'positive': 2,
    'trust': 3,
    'surprise': 4,
    'negative': 5,
    'sadness': 6,
    'fear': 7,
    'disgust': 8,
    'anger':9
}
df['emotion1_encoded'] = df['emotion1'].map(emotion_mapping)
df['emotion2_encoded'] = df['emotion2'].map(emotion_mapping)
last_2_interactions = df.groupby('Username').tail(2)
train_df = df.drop(last_2_interactions.index)

In [7]:
dataset = Dataset()
df['track_id'] = df['track_name'] + ' - ' + df['artist_name']
unique_toptags = set(tag for sublist in df['toptags'].dropna() for tag in sublist)
unique_countries = set(df['country'].unique())
item_features_list = ['listeners', 'total_playcount', "profanity_density",
                      "polarity", "subjectivity", "emotion1_encoded", "emotion1_score",
                      "emotion2_encoded", "emotion2_score", "mfcc", "chroma", "rms",
                      "spectral_centroid", "zcr", "tempo"] + list(unique_toptags)

# Preparing the complete list of user features including 'country'
user_features_list = ['registered_year', "track_count", "artist_count"] + list(unique_countries)

dataset = Dataset()
dataset.fit(
    users=df['Username'].unique(),
    items=df['track_id'].unique(),
    user_features=user_features_list,
    item_features=item_features_list
)


In [8]:
item_features_data = []
for index, row in df.iterrows():
    chroma_list = np.fromstring(row.chroma.strip('[]'), sep=' ')
    chroma_avg = np.mean(chroma_list)
    mfcc_list = np.fromstring(row.mfcc.strip('[]'), sep=' ')
    mfcc_avg = np.mean(mfcc_list)
    # Preparing a dictionary for the current row/item with feature weights
    features_dict = {

        'listeners': int(row.listeners),
        'total_playcount': int(row.total_playcount),
        'profanity_density': float(row.profanity_density),
        'polarity': float(row.polarity),
        'subjectivity': float(row.subjectivity),
        'emotion1_encoded': int(row.emotion1_encoded),  # Convert to integer if it's encoded as a numeric string
        'emotion1_score': float(row.emotion1_score),
        'emotion2_encoded': int(row.emotion2_encoded),  # Convert to integer if it's encoded as a numeric string
        'emotion2_score': float(row.emotion2_score),
        'mfcc': float(mfcc_avg),
        'chroma': float(chroma_avg),
        'rms': float(row.rms),
        'spectral_centroid': float(row.spectral_centroid),
        'zcr': float(row.zcr),
        'tempo': float(row.tempo)
    }
    toptags_features = {tag: 1.0 for tag in row.toptags}
    features_dict.update(toptags_features)
    # Add the item id and its features to the list
    item_features_data.append((row.track_id, features_dict))

# Now, build the item features matrix with this data
item_features = dataset.build_item_features(item_features_data, normalize=True)


In [9]:
user_features_data = []
for index, row in df.iterrows():
    # Preparing a dictionary for the current row/item with feature weights
    features_dict = {
        row.country: 1.0,
        'registered_year': int(row.registered_year),
        'track_count': int(row.track_count),
        'artist_count': int(row.artist_count)}

    # Add the item id and its features to the list
    item_features_data.append((row.Username, features_dict))

# Now, build the item features matrix with this data
user_features = dataset.build_user_features(user_features_data, normalize=True)

(interactions, weights) = dataset.build_interactions(((row.Username, row.track_id, row.playcount) for index, row in df.iterrows()))

train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2, random_state=None)

In [11]:
model = LightFM(loss='warp')

# Train the model
model.fit(interactions, user_features=user_features, item_features=item_features, epochs=50, num_threads=4)

from lightfm.evaluation import precision_at_k, recall_at_k, auc_score, reciprocal_rank

model_filename = '/content/drive/MyDrive/Googlecolab/model_lightfm.pkl'  # Update the path to your desired Google Drive folder

import pickle
with open(model_filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [None]:
# Compute and print the precision at k
precision_at_k = precision_at_k(model, test_interactions, k=30, user_features=user_features, item_features=item_features).mean()
print(f"Precision at k: {precision_at_k}")
recall_at_k = recall_at_k(model, test_interactions, k=30, user_features=user_features, item_features=item_features).mean()
print(f"recall at k: {recall_at_k}")
auc_score = auc_score(model, test_interactions, user_features=user_features, item_features=item_features).mean()
print(f"auc score: {auc_score}")
reciprocal_rank = reciprocal_rank(model, test_interactions,  user_features=user_features, item_features=item_features).mean()
print(f"reciprocal_rank: {reciprocal_rank}")

In [12]:
# Load the model from the file
import pickle
with open('/content/drive/MyDrive/Googlecolab/model_lightfm.pkl', 'rb') as f:
    model = pickle.load(f)
def recommend(user_id, model, data, interactions, n_items=10):
    user_index = data.mapping()[0][user_id]

    scores = model.predict(user_index, np.arange(interactions.shape[1]))

    item_indices = np.argsort(-scores)[:n_items]

    # Convert item indices back to item IDs
    item_ids = [list(data.mapping()[2].keys())[i] for i in item_indices]

    return item_ids

user_id = 'emosoup'
recommended_tracks = recommend(user_id, model, dataset, test_interactions, n_items=10)
print(f"Recommended tracks for user {user_id}: {recommended_tracks}")

Recommended tracks for user emosoup: ["Ain't it Pretty - will.i.am", 'You Dont Know My Name - Alicia Keys', 'Home - Will Hanson', 'No Feelings - Remastered 2012 - Sex Pistols', 'The Chemistry Between Us - Suede', 'No Distance Left to Run - Blur', 'Return Trip - Electric Wizard', 'Pet Sematary - Ramones', 'Pilot - 50 Cent', 'Under Pressure (Ice Ice Baby) - Jedward']


In [14]:
def calculate_artist_diversity(recommended_lists):
    diversity_scores = []

    for recommended_list in recommended_lists:
        artists = [get_artist_for_song(song_id) for song_id in recommended_list]
        unique_artists = set(artists)
        diversity_score = len(unique_artists) / len(recommended_list) if recommended_list else 0
        diversity_scores.append(diversity_score)

    # Calculate the average diversity score across all recommendation lists
    average_diversity = sum(diversity_scores) / len(diversity_scores) if diversity_scores else 0
    return average_diversity

def get_artist_for_song(song_id):
    # Implement this function based on your dataset
    # It should return the artist for the given song ID
    pass

# Assume `recommendations` is a list of lists, where each inner list contains recommended song IDs for a user
average_diversity_score = calculate_artist_diversity(recommended_tracks)
print(f"Average Artist Diversity Score: {average_diversity_score}")


Average Artist Diversity Score: 0.038124816483377025


The AUC score is relatively high, suggesting the model is quite good at distinguishing between songs that a user will like and songs they won’t, when considering the entire dataset.

In the context of a recommendation system using the LightFM model and a user-song interaction matrix, a "relevant song" typically refers to a song that has some form of positive interaction recorded in the interaction matrix for a given user.

epoch: 30 k=5
Precision at k: 0.0002954209630843252
recall at k: 0.00018907612281741844
auc score: 0.7672815918922424

epopch 50, k=100
Precision at k: 0.00016875856090337038
recall at k: 0.0022899736471617073
auc score: 0.7718917727470398

learning rate 0.05, no.components 30
Precision at k: 0.00020050653256475925
recall at k: 0.002668390848592225
auc score: 0.7745291590690613