In [6]:
import pandas as pd
import sqlite3
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from item_similarity_recommender import item_similarity_recommender_py
from sklearn.model_selection import train_test_split


In [7]:
conn = sqlite3.connect('data/track_metadata.db')

song_metadata_query = "SELECT track_id as song_id, title, artist_name FROM songs"
song_metadata = pd.read_sql_query(song_metadata_query, conn)

data_path = "data/train_triplets.txt"
df = pd.read_csv(data_path, delimiter='\t', header=None, names=['user_id', 'song_id', 'listen_count'])

print("Song metadata sample:")
print(song_metadata.head())

print("Dataset sample:")
print(df.head())


conn.close()

Song metadata sample:
              song_id              title       artist_name
0  TRMMMYQ128F932D901       Silent Night  Faster Pussy cat
1  TRMMMKD128F425225D        Tanssi vaan  Karkkiautomaatti
2  TRMMMRX128F93187D9  No One Could Ever    Hudson Mohawke
3  TRMMMCH128F425532C      Si Vos Querés       Yerba Brava
4  TRMMMWA128F426B589   Tangle Of Aspens        Der Mystic
Dataset sample:
                                    user_id             song_id  listen_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9             1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22             1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494             1


In [8]:
# Create the recommender system instance
is_model = item_similarity_recommender_py()
is_model.create(df, 'user_id', 'song_id')
is_model.set_song_metadata(song_metadata)

In [9]:
from sklearn.model_selection import train_test_split

# Assuming df is your DataFrame containing user-song interactions
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

# Create the recommender system using training data
is_model.create(train_data, 'user_id', 'song_id')

# Predict and calculate accuracy
def calculate_accuracy(test_data):
    correct_predictions = 0
    total_predictions = 0
    
    for user_id in test_data['user_id'].unique():
        user_test_data = test_data[test_data['user_id'] == user_id]
        user_items = user_test_data['song_id'].tolist()
        recommendations = is_model.recommend(user_id, top_n=10)
        
        for song_id in user_items:
            if song_id in recommendations:
                correct_predictions += 1
            total_predictions += 1
    
    accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
    return accuracy

accuracy = calculate_accuracy(test_data)
print(f"Recommendation System Accuracy: {accuracy * 100:.2f}%")


TypeError: item_similarity_recommender_py.recommend() got an unexpected keyword argument 'top_n'

In [None]:
# Plot the distribution of listen counts
plt.figure(figsize=(10, 6))
sns.histplot(df['listen_count'], bins=50, kde=True)
plt.title('Distribution of Listen Counts')
plt.xlabel('Listen Count')
plt.ylabel('Frequency')
plt.show()

# Plot the top 10 most popular songs
top_songs = df['song_id'].value_counts().head(10)
top_songs_metadata = song_metadata[song_metadata['song_id'].isin(top_songs.index)]

plt.figure(figsize=(12, 8))
sns.barplot(x=top_songs_metadata['title'], y=top_songs.values)
plt.xticks(rotation=90)
plt.title('Top 10 Most Popular Songs')
plt.xlabel('Song Title')
plt.ylabel('Number of Listens')
plt.show()