In [14]:
import pandas as pd
import sqlite3
from sklearn.model_selection import train_test_split
from recommender import popularity_recommender_py, item_similarity_recommender_py

In [15]:
# Load a smaller chunk of the triplet data
triplets_df_chunk = pd.read_csv('C:\\Users\\HARRY\\Desktop\\Music Recomendation\\data\\train_triplets.txt', sep='\t', names=['user_id', 'song_id', 'listen_count'], nrows=500000)

# Load the metadata
metadata_df = pd.read_sql_query('SELECT song_id, title, release, artist_name FROM songs', 'sqlite:///C:\\Users\\HARRY\\Desktop\\Music Recomendation\\data\\track_metadata.db')

print(triplets_df_chunk.head())
print(metadata_df.head())


                                    user_id             song_id  listen_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9             1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22             1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494             1
              song_id              title  \
0  SOQMMHC12AB0180CB8       Silent Night   
1  SOVFVAK12A8C1350D9        Tanssi vaan   
2  SOGTUKN12AB017F4F1  No One Could Ever   
3  SOBNYVR12A8C13558C      Si Vos Querés   
4  SOHSBXH12A8C13B0DF   Tangle Of Aspens   

                                release       artist_name  
0                 Monster Ballads X-Mas  Faster Pussy cat  
1                           Karkuteillä  Karkkiautomaatti  
2                                Butter    Hudson Mohawke  
3                          

In [16]:
# Merge the dataframes on 'song_id'
combined_df_chunk = pd.merge(triplets_df_chunk, metadata_df, on='song_id', how='inner')

# Drop any duplicate columns if necessary
combined_df_chunk = combined_df_chunk.loc[:, ~combined_df_chunk.columns.duplicated()]

# Display the combined dataframe
print(combined_df_chunk.head())


                                    user_id             song_id  listen_count  \
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995             1   
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9             1   
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B             2   
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22             1   
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494             1   

                             title  \
0                         The Cove   
1             Nothing from Nothing   
2                  Entre Dos Aguas   
3            Under Cold Blue Stars   
4  Riot Radio (Soundtrack Version)   

                                             release    artist_name  
0                                 Thicker Than Water   Jack Johnson  
1                                         To Die For  Billy Preston  
2                                Flamenco Para Niños  Paco De Lucia  
3   

In [17]:
# Create a combined song_artist column
combined_df_chunk['song_artist'] = combined_df_chunk['title'] + ' - ' + combined_df_chunk['artist_name']

# Select the first 10,000 songs
subset_df_chunk = combined_df_chunk.head(10000)

# Group by 'song_artist' and calculate the total listen count for each song
grouped_df_chunk = subset_df_chunk.groupby('song_artist')['listen_count'].sum().reset_index()

# Calculate the sum of all listen counts
total_listens_chunk = grouped_df_chunk['listen_count'].sum()

# Add a new column 'percentage' that calculates the percentage of total listens
grouped_df_chunk['percentage'] = (grouped_df_chunk['listen_count'] / total_listens_chunk) * 100

# Sort the songs in ascending order of popularity (percentage)
sorted_df_chunk = grouped_df_chunk.sort_values(by='percentage', ascending=True)

# Display the sorted dataframe
print(sorted_df_chunk.head())


                                       song_artist  listen_count  percentage
28  16th St. Dozens (Album Version) - Two Gallants             1     0.00395
29                              19-2000 - Gorillaz             1     0.00395
30                                  1958 - Skalpel             1     0.00395
31                               1975 - Gene Clark             1     0.00395
35                                     1995 - Luna             1     0.00395


In [18]:
num_users = combined_df_chunk['user_id'].nunique()
num_songs = combined_df_chunk['song_id'].nunique()

print(f"Number of unique users: {num_users}")
print(f"Number of unique songs: {num_songs}")


Number of unique users: 10354
Number of unique songs: 108797


In [19]:
train_data, test_data = train_test_split(combined_df_chunk, test_size=0.2, random_state=42)

print(f"Training data shape: {train_data.shape}")
print(f"Testing data shape: {test_data.shape}")


Training data shape: (410288, 7)
Testing data shape: (102573, 7)


In [20]:
# Create an instance of popularity-based recommender class
popularity_model = popularity_recommender_py()
popularity_model.create(train_data, 'user_id', 'song_id')


In [21]:
# Make recommendations for a user
user_id = 'some_user_id'  # Replace 'some_user_id' with an actual user id from your dataset
recommendations = popularity_model.recommend(user_id)
print(recommendations)


            user_id             song_id  score  Rank
22689  some_user_id  SOFRQTD12A81C233C0    933   1.0
83992  some_user_id  SOWCKVR12A8C142411    835   2.0
3551   some_user_id  SOAXGDH12A8C13F8A1    748   3.0
3195   some_user_id  SOAUWYT12A81C206F1    733   4.0
6162   some_user_id  SOBONKR12A58A7A7E0    648   5.0
54400  some_user_id  SONYKOW12AB01849C9    637   6.0
72625  some_user_id  SOSXLTC12AF72A7F54    624   7.0
13091  some_user_id  SODGVGW12AC9075A8D    572   8.0
61721  some_user_id  SOPXKYD12A6D4FA876    553   9.0
16992  some_user_id  SOEGIYH12A6D4FC0E3    549  10.0


In [22]:
# Create an instance of item similarity-based recommender class
item_similarity_model = item_similarity_recommender_py()
item_similarity_model.create(train_data, 'user_id', 'song_id')


In [None]:
# Make item similarity-based recommendations for a user
item_recommendations = item_similarity_model.recommend(user_id)
print(item_recommendations)


No. of unique songs for the user: 0
no. of unique songs in the training set: 97655
