In [1]:
# Part 1: Load, Explore, and Prepare the Data
import pandas as pd

print("Step 1: Loading and preparing data...")

column_names = ['user_id', 'song_id', 'play_count']
df = pd.read_csv('train_triplets.txt', 
                 sep='\t', 
                 header=None, 
                 names=column_names, 
                 nrows=1000000)

# Filter for popular songs and active users
song_play_counts = df.groupby('song_id')['play_count'].count()
popular_songs = song_play_counts[song_play_counts >= 50].index
df_popular = df[df['song_id'].isin(popular_songs)]

user_play_counts = df_popular.groupby('user_id')['play_count'].count()
active_users = user_play_counts[user_play_counts >= 20].index
df_final = df_popular[df_popular['user_id'].isin(active_users)]

print("Data preparation complete. Final data shape:", df_final.shape)
print("-" * 40)


# Part 2: Build the Recommendation Model
from surprise import Reader, Dataset, SVD

print("Step 2: Training the recommendation model...")
print("This might take a minute...")

# Load the data into the Surprise library format
reader = Reader(rating_scale=(df_final['play_count'].min(), df_final['play_count'].max()))
data = Dataset.load_from_df(df_final[['user_id', 'song_id', 'play_count']], reader)

# Build a training set from the entire dataset
trainset = data.build_full_trainset()

# Create and train the SVD model
model = SVD()
model.fit(trainset)

print("\nModel training complete! ðŸŽ‰")

Step 1: Loading and preparing data...
Data preparation complete. Final data shape: (210323, 3)
----------------------------------------
Step 2: Training the recommendation model...
This might take a minute...

Model training complete! ðŸŽ‰


In [2]:
# --- Pick a user to recommend for (we'll just take the first user in our dataset) ---
target_user_id = df_final['user_id'].iloc[0]
print(f"Getting recommendations for user: {target_user_id}\n")

# --- Get a list of all unique song IDs in the dataset ---
all_song_ids = df_final['song_id'].unique()

# --- Get the list of songs the user has already listened to ---
listened_song_ids = df_final[df_final['user_id'] == target_user_id]['song_id'].tolist()

# --- Create a list of songs the user has NOT listened to ---
unheard_songs = [song_id for song_id in all_song_ids if song_id not in listened_song_ids]

# --- Predict scores for all the unheard songs ---
print("Predicting scores for unheard songs...")
predictions = []
for song_id in unheard_songs:
    # The model.predict() method returns a prediction object.
    # The predicted rating is in the 'est' attribute (for estimate).
    predicted_score = model.predict(uid=target_user_id, iid=song_id).est
    predictions.append((song_id, predicted_score))

# --- Sort the predictions and get the top 10 ---
# We'll put the predictions into a pandas DataFrame for easy sorting
recommendations_df = pd.DataFrame(predictions, columns=['song_id', 'predicted_score'])
top_10_recommendations = recommendations_df.sort_values(by='predicted_score', ascending=False).head(10)

# --- Display the results ---
print("\n------------------------------------")
print("Top 10 song recommendations:")
print(top_10_recommendations)

Getting recommendations for user: b80344d063b5ccb3212f76538f3d9e43d87dca9e

Predicting scores for unheard songs...

------------------------------------
Top 10 song recommendations:
                 song_id  predicted_score
0     SOAUWYT12A81C206F1              436
1930  SOCGFSH12A81C238C6              436
1910  SOYCGZK12AB018B261              436
1911  SOYLYFL12A8AE45FA4              436
1912  SOZWQWL12A58A7EA09              436
1913  SOBJMPB12A6701F785              436
1914  SOBNTFK12A6701F1CF              436
1915  SOGUKDL12A6D4FAAB9              436
1916  SOHYRWW12A6D4F7A41              436
1917  SOLQRUG12A8AE45DE1              436


In [3]:
from surprise.model_selection import train_test_split
from surprise import accuracy

# Split the data into a training set and a testing set (80% / 20%)
trainset, testset = train_test_split(data, test_size=0.20)

# Create and train the SVD model on the training set
model = SVD()
model.fit(trainset)

# Make predictions on the test set
predictions = model.test(testset)

# Calculate and print the RMSE score
rmse = accuracy.rmse(predictions)
print(f"\nThe RMSE score for our model is: {rmse:.4f}")

RMSE: 432.9985

The RMSE score for our model is: 432.9985


In [4]:
import pickle

# --- Save the Model to a File ---
model_filename = 'music_recommender_model.pkl'
print(f"Saving model to {model_filename}...")
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)
print("Model saved successfully!")


# --- Load the Model From the File ---
print("\nLoading model back from file...")
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)
print("Model loaded successfully!")


# --- Test the Loaded Model ---
# Let's test it by predicting a score for a user and song
# (We'll just use the IDs from our test in the previous step)
test_user = df_final['user_id'].iloc[0]
test_song = df_final['song_id'].iloc[0]

prediction = loaded_model.predict(uid=test_user, iid=test_song)
print(f"\nTest prediction with loaded model: {prediction.est:.4f}")

Saving model to music_recommender_model.pkl...
Model saved successfully!

Loading model back from file...
Model loaded successfully!

Test prediction with loaded model: 436.0000
