### Setup

In [73]:
import pandas as pd
from google.colab import drive
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col, explode
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import udf
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

In [74]:
# Mount Google Drive (Colab only) so files under /content/drive can be read by later cells.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Load dataset

In [75]:
# Location of the tab-separated (user, song, play count) triplets on the mounted Drive.
file_path = "/content/drive/MyDrive/train_triplets.txt"

# The file carries no header row, so the column names are supplied explicitly.
df = pd.read_csv(
    file_path,
    delimiter='\t',
    header=None,
    names=['user_id', 'song_id', 'play_count'],
)

# Quick look at the first rows to confirm the columns were parsed as expected.
print(df.head())

                                    user_id             song_id  play_count
0  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAKIMP12A8C130995           1
1  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOAPDEY12A81C210A9           1
2  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBBMDR12A8C13253B           2
3  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFNSP12AF72A0E22           1
4  b80344d063b5ccb3212f76538f3d9e43d87dca9e  SOBFOVM12A58A7D494           1


In [76]:
# To make testing easier we only use part of the dataset.
# To use all data, remove the `.iloc[:500_000]` slice below.
# drop_duplicates returns a new frame here, so the iloc slice is never mutated in place.
df = df.iloc[:500_000].drop_duplicates(subset=['user_id', 'song_id'])

# Keep only users that have at least 5 rows.
user_counts = df['user_id'].value_counts()
df = df[df['user_id'].isin(user_counts[user_counts >= 5].index)]

# Keep only songs that have at least 5 rows.
# NOTE(review): this runs after the user filter, so removing songs can leave
# some users with fewer than 5 rows again; the filters are not re-applied.
song_counts = df['song_id'].value_counts()
df = df[df['song_id'].isin(song_counts[song_counts >= 5].index)]

print(f"Number of rows: {df.shape[0]}")

Number of rows: 358395


In [77]:
# Reuse an existing Spark session if one is running, otherwise start one.
spark = SparkSession.builder.appName("CollaborativeFiltering").getOrCreate()

# Indexers that map the string ids to numeric indices in new columns.
user_indexer = StringIndexer(inputCol="user_id", outputCol="user_index")
song_indexer = StringIndexer(inputCol="song_id", outputCol="song_index")

# Convert the filtered pandas frame to Spark, then add one index column at a time.
df_spark = spark.createDataFrame(df)

user_indexer_model = user_indexer.fit(df_spark)
df_spark = user_indexer_model.transform(df_spark)

song_indexer_model = song_indexer.fit(df_spark)
df_spark = song_indexer_model.transform(df_spark)

df_spark.show(5)

+--------------------+------------------+----------+----------+----------+
|             user_id|           song_id|play_count|user_index|song_index|
+--------------------+------------------+----------+----------+----------+
|b80344d063b5ccb32...|SOAKIMP12A8C130995|         1|    1239.0|    2248.0|
|b80344d063b5ccb32...|SOAPDEY12A81C210A9|         1|    1239.0|   13544.0|
|b80344d063b5ccb32...|SOBBMDR12A8C13253B|         2|    1239.0|    4870.0|
|b80344d063b5ccb32...|SOBFOVM12A58A7D494|         1|    1239.0|   18936.0|
|b80344d063b5ccb32...|SOBSUJE12A6D4F8CF5|         2|    1239.0|   10420.0|
+--------------------+------------------+----------+----------+----------+
only showing top 5 rows



### Recommendation with ALS

In [78]:
# Rename the indexed columns to the names the ALS estimator below is configured with.
# The guard keeps this cell re-runnable: once renamed, "user_index" no longer exists
# and a second selectExpr on it would fail.
if "user_index" in df_spark.columns:
    df_spark = df_spark.selectExpr("user_index as user", "song_index as item", "play_count as rating")

# Train ALS model
# coldStartStrategy="drop" makes transform() omit rows whose user or item was not
# seen during fitting instead of emitting NaN predictions for them.
# NOTE(review): play counts are implicit feedback; ALS(implicitPrefs=True) may be a
# better fit, but it changes what "prediction" means for the RMSE computed later — confirm.
als = ALS(
    userCol="user",
    itemCol="item",
    ratingCol="rating",
    nonnegative=True,
    rank=10,
    maxIter=10,
    regParam=0.1,
    coldStartStrategy="drop",
)
model = als.fit(df_spark)

### Test ALS recommendation

In [79]:
# Fixed seed so the 80/20 split is the same on every run instead of being re-drawn.
(training_data, test_data) = df_spark.randomSplit([0.8, 0.2], seed=42)

# Refit on the training portion only; this rebinds `model`.
model = als.fit(training_data)

predictions = model.transform(test_data)

# Rows with a null/NaN value (e.g. a missing prediction) would make the RMSE NaN, so drop them.
predictions = predictions.na.drop()

evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print(f"Root Mean Squared Error (RMSE): {rmse}")

Root Mean Squared Error (RMSE): 6.505492574894155


### Recommend with User-based Utility Matrix

In [68]:
# Hold out 20% of the interactions for evaluation (fixed seed for a repeatable split).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Work on copies so that adding the encoded columns does not write into slices of `df`.
train_df = train_df.copy()
test_df = test_df.copy()

# Fit the encoders on every id in `df`, not just the training split: an id that only
# landed in the test split would otherwise make transform() raise ValueError.
user_encoder = LabelEncoder().fit(df['user_id'])
train_df['user_id_encoded'] = user_encoder.transform(train_df['user_id'])
test_df['user_id_encoded'] = user_encoder.transform(test_df['user_id'])

song_encoder = LabelEncoder().fit(df['song_id'])
train_df['song_id_encoded'] = song_encoder.transform(train_df['song_id'])
test_df['song_id_encoded'] = song_encoder.transform(test_df['song_id'])

# Rows are users, columns are songs, values are training play counts.
# The shape is passed explicitly so that every encoded id is a valid index even
# when the highest-numbered user or song has no training row.
utility_matrix = csr_matrix(
    (train_df['play_count'], (train_df['user_id_encoded'], train_df['song_id_encoded'])),
    shape=(len(user_encoder.classes_), len(song_encoder.classes_)),
)

# Pairwise cosine similarity between user rows.
user_similarity = cosine_similarity(utility_matrix)

### Test User-based Utility-Matrix recommendations

In [69]:
def predict_ratings_for_test_set(test_df, utility_matrix, similarity_matrix, k=2):
    """Predict a play count for every test row from the k most similar users.

    For each row of ``test_df`` the ``k`` users with the highest similarity to
    the row's user (the user itself excluded) are selected. The prediction is
    the similarity-weighted average of those neighbours' entries in
    ``utility_matrix`` for the row's song. Neighbours whose entry is not
    positive are skipped; if none contributes, the prediction is 0.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must provide the columns ``user_id_encoded``, ``song_id_encoded`` and
        ``play_count``.
    utility_matrix : sparse matrix
        Indexed as ``utility_matrix[user, song]``.
    similarity_matrix : numpy.ndarray
        ``similarity_matrix[user]`` is the row of similarities to every user.
    k : int, default 2
        Number of neighbours considered per prediction.

    Returns
    -------
    tuple of (list, list)
        ``(actual_ratings, predicted_ratings)`` in the row order of ``test_df``.
    """
    actual_ratings = []
    predicted_ratings = []

    # Iterate the three columns directly: unlike iterrows() this never upcasts
    # the integer ids to float when the frame holds mixed numeric dtypes.
    rows = zip(
        test_df['user_id_encoded'],
        test_df['song_id_encoded'],
        test_df['play_count'],
    )
    for user_id, item_id, actual_rating in rows:
        user_similarities = similarity_matrix[user_id]

        # Rank all users by descending similarity, then remove the target user
        # explicitly. Relying on it being ranked first breaks on ties or when
        # its self-similarity is 0 (all-zero row).
        ranked_users = np.argsort(-user_similarities, kind="stable")
        similar_users = ranked_users[ranked_users != user_id][:k]

        numerator = 0.0
        denominator = 0.0
        for similar_user in similar_users:
            similar_user_rating = utility_matrix[similar_user, item_id]
            # Only neighbours that actually have a play count for this song contribute.
            if similar_user_rating > 0:
                weight = user_similarities[similar_user]
                numerator += weight * similar_user_rating
                denominator += abs(weight)

        # Fall back to 0 when no selected neighbour has a play count for the song.
        predicted_rating = numerator / denominator if denominator > 0 else 0.0

        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)

    return actual_ratings, predicted_ratings

# Get actual and predicted ratings for the test set (k=2 neighbours per prediction)
actual_ratings, predicted_ratings = predict_ratings_for_test_set(
    test_df, utility_matrix, user_similarity, k=2
)

# Calculate RMSE between the held-out play counts and the predictions
rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 6.941470333324735


### Recommend with Item-based Utility Matrix

In [71]:
# Hold out 20% of the interactions for evaluation (fixed seed for a repeatable split).
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
# Work on copies so that adding the encoded columns does not write into slices of `df`.
train_df = train_df.copy()
test_df = test_df.copy()

# Fit the encoders on every id in `df`, not just the training split: an id that only
# landed in the test split would otherwise make transform() raise ValueError.
user_encoder = LabelEncoder().fit(df['user_id'])
train_df['user_id_encoded'] = user_encoder.transform(train_df['user_id'])
test_df['user_id_encoded'] = user_encoder.transform(test_df['user_id'])

song_encoder = LabelEncoder().fit(df['song_id'])
train_df['song_id_encoded'] = song_encoder.transform(train_df['song_id'])
test_df['song_id_encoded'] = song_encoder.transform(test_df['song_id'])

# Rows are users, columns are songs, values are training play counts.
# The shape is passed explicitly so that every encoded id is a valid index even
# when the highest-numbered user or song has no training row.
utility_matrix = csr_matrix(
    (train_df['play_count'], (train_df['user_id_encoded'], train_df['song_id_encoded'])),
    shape=(len(user_encoder.classes_), len(song_encoder.classes_)),
)

# Transpose so that rows are songs and columns are users.
utility_matrix_item_item = utility_matrix.T

# Pairwise cosine similarity between song rows.
item_similarity = cosine_similarity(utility_matrix_item_item)

### Test Item-based Utility-Matrix recommendations

In [72]:
def predict_ratings_for_test_set_item_item(test_df, utility_matrix, similarity_matrix, k=2):
    """Predict a play count for every test row from the k most similar songs.

    For each row of ``test_df`` the ``k`` songs with the highest similarity to
    the row's song (the song itself excluded) are selected. The prediction is
    the similarity-weighted average of the row's user's entries in
    ``utility_matrix`` for those songs. Songs whose entry is not positive are
    skipped; if none contributes, the prediction is 0.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Must provide the columns ``user_id_encoded``, ``song_id_encoded`` and
        ``play_count``.
    utility_matrix : sparse matrix
        Indexed as ``utility_matrix[song, user]`` (i.e. songs are rows).
    similarity_matrix : numpy.ndarray
        ``similarity_matrix[song]`` is the row of similarities to every song.
    k : int, default 2
        Number of neighbouring songs considered per prediction.

    Returns
    -------
    tuple of (list, list)
        ``(actual_ratings, predicted_ratings)`` in the row order of ``test_df``.
    """
    actual_ratings = []
    predicted_ratings = []

    # Iterate the three columns directly: unlike iterrows() this never upcasts
    # the integer ids to float when the frame holds mixed numeric dtypes.
    rows = zip(
        test_df['user_id_encoded'],
        test_df['song_id_encoded'],
        test_df['play_count'],
    )
    for user_id, item_id, actual_rating in rows:
        item_similarities = similarity_matrix[item_id]

        # Rank all songs by descending similarity, then remove the target song
        # explicitly. Relying on it being ranked first breaks on ties or when
        # its self-similarity is 0 (all-zero row).
        ranked_items = np.argsort(-item_similarities, kind="stable")
        similar_items = ranked_items[ranked_items != item_id][:k]

        numerator = 0.0
        denominator = 0.0
        for similar_item in similar_items:
            similar_item_rating = utility_matrix[similar_item, user_id]
            # Only neighbouring songs this user actually has a play count for contribute.
            if similar_item_rating > 0:
                weight = item_similarities[similar_item]
                numerator += weight * similar_item_rating
                denominator += abs(weight)

        # Fall back to 0 when the user has no play count for any selected song.
        predicted_rating = numerator / denominator if denominator > 0 else 0.0

        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)

    return actual_ratings, predicted_ratings


# Song-by-user view of the utility matrix (rows are songs), as expected by the item-based predictor.
utility_matrix_item_item = utility_matrix.T

# Actual and predicted play counts for the test set (k=2 neighbouring songs per prediction)
actual_ratings, predicted_ratings = predict_ratings_for_test_set_item_item(
    test_df, utility_matrix_item_item, item_similarity, k=2
)

# RMSE between the held-out play counts and the predictions
rmse = sqrt(mean_squared_error(actual_ratings, predicted_ratings))
print(f"\nRoot Mean Squared Error (RMSE): {rmse}")


Root Mean Squared Error (RMSE): 6.8412624505020565
