In [4]:
!pip install --no-cache-dir --force-reinstall numpy==1.23.5 scipy==1.9.3
!pip install scikit-surprise

Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting scipy==1.9.3
  Downloading scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.4/58.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m190.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (33.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.4/33.4 MB[0m [31m186.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy, scipy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled 



In [1]:
import pandas as pd
import surprise

In [3]:
# Load the listening log. The first CSV column is a saved row index (read
# naively it appears as a data column named "Unnamed: 0"), so use it as the
# DataFrame index instead of carrying it around as a meaningless feature.
df = pd.read_csv('songs_data.csv', index_col=0)
# Preview the first rows to check the columns.
df.head()

Unnamed: 0,song_id,artist_id,song_genre,user_id,n_listen,publish_year
0,537,368,4,2066,13,2002
1,921,107,1,1179,5,2006
2,352,188,1,1468,11,2013
3,853,370,4,460,9,2020
4,479,408,2,1125,3,2020


In [4]:
# Observed (min, max) of the listen counts; this range is what the rating
# scale passed to the Surprise reader has to cover.
(min(df['n_listen']), max(df['n_listen'])) if False else (df['n_listen'].min(), df['n_listen'].max())

(0, 15)

In [11]:
from surprise.dataset import Dataset
from surprise.reader import Reader
from surprise.model_selection import train_test_split

In [7]:
# The listen count plays the role of the "rating"; the scale is set to the
# (min, max) range observed for n_listen.
rating_bounds = (0, 15)
reader = Reader(rating_scale=rating_bounds)

In [8]:
# Surprise reads the frame positionally as (user, item, rating), so select the
# three columns in exactly that order.
rating_columns = ['user_id', 'song_id', 'n_listen']
dataset = Dataset.load_from_df(df=df[rating_columns], reader=reader)

In [9]:
# Display the object returned by Dataset.load_from_df (sanity check of its type).
dataset

<surprise.dataset.DatasetAutoFolds at 0x78b12c47d810>

--------

1. Full Trainset

In [12]:
# Build a trainset from the whole dataset, with no ratings held out.
trainset = dataset.build_full_trainset()

In [13]:
# Display the trainset object (sanity check of its type).
trainset

<surprise.trainset.Trainset at 0x78b11c0e8e50>

In [14]:
# Size summary of the current trainset as (items, users, ratings).
tuple(getattr(trainset, size_attr) for size_attr in ('n_items', 'n_users', 'n_ratings'))

(1000, 3000, 460400)

--------

2. Train-Test Split

In [17]:
# Hold out 25% of the ratings for evaluation; the fixed seed keeps the split
# reproducible. Note: this rebinds `trainset`, which until now referred to the
# full trainset - later cells see the 75% training portion.
split_result = train_test_split(data=dataset, test_size=0.25, random_state=123)
trainset, testset = split_result

In [18]:
# (items, users, ratings) of the training portion produced by the split.
tuple(getattr(trainset, size_attr) for size_attr in ('n_items', 'n_users', 'n_ratings'))

(1000, 3000, 345300)

In [21]:
# Number of held-out ratings returned by the split.
len(testset)

115100

-------------
3. Cross Validation

In [22]:
from surprise.model_selection import LeaveOneOut

In [27]:
# Leave-one-out cross-validation iterator: 5 splits with a fixed seed.
# NOTE(review): min_n_ratings=1 is presumably meant to keep only users that
# still have at least one rating left in the trainset - confirm against the
# Surprise documentation.
loo_options = {'n_splits': 5, 'random_state': 123, 'min_n_ratings': 1}
loo_cv = LeaveOneOut(**loo_options)

In [28]:
# Walk through the leave-one-out folds and report how many ratings land in the
# train and test portion of each one.
for trainset_loo, testset_loo in loo_cv.split(dataset):
    n_train_ratings = trainset_loo.n_ratings
    n_test_ratings = len(testset_loo)
    print(f"Trainset Ratings: {n_train_ratings}, Test Ratings: {n_test_ratings}")

Trainset Ratings: 457400, Test Ratings: 3000
Trainset Ratings: 457400, Test Ratings: 3000
Trainset Ratings: 457400, Test Ratings: 3000
Trainset Ratings: 457400, Test Ratings: 3000
Trainset Ratings: 457400, Test Ratings: 3000


-----------

# Modelling

In [29]:
from surprise.prediction_algorithms.knns import KNNBasic

In [30]:
# Basic k-NN collaborative filter: user-based neighbourhoods, Pearson
# similarity, up to 20 neighbours per prediction.
similarity_config = {'name': 'pearson', 'user_based': True}
knn_basic = KNNBasic(k=20, sim_options=similarity_config)

In [31]:
# Fit on `trainset`. By execution order this is the training portion of the
# train/test split, not the full trainset built earlier.
knn_basic.fit(trainset=trainset)

Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x78b107b37350>

In [32]:
# Spot-check a single prediction: arguments are (user id, item id) as they
# appear in the DataFrame. No true rating is supplied for comparison.
knn_basic.predict(110, 15)

Prediction(uid=110, iid=15, r_ui=None, est=7.004589604762891, details={'actual_k': 20, 'was_impossible': False})

In [33]:
from surprise.accuracy import mse, mae, rmse

In [40]:
# Inspect the first held-out entry; the test set is indexable and each entry
# is a tuple.
testset[0]

(219, 626, 8.0)

In [37]:
# Score every held-out entry with the fitted model.
preds = knn_basic.test(testset=testset)

In [38]:
# Inspect the first element of the predictions returned by knn_basic.test.
preds[0]

Prediction(uid=219, iid=626, r_ui=8.0, est=8.361496812426797, details={'actual_k': 20, 'was_impossible': False})

In [39]:
# Mean squared error over the test predictions.
mse(predictions=preds)

MSE: 22.2606


22.260559391980458

In [41]:
# Root mean squared error over the test predictions.
rmse(predictions=preds)

RMSE: 4.7181


4.718109726572758

In [42]:
# Mean absolute error over the test predictions.
mae(predictions=preds)

MAE:  4.0504


4.050408200778756