In [1]:
import pandas as pd
from surprise import Dataset


In [2]:

import pandas as pd
from surprise.model_selection import cross_validate
from surprise import Dataset, KNNBasic
from surprise import NormalPredictor, Reader


In [3]:
ratings_dict = {'itemid': ['Tuote1', 'Tuote2', 'Tuote3', 'Tuote4', 'Tuote1', 'Tuote2', 'Tuote3', 'Tuote4', 'Tuote1', 'Tuote3', 'Tuote2', 'Tuote1', 'Tuote4','Tuote1','Tuote2','Tuote3'],
           'userid': ['Maria', 'Maria', 'Maria', 'Maria', 'Timo', 'Timo', 'Timo', 'Timo', 'Arttu','Leo', 'Leo', 'Anna', 'Anna', 'Saara', 'Saara', 'Saara'],
           'rating': [1,1,5,5,5,5,2,2,5,5,1,2,4,1,1,5]}

In [4]:
df = pd.DataFrame(ratings_dict)
df

Unnamed: 0,itemid,userid,rating
0,Tuote1,Maria,1
1,Tuote2,Maria,1
2,Tuote3,Maria,5
3,Tuote4,Maria,5
4,Tuote1,Timo,5
5,Tuote2,Timo,5
6,Tuote3,Timo,2
7,Tuote4,Timo,2
8,Tuote1,Arttu,5
9,Tuote3,Leo,5


In [5]:
x = df.groupby("itemid")[['rating']].value_counts()
x

itemid  rating
Tuote1  1         2
        5         2
        2         1
Tuote2  1         3
        5         1
Tuote3  5         3
        2         1
Tuote4  2         1
        4         1
        5         1
Name: count, dtype: int64

In [6]:
mean_ratings = df.groupby('itemid')[['rating']].mean()
mean_ratings.sort_values(by = 'rating', ascending = False)

Unnamed: 0_level_0,rating
itemid,Unnamed: 1_level_1
Tuote3,4.25
Tuote4,3.666667
Tuote1,2.8
Tuote2,2.0


In [7]:
df.describe()

Unnamed: 0,rating
count,16.0
mean,3.125
std,1.857418
min,1.0
25%,1.0
50%,3.0
75%,5.0
max,5.0


In [8]:
reader = Reader(rating_scale = (1, 5))
reader.indexes

[0, 1, 2]

In [9]:
data = Dataset.load_from_df(df[['userid', 'itemid', 'rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x21452fabb20>

In [10]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size = 0.25)
print(trainset)
print(testset)

<surprise.trainset.Trainset object at 0x00000214530285B0>
[('Timo', 'Tuote2', 5.0), ('Maria', 'Tuote4', 5.0), ('Anna', 'Tuote4', 4.0), ('Leo', 'Tuote2', 1.0)]


In [11]:
trainset = data.build_full_trainset()
trainset.all_items

<bound method Trainset.all_items of <surprise.trainset.Trainset object at 0x0000021450BF39D0>>

In [12]:
sim_options = {'user_based' : True}

algo = KNNBasic(sim_options = sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x21450bf14b0>

In [13]:
user_id = 'Leo'
item_id = 'Tuote3'

predLeo = algo.predict(user_id, item_id, r_ui = 5, verbose = True)
predLeo

user: Leo        item: Tuote3     r_ui = 5.00   est = 4.93   {'actual_k': 4, 'was_impossible': False}


Prediction(uid='Leo', iid='Tuote3', r_ui=5, est=4.927710843373494, details={'actual_k': 4, 'was_impossible': False})

In [14]:
testset = algo.test(testset)
testset

[Prediction(uid='Timo', iid='Tuote2', r_ui=5.0, est=4.288581314878894, details={'actual_k': 4, 'was_impossible': False}),
 Prediction(uid='Maria', iid='Tuote4', r_ui=5.0, est=4.541176470588235, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Anna', iid='Tuote4', r_ui=4.0, est=4.142857142857143, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Leo', iid='Tuote2', r_ui=1.0, est=1.0963855421686748, details={'actual_k': 4, 'was_impossible': False})]

In [15]:
testset = trainset.build_anti_testset()
pred_antitest = algo.test(testset)
pred_antitest

[Prediction(uid='Arttu', iid='Tuote2', r_ui=3.125, est=4.578947368421052, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Arttu', iid='Tuote3', r_ui=3.125, est=2.31578947368421, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Arttu', iid='Tuote4', r_ui=3.125, est=2.3248730964467, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Leo', iid='Tuote1', r_ui=3.125, est=1.1428571428571428, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Leo', iid='Tuote4', r_ui=3.125, est=4.793103448275862, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='Anna', iid='Tuote2', r_ui=3.125, est=1.4705882352941175, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Anna', iid='Tuote3', r_ui=3.125, est=4.647058823529412, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Saara', iid='Tuote4', r_ui=3.125, est=4.55072463768116, details={'actual_k': 3, 'was_impossible': False})]

In [16]:
from collections import defaultdict

def get_top_n(predictions, n= 10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key = lambda x: x[1], reverse = True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

top_n = get_top_n(pred_antitest, n= 10)


for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

Arttu ['Tuote2', 'Tuote4', 'Tuote3']
Leo ['Tuote4', 'Tuote1']
Anna ['Tuote3', 'Tuote2']
Saara ['Tuote4']


In [17]:
cross_validate(algo, data, measures = ['MAE', 'RMSE'], cv = 10, verbose = True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE, RMSE of algorithm KNNBasic on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
MAE (testset)     1.5714  0.1209  0.3452  2.7093  1.1143  3.2128  0.0857  0.1429  1.2400  3.436

{'test_mae': array([1.57142857, 0.12087912, 0.3452381 , 2.70930233, 1.11428571,
        3.21276596, 0.08571429, 0.14285714, 1.24      , 3.43686584]),
 'test_rmse': array([2.12372411, 0.12286088, 0.37833925, 3.00103642, 1.51644052,
        3.30780933, 0.08571429, 0.14285714, 1.24      , 3.43686584]),
 'fit_time': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0),
 'test_time': (0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0)}

In [18]:
#!python -m pip install nbconvert

import nbformat
from nbconvert import HTMLExporter

# Load notebook
with open("tuote_esimerkki.ipynb") as f:
    nb = nbformat.read(f, as_version=4)

# Convert to HTML
html_exporter = HTMLExporter()
(body, resources) = html_exporter.from_notebook_node(nb)

# Save HTML
with open("tuote_esimerkki.html", "w", encoding="utf-8") as f:
    f.write(body)

ModuleNotFoundError: No module named 'nbformat'