# Benchmarks

## Surprise on the review data: SVD

In [2]:
from surprise import Dataset, Reader, SVD, KNNWithMeans
from surprise.model_selection import cross_validate
import pandas as pd

In [17]:
# Read data/full_pruned.csv into a DataFrame and preview its first five rows.
df = pd.read_csv('data/full_pruned.csv')
df.head()

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,1942280,0,2,2022-12-31,True,7.1,5719965,13815934
1,392160,2,0,2022-12-31,True,170.9,466189,5938400
2,1273400,0,0,2022-12-31,True,10.6,102303,11145613
3,1032430,0,0,2022-12-31,True,0.3,3395651,6226031
4,1794680,0,0,2022-12-31,True,8.2,6842278,1417575


In [18]:
# Keep only the item (app_id), rating (hours) and user (user_id) columns.
# This rebinds `df`: the dropped columns are only recoverable by re-reading the CSV.
df = df[['app_id', 'hours', 'user_id']]
df.head()

Unnamed: 0,app_id,hours,user_id
0,1942280,7.1,5719965
1,392160,170.9,466189
2,1273400,10.6,102303
3,1032430,0.3,3395651
4,1794680,8.2,6842278


In [31]:
# Largest value in the hours column; it is reused as the upper end of the rating scale.
df['hours'].max()

999.9

In [32]:
# Build the Surprise dataset from a CSV of `item,rating,user` lines (header row skipped).
#
# The rating scale's upper bound is the exact maximum of df['hours']. Wrapping it
# in int() would truncate (e.g. 999.9 -> 999) and leave the largest observed
# rating above the declared scale.
# NOTE(review): the ratings are read from 'data/cleaned.csv' while the scale is
# derived from `df` (loaded from 'data/full_pruned.csv') — confirm both files
# express the rating in the same units.
hours_max = float(df['hours'].max())
reader = Reader(
    line_format='item rating user',
    sep=',',
    skip_lines=1,
    rating_scale=(0, hours_max),
)
data = Dataset.load_from_file('data/cleaned.csv', reader=reader)

# Use every rating for training; no hold-out split is made here.
full_train = data.build_full_trainset()

In [58]:
# Seed the model so that a fresh Restart & Run All reproduces the same factors,
# biases and recommendations instead of a different random initialisation each run.
svd = SVD(random_state=42)
# 5-fold RMSE cross-validation is left disabled; uncomment to run it.
# results = cross_validate(svd, data, measures=['RMSE'], cv=5, verbose=True)

In [59]:
# Train the SVD model on every rating in full_train (no hold-out set).
svd_fit = svd.fit(full_train)

In [60]:
# Inspect the fitted model's `bi` array (presumably the per-item biases — confirm against the Surprise SVD docs).
svd_fit.bi

array([-0.07727568,  0.02606336,  0.08794973, ...,  0.07376122,
       -0.41861737,  0.04727697])

In [73]:
# Top-10 SVD recommendations for one user.
#
# `user` ends up holding the RAW user id, because predict() is called with raw
# ids; 2 is the trainset's inner id for that user.
# NOTE(review): every item in the trainset is scored, including ones this user
# has already rated — confirm that is intended.
user = full_train.to_raw_uid(2)

# Score each item once, as (estimated_rating, raw_item_id).
scored = [
    (pred.est, pred.iid)
    for pred in (
        svd_fit.predict(user, full_train.to_raw_iid(inner_iid))
        for inner_iid in full_train.all_items()
    )
]

# One stable descending sort instead of re-sorting inside the loop; ties keep
# the trainset's item order, so the result matches the incremental version.
top = sorted(scored, key=lambda pair: pair[0], reverse=True)[:10]

In [74]:
# Load the games table; it is used to look up rows (title etc.) by app_id for the recommended items.
df2 = pd.read_csv('data/games.csv')

In [75]:
# Show the (estimated rating, raw item id) pairs kept in `top`, highest estimate first.
print(top)

[(1.238901071207976, '1052990'), (1.2260111306007015, '1316230'), (1.219381166122263, '698540'), (1.2163785046054, '1423600'), (1.2060602947261878, '1338580'), (1.1983151356173496, '611760'), (1.1974606131020131, '583270'), (1.180806177018449, '339350'), (1.173936397816079, '1483780'), (1.1708440497768535, '1253920')]


In [76]:
# Look up the games-table rows for the recommended items. Raw item ids are
# strings, so they are cast to int before matching against app_id.
# Rows come back in df2's own order, not in ranking order.
df2.loc[
    df2['app_id'].isin([int(raw_iid) for _score, raw_iid in top])
]

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
60,339350,Choice of Robots,2014-12-19,True,True,True,Overwhelmingly Positive,96,1834,6.99,6.99,0.0,True
145,583270,Cosmic Express,2017-03-16,True,True,True,Very Positive,94,321,9.99,9.99,0.0,True
3286,1423600,BLUE REFLECTION: Second Light,2021-11-08,True,False,False,Mostly Positive,79,678,59.99,59.99,0.0,True
3694,698540,Dungeon Warfare 2,2018-07-06,True,True,False,Very Positive,90,860,14.99,14.99,0.0,True
3823,1253920,Rogue Legacy 2,2022-04-28,True,True,True,Very Positive,90,11679,24.99,24.99,0.0,True
4356,1316230,Force of Nature 2: Ghost Keeper,2021-05-27,True,False,False,Very Positive,82,1445,15.99,15.99,0.0,True
9235,1483780,Tested on Humans: Escape Room,2021-02-24,True,False,False,Very Positive,88,291,9.99,9.99,0.0,True
9879,1338580,McPixel 3,2022-11-14,True,True,True,Very Positive,97,288,9.99,9.99,0.0,True
11041,611760,Don't Escape: 4 Days to Survive,2019-03-11,True,True,False,Overwhelmingly Positive,99,1224,14.99,14.99,0.0,True
13001,1052990,A Monster's Expedition,2020-09-10,True,True,True,Overwhelmingly Positive,95,870,19.99,19.99,0.0,True


## ItemKNN

In [33]:
# Item-based KNN with means: 'user_based': False selects item-item similarities,
# and 'pearson' is the similarity measure.
knn = KNNWithMeans(sim_options={'name':'pearson', 'user_based': False})

In [34]:
# Fit on the full trainset; this is the step that computes the Pearson similarity matrix (see the log output).
knn_fit = knn.fit(full_train)

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [54]:
# Top-10 item-KNN recommendations for one user.
#
# `user` ends up holding the RAW user id, because predict() is called with raw
# ids; 12322 is the trainset's inner id for that user.
# NOTE(review): every item in the trainset is scored, including ones this user
# has already rated — confirm that is intended.
user = full_train.to_raw_uid(12322)

# Score each item once, as (estimated_rating, raw_item_id).
scored = [
    (pred.est, pred.iid)
    for pred in (
        knn_fit.predict(user, full_train.to_raw_iid(inner_iid))
        for inner_iid in full_train.all_items()
    )
]

# One stable descending sort instead of re-sorting inside the loop; ties keep
# the trainset's item order, so the result matches the incremental version.
top = sorted(scored, key=lambda pair: pair[0], reverse=True)[:10]

In [55]:
# Show the (estimated rating, raw item id) pairs kept in `top`, highest estimate first.
print(top)

[(1.3830595135155457, '1118200'), (1.3670848470829422, '716710'), (1.3598866189544814, '725480'), (1.3454398254379205, '1495860'), (1.336945812807882, '1227890'), (1.3313182199832074, '1392650'), (1.3292865169064478, '1221250'), (1.3035339593997604, '702050'), (1.3027388242791806, '1677770'), (1.296501718582371, '1059990')]


In [56]:
# Look up the games-table rows for the recommended items. Raw item ids are
# strings, so they are cast to int before matching against app_id.
# Rows come back in df2's own order, not in ranking order.
df2.loc[
    df2['app_id'].isin([int(raw_iid) for _score, raw_iid in top])
]

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,price_original,discount,steam_deck
190,716710,東方憑依華　～ Antinomy of Common Flowers.,2018-01-05,True,False,False,Very Positive,90,1662,24.99,24.99,0.0,True
2084,1227890,Summer Memories,2020-06-14,True,False,False,Very Positive,93,9621,13.99,19.99,30.0,True
7013,1392650,BLASTRONAUT,2022-07-27,True,True,True,Very Positive,81,110,18.99,18.99,0.0,True
10061,702050,The Song of Saya,2019-08-12,True,False,False,Very Positive,92,3726,14.99,14.99,0.0,True
10212,1495860,What The Dub?!,2021-04-08,True,True,False,Very Positive,94,223,3.99,7.99,50.0,True
10498,725480,Slap City,2020-09-17,True,False,False,Overwhelmingly Positive,95,2376,19.99,19.99,0.0,True
14248,1059990,Trombone Champ,2022-09-15,True,True,False,Overwhelmingly Positive,98,5491,14.99,14.99,0.0,True
15690,1118200,People Playground,2019-07-23,True,False,False,Overwhelmingly Positive,98,158686,9.99,9.99,0.0,True
18458,1221250,NORCO,2022-03-24,True,True,False,Very Positive,93,1181,14.99,14.99,0.0,True
21018,1677770,The Case of the Golden Idol,2022-10-13,True,True,False,Overwhelmingly Positive,99,2304,17.99,17.99,0.0,True
