In [1]:
import pandas as pd
from RecData import RecData
import numpy as np
import random

In [2]:
# recs = pd.read_csv('data/recommendations.csv')
# recs = pd.read_csv('data/pruned.csv')
# recs = pd.read_csv('data/2plus.csv')
recs = pd.read_csv('data/full_pruned.csv')

# recs = recs[:5]


recs = recs.sort_values(by='date')
recs = recs.drop_duplicates(subset=['user_id', 'app_id'], keep='last')

USED_COLS = ['app_id', 'is_recommended', 'user_id']
recs = recs[USED_COLS]

item_data = pd.read_csv('data/games.csv')
titles = item_data[['app_id', 'title']]

print("Shape:", recs.shape)
recs.sort_values(by=['user_id', 'app_id']).head()


Shape: (1482464, 3)


Unnamed: 0,app_id,is_recommended,user_id
420790,12210,True,240
675882,22380,True,240
246231,239140,True,240
539676,251570,True,240
521289,270880,True,240


In [3]:
random.seed(42)
np.random.seed(42)
rec_data = RecData()
rec_data.create_from_dataframe(recs)
rec_data.set_titles(titles)

# del recs

print("Creating splits...")
train_data, val = rec_data.leave_k_out_split(k=1, create_val=False)
print("Done creating splits.")

Creating utility matrix...
Done utility matrix.
Creating splits...
Done user 1 / 63175
Done user 10001 / 63175
Done user 20001 / 63175
Done user 30001 / 63175
Done user 40001 / 63175
Done user 50001 / 63175
Done user 60001 / 63175
Done creating splits.


In [4]:
meta_data = pd.read_json('data/games_metadata.json', lines=True)
meta_data.head()

Unnamed: 0,app_id,description,tags
0,10090,"Call of Duty is back, redefining war like you'...","[Zombies, World War II, FPS, Multiplayer, Acti..."
1,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
2,22364,,[Action]
3,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
4,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."


In [5]:
meta_data = meta_data[meta_data['app_id'].isin(recs['app_id'])]
meta_data.shape

(2215, 3)

In [6]:
meta_data['app_id'] = meta_data['app_id'].apply(lambda id: train_data.item_id_to_index(id))
meta_data.head()

Unnamed: 0,app_id,description,tags
0,34,"Call of Duty is back, redefining war like you'...","[Zombies, World War II, FPS, Multiplayer, Acti..."
1,98,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
3,173,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
4,327,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
6,229,"“METAL SLUG 3”, the masterpiece in SNK’s emble...","[Arcade, Classic, Action, Co-op, Side Scroller..."


In [7]:
meta_data = meta_data.sort_values(by='app_id')
meta_data.head()

Unnamed: 0,app_id,description,tags
267,0,The sequel to the million-plus selling Dead Ri...,"[Zombies, Action, Open World, Co-op, Adventure..."
9698,1,"Create, discover, and download new player-crea...","[Turn-Based Strategy, Strategy, Turn-Based, Mu..."
11955,2,Fight in the theatre of war that changed the w...,"[World War II, Action, FPS, Realistic, Multipl..."
16484,3,Ride your music. Audiosurf is a music-adapting...,"[Music, Rhythm, Indie, Casual, Music-Based Pro..."
10445,4,Counter-Strike: Source blends Counter-Strike's...,"[Shooter, Action, FPS, Multiplayer, Team-Based..."


In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
mlb = MultiLabelBinarizer(sparse_output=True)
meta_tags = meta_data.join(pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(meta_data.pop('tags')),
    index=meta_data.index,
    columns=mlb.classes_
))
meta_tags.head()

Unnamed: 0,app_id,description,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
267,0,The sequel to the million-plus selling Dead Ri...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9698,1,"Create, discover, and download new player-crea...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11955,2,Fight in the theatre of war that changed the w...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
16484,3,Ride your music. Audiosurf is a music-adapting...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10445,4,Counter-Strike: Source blends Counter-Strike's...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [10]:
genre_data = meta_tags.drop(columns=['description'])
genre_data.head()

Unnamed: 0,app_id,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
9698,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
11955,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
16484,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10445,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [11]:
desc_data = meta_tags[['app_id', 'description']]
desc_data.head()

Unnamed: 0,app_id,description
267,0,The sequel to the million-plus selling Dead Ri...
9698,1,"Create, discover, and download new player-crea..."
11955,2,Fight in the theatre of war that changed the w...
16484,3,Ride your music. Audiosurf is a music-adapting...
10445,4,Counter-Strike: Source blends Counter-Strike's...


In [12]:
tf = TfidfVectorizer()
X = tf.fit_transform(desc_data['description'])

In [13]:
desc_feats = desc_data.join(pd.DataFrame.sparse.from_spmatrix(X, index=desc_data.index))
desc_feats = desc_feats.drop(columns=['description'])
desc_feats.head()

Unnamed: 0,app_id,0,1,2,3,4,5,6,7,8,...,10057,10058,10059,10060,10061,10062,10063,10064,10065,10066
267,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9698,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11955,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16484,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10445,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
desc_sims = cosine_similarity(desc_feats.drop(columns=['app_id']), dense_output=True)
genre_sims = cosine_similarity(genre_data.drop(columns=['app_id']), dense_output=True)

In [15]:
from KNN import ContentKNN
from pprint import pprint

In [16]:
genre_knn = ContentKNN(k=40)
genre_knn.fit(genre_data.drop(columns=['app_id']))
desc_knn = ContentKNN(k=40)
desc_knn.fit(desc_feats.drop(columns=['app_id']))

In [17]:
train_data.search_title('dave')

[('DAVE THE DIVER', 2147)]

In [18]:
prefs = train_data.create_prefs([(810, 1), (0, 0), (618, 1), (642, 1), (39, 0), (1397, 0), (1292, 0)])
top = genre_knn.top_n(13, 10, prefs=prefs)
pprint(top)
pprint([train_data.index_to_title(i) for r, i in top])

[(1.0054124852471191, 988),
 (0.9529622433462526, 767),
 (0.9527062426235595, 1033),
 (0.9418403105430628, 1422),
 (0.9400045772443012, 1123),
 (0.9199155640655556, 1032),
 (0.9027514482609086, 1086),
 (0.9027062426235595, 731),
 (0.9027062426235595, 1532),
 (0.9027062426235595, 1746)]
['The Golf Club™ 2019 featuring PGA TOUR',
 'Capitalism 2',
 'Project Hospital',
 'Blackjack Championship',
 'Star Chef: Cooking & Restaurant Game',
 'Virtual Villagers Origins 2',
 'Wizard And Minion Idle',
 'Production Line : Car factory simulation',
 'Nebuchadnezzar',
 'Lords and Villeins']


In [19]:
train_data.search_title('runescape')

[('Old School RuneScape', 1537), ('RuneScape ®', 1454)]

In [20]:
prefs = train_data.create_prefs([(810, 1), (0, 0), (145, 0), (1326, 1), (285, 0), (1032, 0), (26, 0), (2122, 0)])
top = desc_knn.top_n(13, 10, prefs=prefs)
pprint(top)
pprint([train_data.index_to_title(i) for r, i in top])

[(0.27560074008154467, 1247),
 (0.27556227207046396, 2139),
 (0.25597898292080856, 1121),
 (0.2511147809961585, 628),
 (0.2264987806712772, 1149),
 (0.2230634377847922, 2127),
 (0.20460057680308444, 1089),
 (0.19511358085395164, 1202),
 (0.19273862949051448, 308),
 (0.1922027705200521, 595)]
['Unity of Command II',
 'Cosmoteer: Starship Architect & Commander',
 'SUPER DRAGON BALL HEROES WORLD MISSION',
 'Shadowverse CCG',
 'Iron Marines',
 'Right and Down',
 "Conqueror's Blade",
 'Fantasy General II',
 'TRON 2.0',
 'Crossout']


## Ensemble KNN

In [21]:
from KNN import EnsembleKNN, ItemKNN
import pickle

In [22]:
import numba as nb
from numba import jit
from KNN import ItemKNN

model_dir = "saved_models/knn/sim1.pkl" 

with open(model_dir, 'rb') as file:
    train_data, sims, means = pickle.load(file)

@jit
def make_dict(items):
    return {k: v for k,v in items}

means_prime = make_dict(tuple(means.items()))
knn = ItemKNN(k=40, mean_centered=True, iuf=True)
knn._sims = sims
knn._item_means = means_prime
knn._num_users, knn._num_items = train_data.get_matrix().shape
knn._M = train_data.get_matrix()
knn._store_rating_pairs(knn._M)
knn._store_item_means(knn._M)


  @jit
[1m
File "..\..\..\AppData\Local\Temp\ipykernel_23028\3095054923.py", line 10:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected. This is deprecated behaviour that will be removed in Numba 0.59.0.

For more information visit https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "..\..\..\AppData\Local\Temp\ipykernel_23028\3095054923.py", line 10:[0m
[1m<source missing, REPL/exec in use?>[0m
[0m


Storing ratings in dictionary...
Done storing in dictionary.
Computing item means...
Done item 1 / 2215
Done item 101 / 2215
Done item 201 / 2215
Done item 301 / 2215
Done item 401 / 2215
Done item 501 / 2215
Done item 601 / 2215
Done item 701 / 2215
Done item 801 / 2215
Done item 901 / 2215
Done item 1001 / 2215
Done item 1101 / 2215
Done item 1201 / 2215
Done item 1301 / 2215
Done item 1401 / 2215
Done item 1501 / 2215
Done item 1601 / 2215
Done item 1701 / 2215
Done item 1801 / 2215
Done item 1901 / 2215
Done item 2001 / 2215
Done item 2101 / 2215
Done item 2201 / 2215
Done computing item means.


In [23]:
train_data.search_title('disn')

[('Disney•Pixar Cars 2: The Video Game', 1350),
 ('Disney•Pixar Brave: The Video Game', 2073),
 ('Disney G-Force', 1634),
 ('Disney Pirates of the Caribbean: At Worlds End', 1622),
 ("Disney•Pixar Cars Toon: Mater's Tall Tales", 2212),
 ('Disney Princess: My Fairytale Adventure', 2014),
 ('Disney Dreamlight Valley', 107),
 ('Disney•Pixar Cars: Radiator Springs Adventures', 2114),
 ('Disney Universe', 2077),
 ('Disney Tangled', 1051),
 ('Disneyland Adventures', 1918),
 ('Disney Alice in Wonderland', 1253),
 ("Disney's Chicken Little", 1960)]

In [24]:
prefs = train_data.create_prefs([(278, 1), (1489, 0), (2212, 0)])
top = knn.top_n(13, 10, prefs=prefs)
pprint(top)
pprint([train_data.index_to_title(i) for r, i in top])

[(6.97353306935975, 2103),
 (6.81256902191607, 1515),
 (6.81256902191607, 1617),
 (6.6810518189991726, 1254),
 (6.6810518189991726, 1680),
 (6.6810518189991726, 1685),
 (6.6810518189991726, 1818),
 (6.6810518189991726, 2000),
 (6.6810518189991726, 2114),
 (6.6810518189991726, 2140)]
['World Of Robots',
 'Right and Down',
 'Nancy Drew®: Secret of the Scarlet Hand',
 'Floor Plan 2',
 'Gaia Project',
 'Dummynation',
 'Warbox Sandbox',
 'Battle Chess',
 'Disney•Pixar Cars: Radiator Springs Adventures',
 "Tony Stewart's All-American Racing"]


In [43]:
ens_knn = EnsembleKNN(k=40)
ens_knn.set_sims([(knn._sims, 0.5), (genre_knn._sims, 0.25), (desc_knn._sims, 0.25)])

In [47]:
train_data.search_title('elder')

[('The Elder Scrolls III: Morrowind® Game of the Year Edition', 286),
 ('The Elder Scrolls IV: Oblivion® Game of the Year Edition', 371),
 ('The Elder Scrolls® Online', 431),
 ('The Elder Scrolls V: Skyrim VR', 635),
 ('The Elder Scrolls V: Skyrim Special Edition', 278)]

In [50]:
prefs = train_data.create_prefs([(278, 1), (577, 1), (286, 1), (371, 1), (365, 1)])
top = ens_knn.top_n(13, 10, prefs=prefs)
pprint(top)
pprint([train_data.index_to_title(i) for r, i in top])

[(2.0045590483279248, 578),
 (1.9603361963958728, 1085),
 (1.8486378637673875, 844),
 (1.8350801837942514, 1837),
 (1.8085100074522713, 1788),
 (1.8048041296803012, 1195),
 (1.7731068946801547, 533),
 (1.7709836709745974, 2026),
 (1.7510139710484254, 1607),
 (1.656881929271563, 2063)]
['暖雪 Warm Snow',
 'Wee Tanks!',
 'Miss Neko 2',
 'Find Yourself',
 'Soundfall',
 'Gunlocked',
 'The Oregon Trail',
 'My Little Blacksmith Shop',
 'Strategic Mind: The Pacific',
 'Trackmania® Turbo']


In [28]:
# Ensure file exists
model_dir = "saved_models/knn/ens_knn.pkl" 
file = open(model_dir, 'a')
file.close()

# Save model
print("Saving model...")
with open(model_dir, 'wb') as file:
    pickle.dump([train_data, ens_knn], file)
print("Done saving model.")

Saving model...
Done saving model.
