In [1]:
import pandas as pd
from RecData import RecData
import numpy as np
import random

In [2]:
# Load the pruned review table and preview its first rows.
recs = pd.read_csv(filepath_or_buffer='data/full_pruned.csv')
recs.head(n=5)

Unnamed: 0,app_id,helpful,funny,date,is_recommended,hours,user_id,review_id
0,1942280,0,2,2022-12-31,True,7.1,5719965,13815934
1,392160,2,0,2022-12-31,True,170.9,466189,5938400
2,1273400,0,0,2022-12-31,True,10.6,102303,11145613
3,1032430,0,0,2022-12-31,True,0.3,3395651,6226031
4,1794680,0,0,2022-12-31,True,8.2,6842278,1417575


In [3]:
# Only the interaction columns are needed from the review table.
USED_COLS = ['app_id', 'is_recommended', 'user_id']
recs = recs.loc[:, USED_COLS]

# app_id / title columns from games.csv, handed to RecData.set_titles later.
item_data = pd.read_csv('data/games.csv')
titles = item_data.loc[:, ['app_id', 'title']]

print("Shape:", recs.shape)
recs.sort_values(['user_id', 'app_id']).head(n=5)

Shape: (1482464, 3)


Unnamed: 0,app_id,is_recommended,user_id
420790,12210,True,240
675882,22380,True,240
246231,239140,True,240
539676,251570,True,240
521289,270880,True,240


In [4]:
# Seed both the stdlib and NumPy global RNGs before building the split.
# NOTE(review): presumably leave_k_out_split draws from one of these
# generators; confirm against the RecData implementation.
random.seed(42)
np.random.seed(42)
rec_data = RecData()
rec_data.create_from_dataframe(recs)
rec_data.set_titles(titles)

# `recs` must stay alive: a later cell filters the metadata by recs['app_id'].

print("Creating splits...")
# Leave-one-out split (k=1) with create_val=False.
# NOTE(review): confirm what the second return value (`val`) holds when no
# validation split is requested.
train_data, val = rec_data.leave_k_out_split(k=1, create_val=False)
print("Done creating splits.")

Creating utility matrix...
Done utility matrix.
Creating splits...
Done user 1 / 63175
Done user 10001 / 63175
Done user 20001 / 63175
Done user 30001 / 63175
Done user 40001 / 63175
Done user 50001 / 63175
Done user 60001 / 63175
Done creating splits.


In [5]:
# games_metadata.json is read as line-delimited JSON (lines=True).
meta_data = pd.read_json(path_or_buf='data/games_metadata.json', lines=True)
meta_data.head(n=5)

Unnamed: 0,app_id,description,tags
0,10090,"Call of Duty is back, redefining war like you'...","[Zombies, World War II, FPS, Multiplayer, Acti..."
1,13500,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
2,22364,,[Action]
3,113020,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
4,226560,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."


In [6]:
# Keep metadata only for games that occur in the interaction table.
meta_data = meta_data.loc[meta_data['app_id'].isin(recs['app_id'])]
meta_data.shape

(2215, 3)

In [7]:
# Translate raw app ids into the item indices used by train_data.
# .assign builds a new frame instead of writing a column into the
# boolean-filtered slice produced by the previous cell (which raises
# SettingWithCopyWarning), and the bound method is passed directly rather than
# through a lambda whose parameter shadowed the builtin `id`.
# NOTE: this cell is not idempotent; re-running it would feed the already
# mapped indices back into item_id_to_index.
meta_data = meta_data.assign(
    app_id=meta_data['app_id'].map(train_data.item_id_to_index)
)
meta_data.head()

Unnamed: 0,app_id,description,tags
0,619,"Call of Duty is back, redefining war like you'...","[Zombies, World War II, FPS, Multiplayer, Acti..."
1,442,Enter the dark underworld of Prince of Persia ...,"[Action, Adventure, Parkour, Third Person, Gre..."
3,1206,Monaco: What's Yours Is Mine is a single playe...,"[Co-op, Stealth, Indie, Heist, Local Co-Op, St..."
4,721,Escape Dead Island is a Survival-Mystery adven...,"[Zombies, Adventure, Survival, Action, Third P..."
6,507,"“METAL SLUG 3”, the masterpiece in SNK’s emble...","[Arcade, Classic, Action, Co-op, Side Scroller..."


In [8]:
# Order rows by item index.
# NOTE(review): presumably so that row i of the feature matrices built below
# corresponds to item index i; confirm against ContentKNN.
meta_data = meta_data.sort_values('app_id', ascending=True)
meta_data.head(n=5)

Unnamed: 0,app_id,description,tags
6306,0,Brotato is a top-down arena shooter roguelite ...,"[Early Access, Action Roguelike, Roguelite, Ar..."
15339,1,X4: FOUNDATIONS brings our most sophisticated ...,"[Exploration, Sandbox, Space, Building, Econom..."
7303,2,Construction Simulator is back – bigger and be...,"[Simulation, Multiplayer, Relaxing, Online Co-..."
331,3,You are Wendigo. A hard-hearted contract kille...,"[Gore, Action, Violent, VR, Early Access, Horr..."
2671,4,Mow down thousands of night creatures and surv...,"[Action Roguelike, Bullet Hell, Pixel Graphics..."


In [9]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# One-hot encode each game's tag list into sparse indicator columns, one
# column per distinct tag (mlb.classes_).
# The tags are read non-destructively. The original popped the column out of
# meta_data while building the join argument, which mutated meta_data in place
# and made this cell raise KeyError('tags') when run a second time.
mlb = MultiLabelBinarizer(sparse_output=True)
meta_tags = meta_data.drop(columns=['tags']).join(
    pd.DataFrame.sparse.from_spmatrix(
        mlb.fit_transform(meta_data['tags']),
        index=meta_data.index,
        columns=mlb.classes_,
    )
)
meta_tags.head()

Unnamed: 0,app_id,description,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
6306,0,Brotato is a top-down arena shooter roguelite ...,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15339,1,X4: FOUNDATIONS brings our most sophisticated ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7303,2,Construction Simulator is back – bigger and be...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
331,3,You are Wendigo. A hard-hearted contract kille...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2671,4,Mow down thousands of night creatures and surv...,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Tag features only: app_id plus the tag indicator columns.
genre_data = meta_tags.drop(columns='description')
genre_data.head(n=5)

Unnamed: 0,app_id,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,...,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,eSports
6306,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15339,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7303,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
331,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2671,4,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Text features source: app_id alongside the free-text description.
desc_data = meta_tags.loc[:, ['app_id', 'description']]
desc_data.head(n=5)

Unnamed: 0,app_id,description
6306,0,Brotato is a top-down arena shooter roguelite ...
15339,1,X4: FOUNDATIONS brings our most sophisticated ...
7303,2,Construction Simulator is back – bigger and be...
331,3,You are Wendigo. A hard-hearted contract kille...
2671,4,Mow down thousands of night creatures and surv...


In [13]:
# Fit a TF-IDF vocabulary on the descriptions and vectorise them in one pass;
# X has one row per description, in desc_data's row order.
tf = TfidfVectorizer()
X = tf.fit_transform(desc_data.loc[:, 'description'])

In [14]:
# Attach the TF-IDF columns (integer-labelled) to app_id, then discard the raw
# text so only numeric features remain.
desc_feats = (
    desc_data
    .join(pd.DataFrame.sparse.from_spmatrix(X, index=desc_data.index))
    .drop(columns='description')
)
desc_feats.head(n=5)

Unnamed: 0,app_id,0,1,2,3,4,5,6,7,8,...,10057,10058,10059,10060,10061,10062,10063,10064,10065,10066
6306,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15339,1,0.253324,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7303,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
331,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2671,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Pairwise item-item cosine similarities for each feature family, returned as
# dense arrays. app_id is an identifier, not a feature, so it is dropped first.
genre_sims = cosine_similarity(genre_data.drop(columns='app_id'), dense_output=True)
desc_sims = cosine_similarity(desc_feats.drop(columns='app_id'), dense_output=True)

In [16]:
from KNN import ContentKNN
from pprint import pprint

In [24]:
# Two content-based KNN models with the same k: one fitted on the tag
# indicators, one on the TF-IDF description features (app_id excluded).
genre_knn, desc_knn = ContentKNN(k=40), ContentKNN(k=40)
genre_knn.fit(genre_data.drop(columns='app_id'))
desc_knn.fit(desc_feats.drop(columns='app_id'))

In [22]:
# Find the item indices of titles matching 'fallout'; the output below lists
# (title, index) pairs.
train_data.search_title('fallout')

[('Fallout 76', 273),
 ('Fallout Tactics: Brotherhood of Steel', 1593),
 ('Fallout: New Vegas', 11),
 ('Fallout Shelter', 813),
 ('Fallout 4', 577),
 ('Fallout 4 VR', 872),
 ('Fallout 3: Game of the Year Edition', 126)]

In [23]:
# Build an ad-hoc preference vector and ask the genre model for recommendations.
# NOTE(review): the pairs are presumably (item_index, liked). Index 278 is not
# among the search_title('fallout') hits (Fallout 76 is 273, Fallout 4 is 577);
# confirm 278 is the intended item.
prefs = train_data.create_prefs([(278, 1), (0, 0), (577, 1)])
# NOTE(review): 13 is presumably a user index and 10 the number of results
# (the output has 10 entries); confirm against ContentKNN.top_n.
top = genre_knn.top_n(13, 10, prefs=prefs)
pprint(top)
# Each entry of `top` is a (score, item_index) pair; show the matching titles.
pprint([train_data.index_to_title(item_idx) for _score, item_idx in top])

[(1.0, 8),
 (1.0, 24),
 (1.0, 59),
 (1.0, 100),
 (1.0, 172),
 (1.0, 238),
 (1.0, 279),
 (1.0, 283),
 (1.0, 313),
 (1.0, 365)]
['Age of Empires IV: Anniversary Edition',
 'ASTRONEER',
 'Detroit: Become Human',
 'Wylde Flowers',
 'SnowRunner',
 'Warhammer 40000: Gladius - Relics of War',
 'Sid Meier’s Civilization® VI',
 'What Remains of Edith Finch',
 'Anno 1800',
 'RIDE 4']
