# Steam

Напишите ИИ, который рекомендует видеоигры пользователям Steam, используя матричную факторизацию.

## Variables
*int* `user_id`: числовой идентификатор для идентификации при анонимизации пользователей Steam.
*str* `game_title`: название игры, с которой пользователь взаимодействовал.  
*str* `behavior`: тип поведения, демонстрируемый пользователем "purchase" or "play"  
*int* `value`: if the `behavior` value is "purchase", the `value` is always 1; otherwise, it specifies the number of hours the game has been played by the user


## Setup

Import the libraries and functions to be used.

In [None]:
#https://www.kaggle.com/tamber/steam-video-games

In [2]:
%reset -f

import csv

import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_recommenders as tfrs

from typing import Dict, Text

## Loading the dataset

Use the `DictReader()` function from the `csv` library to read the file and append each purchase as a dictionary mapping to a list.

In [3]:
data = []
with open('steam-200k.csv', 'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        if row['behavior'] == 'purchase':
            data.append(row)

for item in data[:3]:
    print(item)

{'user_id': '151603712', 'game_title': 'The Elder Scrolls V Skyrim', 'behavior': 'purchase', 'value': '1'}
{'user_id': '151603712', 'game_title': 'Fallout 4', 'behavior': 'purchase', 'value': '1'}
{'user_id': '151603712', 'game_title': 'Spore', 'behavior': 'purchase', 'value': '1'}


In [5]:
len(data)

129511

## Formatting the data

После выбора соответствующих функций преобразуйте данные в формат, который tensorflow может читать и обрабатывать.

In [7]:
purchases = tf.data.Dataset.from_tensor_slices({
        'user_id': list(map(lambda x: x['user_id'], data)),
        'game_title': list(map(lambda x: x['game_title'], data))
    }
)

games = tf.data.Dataset.from_tensor_slices(list(set(map(
            lambda x: x['game_title'], data)
        )
    )
)

## Preprocessing

Сопоставьте функции с целочисленными индексами для встраивания.

In [8]:
user_ids_vocabulary = tf.keras.layers.StringLookup() #Слой предварительной обработки, который сопоставляет строковые функции с целочисленными индексами 
user_ids_vocabulary.adapt(purchases.map(lambda x: x['user_id'])) 

game_titles_vocabulary = tf.keras.layers.StringLookup() #Слой предварительной обработки, который сопоставляет строковые функции с целочисленными индексами
game_titles_vocabulary.adapt(games)

In [None]:
#https://www.tensorflow.org/api_docs/python/tf/keras/layers/StringLookup

## Model design

Define a class specifying the `compute_loss` function.

In [9]:
class SteamModel(tfrs.Model):

    def __init__(
        self,
        user_model: tf.keras.Model,
        game_model: tf.keras.Model,
        task: tfrs.tasks.Retrieval
    ):

        super().__init__()

        self.user_model = user_model
        self.game_model = game_model

        self.task = task

    def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:

        user_embeddings = self.user_model(features["user_id"])
        game_embeddings = self.game_model(features["game_title"])

        return self.task(user_embeddings, game_embeddings)

Add the embedding layers to the user and game models and define the factorized retrieval task

In [10]:
user_model = tf.keras.Sequential([
        user_ids_vocabulary,
        tf.keras.layers.Embedding(user_ids_vocabulary.vocabulary_size(), 64)
    ]
)

game_model = tf.keras.Sequential([
        game_titles_vocabulary,
        tf.keras.layers.Embedding(game_titles_vocabulary.vocabulary_size(), 64)
    ]
)

task = tfrs.tasks.Retrieval(
    metrics=tfrs.metrics.FactorizedTopK(
        games.batch(128).map(game_model)
    )
)

Initialize and train the retrieval model.

In [12]:
model = SteamModel(user_model, game_model, task)
model.compile(optimizer=tf.keras.optimizers.Adagrad(0.5))
model.fit(purchases.batch(4096), epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x259e5234ca0>

Get video game recommendations from the model.

In [13]:
index = tfrs.layers.factorized_top_k.BruteForce(model.user_model)
index.index_from_dataset(
    games.batch(100).map(lambda id: (id, model.game_model(id)))
)

<tensorflow_recommenders.layers.factorized_top_k.BruteForce at 0x2580c4d2c10>

In [36]:
USERS = ['151603712', '187131847', '59945701','100070732']

print(f"Top 3 games to play for users")
for user in USERS:
    _, ids = index(np.array([user]))
    print(f"    {user}: {ids[0, :10]}")

Top 3 games to play for users
    151603712: [b'PAC-MAN Championship Edition DX+'
 b'HuniePop Official Digital Art Collection'
 b'HuniePop Original Soundtrack' b'Aggression Europe Under Fire' b'Spore'
 b'HuniePop' b"Tony Hawk's Pro Skater HD" b'Jazzpunk' b'TerraTech'
 b'The Banner Saga - Mod Content']
    187131847: [b'Dota 2' b'Pyroblazer' b'The Amazing Spider-Man' b'Requiem'
 b'Dead or Alive 5 Last Round'
 b'Samantha Swift and the Hidden Roses of Athena'
 b'Nancy Drew Lights, Camera, Curses!'
 b'Midnight Mysteries Salem Witch Trials'
 b'Midnight Mysteries 4 Haunted Houdini'
 b'Midnight Mysteries 3 Devil on the Mississippi']
    59945701: [b'Cities in Motion 2' b'Skullgirls Endless Beta' b'Skullgirls'
 b'GUILTY GEAR XX ACCENT CORE PLUS R'
 b'FINAL FANTASY XIV A Realm Reborn CE (NA version)'
 b'THE KING OF FIGHTERS XIII STEAM EDITION' b'Sanctum'
 b'Ultra Street Fighter IV' b'Orcs Must Die!'
 b"Baldur's Gate Enhanced Edition"]
    100070732: [b'Cities Skylines' b'Arma 2'
 b'Arma 2 Opera

In [18]:
import pandas as pd
df = pd.DataFrame(data)

In [22]:
df

Unnamed: 0,user_id,game_title,behavior,value
0,151603712,The Elder Scrolls V Skyrim,purchase,1
1,151603712,Fallout 4,purchase,1
2,151603712,Spore,purchase,1
3,151603712,Fallout New Vegas,purchase,1
4,151603712,Left 4 Dead 2,purchase,1
...,...,...,...,...
129506,128470551,Fallen Earth,purchase,1
129507,128470551,Magic Duels,purchase,1
129508,128470551,Titan Souls,purchase,1
129509,128470551,Grand Theft Auto Vice City,purchase,1


In [27]:
df.groupby(['user_id'])['game_title'].count()

user_id
100012061     1
100053304     7
100057229     5
100070732     7
100096071    38
             ..
99812428      1
99906508      2
99940330      1
99961115      5
99992274      1
Name: game_title, Length: 12393, dtype: int64

In [68]:
for i in  _h :
    if not np.any((df[df['user_id']=='151603712']['game_title']==i)):
        print(i)
        print("-"*4)

b'PAC-MAN Championship Edition DX+'
----
b'HuniePop Official Digital Art Collection'
----
b'HuniePop Original Soundtrack'
----
b'Aggression Europe Under Fire'
----
b'Spore'
----
b'HuniePop'
----
b"Tony Hawk's Pro Skater HD"
----
b'Jazzpunk'
----
b'TerraTech'
----
b'The Banner Saga - Mod Content'
----


In [50]:
_h = [b'PAC-MAN Championship Edition DX+',
 b'HuniePop Official Digital Art Collection',
 b'HuniePop Original Soundtrack', b'Aggression Europe Under Fire', b'Spore',
 b'HuniePop', b"Tony Hawk's Pro Skater HD", b'Jazzpunk', b'TerraTech',
 b'The Banner Saga - Mod Content']