# Content-based рекомендации на базе TF-IDF 

#### Описание задачи
- по имеющимся данным о пользователе (понравившиеся ему игры) предоставить рекомендации, состоящие из M игр, похожих на его предпочтения.

#### Описание эксперимента
- рекомендации на основе схожести игр (content-based)
- объединение текстовых характеристик (описание, категории, теги) в единое текстовое описание для каждой игры
- векторизация текстового представления игр методом TF-IDF
- использование библиотеки faiss для быстрого поиска похожих векторов
- итоговая рекомендация строится по плану: исходя из числа M и числа N (количество понравившихся игр) рассчитывается сколько наиболее близких игр для каждой понравившейся игры будет найдено. Также из рекомендации отсекаются те игры, которые уже понравились.

## 1. Загрузка данных

In [1]:
import pandas as pd

In [2]:
data = pd.read_csv("/kaggle/input/recsys-cw-data/combined_steam_games.csv")
data.head()

Unnamed: 0,app_id,name,release_date,required_age,price,dlc_count,short_description,windows,mac,linux,...,discount,developers,publishers,categories,genres,supported_languages,full_audio_languages,tags,screenshots_count,min_package_price
0,20200,Galactic Bowling,"Oct 21, 2008",0,19.99,0,Galactic Bowling is an exaggerated and stylize...,True,False,False,...,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player, Multi-player, Steam Achievement...","Casual, Indie, Sports",English,,"Indie:22, Casual:21, Sports:21, Bowling:6",10,19.99
1,655370,Train Bandit,"Oct 12, 2017",0,0.99,0,THE LAW!! Looks to be a showdown atop a train....,True,True,False,...,0,Rusty Moyher,Wild Rooster,"Single-player, Steam Achievements, Full contro...","Action, Indie","English, French, Italian, German, Spanish - Sp...",,"Indie:109, Action:103, Pixel Graphics:100, 2D:...",5,0.99
2,1732930,Jolt Project,"Nov 17, 2021",0,4.99,0,"Shoot vehicles, blow enemies with a special at...",True,False,False,...,0,Campião Games,Campião Games,Single-player,"Action, Adventure, Indie, Strategy","English, Portuguese - Brazil",,,6,4.99
3,1355720,Henosis™,"Jul 23, 2020",0,5.99,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,True,True,True,...,0,Odd Critter Games,Odd Critter Games,"Single-player, Full controller support","Adventure, Casual, Indie","English, French, Italian, German, Spanish - Sp...",,"2D Platformer:161, Atmospheric:154, Surreal:15...",7,5.99
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0,0.0,0,Two Weeks in Painland is a story-driven game a...,True,True,False,...,0,Unusual Games,Unusual Games,"Single-player, Steam Achievements","Adventure, Indie","English, Spanish - Spain",,"Indie:42, Adventure:41, Nudity:22, Violent:21,...",24,0.0


In [3]:
# Оставляем только важные признаки
columns_to_save = [
    "app_id",
    "name",
    "short_description",
    "categories",
    "genres",
    "tags"
]

data = data[columns_to_save]
data.head()

Unnamed: 0,app_id,name,short_description,categories,genres,tags
0,20200,Galactic Bowling,Galactic Bowling is an exaggerated and stylize...,"Single-player, Multi-player, Steam Achievement...","Casual, Indie, Sports","Indie:22, Casual:21, Sports:21, Bowling:6"
1,655370,Train Bandit,THE LAW!! Looks to be a showdown atop a train....,"Single-player, Steam Achievements, Full contro...","Action, Indie","Indie:109, Action:103, Pixel Graphics:100, 2D:..."
2,1732930,Jolt Project,"Shoot vehicles, blow enemies with a special at...",Single-player,"Action, Adventure, Indie, Strategy",
3,1355720,Henosis™,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"Single-player, Full controller support","Adventure, Casual, Indie","2D Platformer:161, Atmospheric:154, Surreal:15..."
4,1139950,Two Weeks in Painland,Two Weeks in Painland is a story-driven game a...,"Single-player, Steam Achievements","Adventure, Indie","Indie:42, Adventure:41, Nudity:22, Violent:21,..."


In [4]:
# Сопоставления между названиями игр и их id
name_to_id = data['name'].to_dict()
id_to_name = {item[1]: item[0] for item in name_to_id.items()}


# Сопоставления между id игр и их индексами в таблице (пригодится при сравнении векторов)
idx_to_id = data['app_id'].to_dict()
id_to_idx = {item[1]: item[0] for item in idx_to_id.items()}

## 2. Предобработка - формирование объединенного текстового описания

In [5]:
import re
import nltk
from nltk.tokenize import WordPunctTokenizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words("english"))


def process_short_description(text: str) -> str:
    """Предобработка описания игры"""
    
    if type(text) is not str: # обработка nan
        return ''
    
    text = text.lower()
    text = re.sub(r'[!"#$%&\'()*+,\-./:;<=>?@[\\\]^_`{|}~]', '', text)
    tokens = WordPunctTokenizer().tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    
    return " ".join(tokens)


def process_categories(text: str) -> str:
    if type(text) is not str: # обработка nan
        return ''
    
    text = text.lower()
    # удаляем запятые
    text = [category.strip() for category in text.split(',')]
    return ' '.join(text)


def process_genres(text: str) -> str:
    if type(text) is not str: # обработка nan
        return ''
    
    text = text.lower()
    # удаляем запятые
    text = [category.strip() for category in text.split(',')]
    return ' '.join(text)


def process_tags(text: str) -> str:
    if type(text) is not str: # обработка nan
        return ''

    text = text.lower()
    text = [tag.strip().split(':')[0] for tag in text.split(',')]
    return ' '.join(text)


data['short_description'] = data['short_description'].apply(process_short_description)
data['categories'] = data['categories'].apply(process_categories)
data['genres'] = data['genres'].apply(process_genres)
data['tags'] = data['tags'].apply(process_tags)

In [7]:
def merge(row):
    short_description = row['short_description'].split()
    categories = row['categories'].split()
    genres = row['genres'].split()
    tags = row['tags'].split()

    merged_genres_tags = list(set(genres) | set(tags))

    return ' '.join(short_description + categories + merged_genres_tags)

In [8]:
corpus = data.apply(merge, axis=1)
corpus

0         galactic bowling exaggerated stylized bowling ...
1         law looks showdown atop train last fight good ...
2         shoot vehicles blow enemies special attack pro...
3         henosis ™ mysterious 2d platform puzzler playe...
4         two weeks painland storydriven game runofthemi...
                                ...                        
115489    classic survival horror meets striking anime v...
115490    minimetroidvania festive adventure help discov...
115491    post ww1 horror game soldier coming back trenc...
115492    entropia universe unique massively multiplayer...
115493    2d top horror game takes story girl named sara...
Length: 115494, dtype: object

## 3. Векторизация

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=500)

vectors = vectorizer.fit_transform(corpus).toarray()

In [10]:
def get_game_vector(game_id):
    idx = id_to_idx[game_id]
    return vectors[idx]

#### Faiss - движок для быстрого поиска k ближайших соседей
Статья: https://habr.com/ru/companies/okkamgroup/articles/509204/

In [None]:
!pip install faiss-cpu

In [12]:
import faiss
import numpy as np

index = faiss.IndexFlatIP(vectors.shape[1])  # индекс для расчета косинусной близости
index.add(vectors)

In [13]:
def find_similar_games(games: list[int], k: int = 10):
    """Поиск игр похожих на заданные"""
    vectors = []
    for game in games:
        vectors.append(get_game_vector(game))
    vectors = np.array(vectors)

    distances, indexes = index.search(vectors, k+1)  # +1 чтобы исключить исходные игры
    return distances[:, 1:], indexes[:, 1:]

In [14]:
find_similar_games([20200])

(array([[0.5696113 , 0.4905705 , 0.45238084, 0.44906962, 0.44654554,
         0.4126915 , 0.41168717, 0.40315405, 0.40002394, 0.39885405]],
       dtype=float32),
 array([[43683,   795, 71681, 44619, 19843, 30541, 98270,  4012, 63603,
          8963]]))

In [15]:
display(data[data['app_id'] == 20200])
display(data[data['app_id'] == idx_to_id[43683]])

Unnamed: 0,app_id,name,short_description,categories,genres,tags
0,20200,Galactic Bowling,galactic bowling exaggerated stylized bowling ...,single-player multi-player steam achievements ...,casual indie sports,indie casual sports bowling


Unnamed: 0,app_id,name,short_description,categories,genres,tags
43683,794570,XBall Champion,xball champion casual sport game unique game play,single-player multi-player steam achievements,sports,sports


## 4. Итоговая рекомендательная система

In [16]:
# Предположим на вход поступает набор из понравившихся пользователю игр
liked = [
    20200,
    655370,
    1732930,
    1355720,
    1139950
]

# и мы хотим порекомендовать ему еще M новых игр
M = 20 

# Алгоритм следующий:
# Получаем для каждой игры из набора M самых близких
# Объединяем все рекомендации в один отсортированный массив
# Проходимся по этому массиву и добавляем в рекомендации те игры, которые пользователь еще не оценил
from dataclasses import dataclass

@dataclass
class Recommendation:
    original: int # исходная игра
    recommendation: int # похожая на нее
    similarity: int # коэффициент похожести


def recommend(liked: list[int], M: int):
    distances, indexes = find_similar_games(liked, M)
    merged = []
    for i in range(len(liked)):
        liked_i = liked[i]
        rec_i = indexes[i]
        dists_i = distances[i]
        for j in range(M):
            merged.append((liked_i, rec_i[j], dists_i[j]))
            
    merged = sorted(merged, key=lambda x: x[2], reverse=True)

    answer = []
    liked = set(liked)
    for x in merged:
        if x[1] not in liked:
            answer.append(Recommendation(original=x[0], recommendation=x[1], similarity=x[2]))
            if len(answer) == M:
                break
    return answer

In [17]:
recommend(liked, M)

[Recommendation(original=655370, recommendation=659, similarity=0.7527969),
 Recommendation(original=655370, recommendation=71834, similarity=0.72972107),
 Recommendation(original=655370, recommendation=69787, similarity=0.72519106),
 Recommendation(original=655370, recommendation=43214, similarity=0.71842945),
 Recommendation(original=655370, recommendation=15862, similarity=0.71732473),
 Recommendation(original=655370, recommendation=70513, similarity=0.7147898),
 Recommendation(original=655370, recommendation=42958, similarity=0.70751804),
 Recommendation(original=655370, recommendation=13166, similarity=0.7037952),
 Recommendation(original=655370, recommendation=52706, similarity=0.70025134),
 Recommendation(original=655370, recommendation=39413, similarity=0.69923204),
 Recommendation(original=655370, recommendation=22292, similarity=0.6977149),
 Recommendation(original=655370, recommendation=30118, similarity=0.6960785),
 Recommendation(original=655370, recommendation=18692, simi

In [18]:
faiss.write_index(index, "games.index")