## Libraries

In [2]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval as string_to_list
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
import faiss

  from .autonotebook import tqdm as notebook_tqdm


## Read in dataset + preprocessing

In [None]:
games = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'backloggd_games.csv'), index_col=0)
games['Summary'] = games['Summary'].fillna('')
games = games.drop_duplicates(subset='Title', ignore_index=True)
games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']] = games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']]     \
                                                                        .map(lambda x: float(x.replace('K','')) * 1000 if 'K' in x else float(x))
games[['Developers','Platforms','Genres']] = games[['Developers','Platforms','Genres']].map(string_to_list)

games_BM25 = games.copy()
games_BM25[['Developers','Platforms','Genres']] = games_BM25[['Developers','Platforms','Genres']].map(lambda x: ' '.join(x))
games_SBERT = games.copy()

queries = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'Query_processing', 'processed_queries.csv'))

## BM-25

In [47]:
def BM25_field_matrix(df, game_attribute, k_1=1.2, b=0.8, max_features=50000, min_df=2):    
    documents = df[game_attribute].to_list()
    pipe = Pipeline([('count', CountVectorizer(max_features=max_features, min_df=min_df)), ('tfid', TfidfTransformer())]).fit(documents)
    term_doc_matrix = pipe['count'].transform(documents)
    doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))

    numerator = (k_1 + 1) * tfs
    denominator = k_1 * ((1 - b) + b * (doc_lengths / avg_dl)) + tfs
    BM25 = numerator.multiply(1 / denominator)
    BM25 = BM25.multiply(idfs)

    vocab = pipe['count'].get_feature_names_out()
    vocab = {term:index for index, term in enumerate(vocab)}

    return BM25.tocsr(), vocab

def retrieve_top_k(query, bm25_list, weights, doc_titles, top_k=100, epsilon=1e-6):

    def bm25_filtered(query, bm25):
        matrix, vocab = bm25
        query = query.split(' ')
        query_tokens = [vocab[term] if term in vocab else 'OOV' for term in query]

        IV = [term for term in query_tokens if term != 'OOV']

        if len(IV) != 0:
            doc_scores = matrix[:, IV].sum(axis=1)
            if 'OOV' in query:
                doc_scores += np.full((matrix.shape[0], 1), epsilon)

        else:
            doc_scores = np.full((matrix.shape[0], 1), epsilon)

        return doc_scores

    bm25_weighted = np.zeros((bm25_list[0][0].shape[0], 1))

    for bm25, weight in zip(bm25_list, weights):
        bm25_weighted += weight * bm25_filtered(query, bm25)

    scores = np.ravel(bm25_weighted)
    ranked = sorted(zip(enumerate(doc_titles), scores), key=lambda zipper: zipper[1], reverse=True)
    ranked_topk = ranked[:top_k]

    output = []
    for k in ranked_topk:
        doc_id, doc_title, score = k[0][0], k[0][1], k[1]
        output.append((query, doc_title, doc_id, score))    

    return output

### Generate Results

In [48]:
bm25_matrices = [
    BM25_field_matrix(games_BM25, 'Title', k_1=1.2, b=0.4, max_features=5000, min_df=1),
    BM25_field_matrix(games_BM25, 'Developers', k_1=1.1, b=0.3, max_features=4000, min_df=1),
    BM25_field_matrix(games_BM25, 'Summary', k_1=1.8, b=0.8, max_features=20000, min_df=2),
    BM25_field_matrix(games_BM25, 'Platforms', k_1=1.0, b=0.2, max_features=500, min_df=2),
    BM25_field_matrix(games_BM25, 'Genres', k_1=1.0, b=0.2, max_features=800, min_df=2)
        ]
weights = [2.0, 0.6, 1.5, 0.8, 0.8]

results = []
for query in queries['Processed']:
    topk = retrieve_top_k(query, bm25_matrices, weights, games_BM25['Title'])
    for k in topk: results.append(k)

  doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))


## SBERT + FAISS

In [3]:
corpus = games_SBERT['Title'] +  ' ' + games_SBERT['Summary']
corpus = corpus.to_list()

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(corpus, show_progress_bar=True, normalize_embeddings=True)

mlb1, mlb2, mlb3 = MultiLabelBinarizer(), MultiLabelBinarizer(), MultiLabelBinarizer()
developer_onehot, platform_onehot, genre_onehot = mlb1.fit_transform(games_SBERT['Developers']), mlb2.fit_transform(games_SBERT['Platforms']), mlb3.fit_transform(games_SBERT['Genres'])

weights = [0.5, 0.2, 0.3, 0.4]
full_embeddings = np.hstack((weights[0] * embeddings,
                             weights[1] * developer_onehot,
                             weights[2] * platform_onehot,
                             weights[3] * genre_onehot))

Batches: 100%|██████████| 1281/1281 [00:30<00:00, 41.84it/s] 


## FAISS

In [14]:
index = faiss.IndexFlatIP(embeddings.shape[1])
index.add(embeddings)

In [18]:
test_query = ["The Witcher 3"]
embedded_query = model.encode(test_query, normalize_embeddings=True)

In [20]:
k = 50
distances, indexes = index.search(embedded_query, k)

In [23]:
games.iloc[indexes.flatten()]

Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres,Rating,Plays,Playing,Backlogs,Wishlist,Lists,Reviews
51,The Witcher 3: Wild Hunt,"May 19, 2015","[CD Projekt RED, Spike ChunSoft]","RPG and sequel to The Witcher 2 (2011), The Wi...","[Windows PC, PlayStation 4, Xbox One, PlayStat...","[Adventure, RPG]",4.3,19000.0,1300.0,7100.0,1800.0,2300.0,944000.0
19782,Witch,TBD,[],"""Witch"", an 2D-in-3D Indie JRPG coming soon fr...",[],"[Indie, RPG]",,0.0,0.0,5000.0,3000.0,2000.0,0.0
470,The Witcher 3: Wild Hunt - Game of the Year Ed...,"Aug 29, 2016","[CD Projekt RED, CD Projekt]",Become a professional monster slayer and embar...,"[Windows PC, PlayStation 4, Xbox One]","[Adventure, RPG]",4.5,7000.0,435000.0,2500.0,535000.0,539000.0,272000.0
17121,The Button Witch,"Dec 27, 2020",[],"Adventure game full of magic, puzzles and cute...",[Windows PC],"[Adventure, Indie, Point-and-Click, Puzzle]",,1000.0,0.0,1000.0,4000.0,4000.0,1000.0
7760,Wizardry III & IV,"Mar 04, 1994",[],,[Turbografx-16/PC Engine CD],[],,1000.0,0.0,4000.0,0.0,1000.0,0.0
1235,The Witcher 3: Wild Hunt - Complete Edition,"Aug 29, 2016",[CD Projekt RED],"The Witcher 3: Wild Hunt is a story-driven, op...","[Windows PC, PlayStation 4, Xbox One, PlayStat...","[Adventure, RPG]",4.4,1300.0,150000.0,603000.0,151000.0,204000.0,56000.0
10989,Brave Dungeon + Dark Witch's Story: Combat,"Sep 28, 2017",[INSIDE SYSTEM],"A compilation of an HD version of ""Brave Dunge...","[Windows PC, Nintendo Switch]","[Adventure, RPG, Strategy]",3.4,17000.0,0.0,8000.0,9000.0,6000.0,0.0
29973,The Legend of Dark Witch 2,"Dec 17, 2015",[CIRCLE Entertainment],The sequel to The Legend of Dark Witch for the...,"[Windows PC, Nintendo 3DS]","[Adventure, Platform]",3.8,45000.0,0.0,33000.0,17000.0,15000.0,3000.0
39639,Witch's Weapon,TBD,[],,[],"[RPG, Simulator]",,0.0,0.0,0.0,1000.0,0.0,0.0
34145,The Witcher Remake,TBD,"[CD Projekt, Fool's Theory]","Previously codename Canis Majoris, The Witcher...","[Windows PC, PlayStation 5, Xbox Series]",[RPG],,1000.0,1000.0,36000.0,132000.0,42000.0,0.0
