In [5]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval as string_to_list
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline

In [None]:
games = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'backloggd_games.csv'), index_col=0)
def string_to_int(x):
    if 'K' in x:
        x = float(x.replace('K','')) * 1000
    else:
        x = float(x) * 1000
    return x

games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']] = games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']].map(string_to_int)
games[['Developers','Platforms','Genres']] = games[['Developers','Platforms','Genres']].map(string_to_list)
games[['Developers','Platforms','Genres']] = games[['Developers','Platforms','Genres']].map(lambda x: ' '.join(x))
games['Summary'] = games['Summary'].fillna('')
games = games.drop_duplicates(subset='Title', ignore_index=True)

In [None]:
def BM25_field_matrix(df, game_attribute, k_1=1.2, b=0.8, max_features=50000, min_df=2):    
    documents = df[game_attribute].to_list()
    pipe = Pipeline([('count', CountVectorizer(max_features=max_features, min_df=min_df)), ('tfid', TfidfTransformer())]).fit(documents)
    term_doc_matrix = pipe['count'].transform(documents)
    doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))

    numerator = (k_1 + 1) * tfs
    denominator = k_1 * ((1 - b) + b * (doc_lengths / avg_dl)) + tfs
    BM25 = numerator.multiply(1 / denominator)
    BM25 = BM25.multiply(idfs)

    return np.array(BM25.sum(axis=1))

bm25_scores = [
    BM25_field_matrix(games, 'Title', k_1=1.2, b=0.4, max_features=5000, min_df=1),
    BM25_field_matrix(games, 'Developers', k_1=1.1, b=0.3, max_features=4000, min_df=1),
    BM25_field_matrix(games, 'Summary', k_1=1.8, b=0.8, max_features=20000, min_df=2),
    BM25_field_matrix(games, 'Platforms', k_1=1.0, b=0.2, max_features=500, min_df=2),
    BM25_field_matrix(games, 'Genres', k_1=1.0, b=0.2, max_features=800, min_df=2)
        ]
weights = [..., ..., ..., ..., ...]
bm25 = sum(score * weight for score, weight in zip(bm25_scores, weights))

  doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))


In [28]:
games.head()

Unnamed: 0,Title,Release_Date,Developers,Summary,Platforms,Genres,Rating,Plays,Playing,Backlogs,Wishlist,Lists,Reviews
0,Elden Ring,"Feb 25, 2022",FromSoftware Bandai Namco Entertainment,"Elden Ring is a fantasy, action and open world...",Windows PC PlayStation 4 Xbox One PlayStation ...,Adventure RPG,4.5,21000.0,4100.0,5600.0,5500.0,4600.0,3000.0
1,The Legend of Zelda: Breath of the Wild,"Mar 03, 2017",Nintendo Nintendo EPD Production Group No. 3,The Legend of Zelda: Breath of the Wild is the...,Wii U Nintendo Switch,Adventure Puzzle,4.4,35000.0,3100.0,5600.0,3000.0,5100.0,3000.0
2,Hades,"Dec 07, 2018",Supergiant Games,A rogue-lite hack and slash dungeon crawler in...,Windows PC Mac PlayStation 4 Xbox One PlayStat...,Adventure Brawler Indie RPG,4.3,25000.0,3500.0,7300.0,4000.0,3200.0,2100.0
3,Hollow Knight,"Feb 24, 2017",Team Cherry,A 2D metroidvania with an emphasis on close co...,Windows PC Mac Linux Nintendo Switch,Adventure Indie Platform,4.4,25000.0,2700.0,9600.0,2600.0,3400.0,2100.0
4,Undertale,"Sep 15, 2015",tobyfox 8-4,"A small child falls into the Underground, wher...",Windows PC Mac Linux PlayStation 4 Xbox One Pl...,Adventure Indie RPG Turn Based Strategy,4.2,32000.0,728000.0,5700.0,2100.0,3900.0,2500.0
