## Libraries

In [1]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval as string_to_list
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from sentence_transformers import SentenceTransformer
import faiss

  from .autonotebook import tqdm as notebook_tqdm


## Read in dataset + preprocessing

In [2]:
# Read in csv
games = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'backloggd_games.csv'), index_col=0)

# Clean data types + fill missing values + drop duplicates
games['Summary'] = games['Summary'].fillna('')
games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']] = games[['Plays','Playing','Backlogs','Wishlist','Lists','Reviews']]     \
                                                                        .map(lambda x: float(x.replace('K','')) * 1000 if 'K' in x else float(x))
games[['Developers','Platforms','Genres']] = games[['Developers','Platforms','Genres']].map(string_to_list)
games = games.drop_duplicates(subset='Title', ignore_index=True)

# Creates dataset copies for BM-25 and SBERT
games_BM25 = games.copy()
games_BM25[['Developers','Platforms','Genres']] = games_BM25[['Developers','Platforms','Genres']].map(lambda listed: [x.lower() for x in listed])
games_BM25['Title'] = games_BM25['Title'].str.lower()
games_BM25['Summary'] = games_BM25['Summary'].str.lower()
games_BM25[['Developers','Platforms','Genres']] = games_BM25[['Developers','Platforms','Genres']].map(lambda x: ' '.join(x))

games_SBERT = games.copy()
games_SBERT[['Developers','Platforms','Genres']] = games_SBERT[['Developers','Platforms','Genres']].map(lambda listed: [x.lower() for x in listed])
games_SBERT['Title'] = games_SBERT['Title'].str.lower()
games_SBERT['Summary'] = games_SBERT['Summary'].str.lower()

# Reads in processed queries and confirms list data types
queries = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'Query_processing', 'processed_queries.csv'))
queries[['Developers','Platforms','Genres']] = queries[['Developers','Platforms','Genres']].map(string_to_list)

## BM-25

In [3]:
def BM25_field_matrix(df, game_attribute, k_1=1.2, b=0.8, max_features=50000, min_df=2):
    """
    Creates a BM-25 matrix for an individual attribute in the dataset
    Hyperparameters can be tuned to improve results
    Code is identical to the pre-processing implementation
    """

    documents = df[game_attribute].to_list()
    pipe = Pipeline([('count', CountVectorizer(max_features=max_features, min_df=min_df)), ('tfid', TfidfTransformer())]).fit(documents)
    term_doc_matrix = pipe['count'].transform(documents)
    doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))
    numerator = (k_1 + 1) * tfs
    denominator = k_1 * ((1 - b) + b * (doc_lengths / avg_dl)) + tfs
    BM25 = numerator.multiply(1 / denominator)
    BM25 = BM25.multiply(idfs)
    vocab = pipe['count'].get_feature_names_out()
    vocab = {term:index for index, term in enumerate(vocab)}

    return BM25.tocsr(), vocab

def retrieve_top_k_bm25(query, bm25_list, weights, doc_titles, top_k=100, epsilon=1e-6):
    """
    Filters each of the individual BM-25 matrices based on a single query then weights and sums them up together
    Epsilon is again used for Out-Of-Vocabulary (OOV) terms
    Returns the top k results from the combined weighted BM-25 matrix
    """

    def bm25_filtered(query, bm25):
        """
        Filters an individual BM-25 matrix according to the input query
        Code is identical to the BM-25 result generating in the pre-processing
        """

        matrix, vocab = bm25
        query = query.split(' ')
        query_tokens = [vocab[term] if term in vocab else 'OOV' for term in query]
        IV = [term for term in query_tokens if term != 'OOV']
        if len(IV) != 0:
            doc_scores = matrix[:, IV].sum(axis=1)
            if 'OOV' in query:
                doc_scores += np.full((matrix.shape[0], 1), epsilon)
        else:
            doc_scores = np.full((matrix.shape[0], 1), epsilon)

        return doc_scores

    # Filters and weights the BM-25 matrices
    bm25_weighted = np.zeros((bm25_list[0][0].shape[0], 1))
    for bm25, weight in zip(bm25_list, weights):
        bm25_weighted += weight * bm25_filtered(query, bm25)

    # Sorts the results according to the BM-25 score and filters to only the top k -> ((doc_id, doc_title), score)
    scores = np.ravel(bm25_weighted)
    ranked = sorted(zip(enumerate(doc_titles), scores), key=lambda zipper: zipper[1], reverse=True)
    ranked_topk = ranked[:top_k]

    # Reformats results -> (query, doc_title, doc_id, score)
    output = []
    for k in ranked_topk:
        doc_id, doc_title, score = k[0][0], k[0][1], k[1]
        output.append((query, doc_title, doc_id, score, 0))    

    return output

### Generate Results

In [4]:
bm25_matrices = [
    BM25_field_matrix(games_BM25, 'Title', k_1=1.2, b=0.4, max_features=5000, min_df=1),
    BM25_field_matrix(games_BM25, 'Developers', k_1=1.1, b=0.3, max_features=4000, min_df=1),
    BM25_field_matrix(games_BM25, 'Summary', k_1=1.8, b=0.8, max_features=20000, min_df=2),
    BM25_field_matrix(games_BM25, 'Platforms', k_1=1.0, b=0.2, max_features=500, min_df=2),
    BM25_field_matrix(games_BM25, 'Genres', k_1=1.0, b=0.2, max_features=800, min_df=2)
        ]
weights = [2.0, 0.6, 1.5, 0.8, 0.8]

results_bm25 = []
for query in queries['Processed']:
    topk = retrieve_top_k_bm25(query, bm25_matrices, weights, games['Title'])
    for k in topk: results_bm25.append(k)

  doc_lengths, avg_dl, idfs, tfs = term_doc_matrix.sum(axis=1), np.mean(term_doc_matrix.sum(axis=1)), pipe['tfid'].idf_.reshape(1, -1), term_doc_matrix.multiply(1 / term_doc_matrix.sum(axis=1))


## SBERT + FAISS

In [5]:
model = SentenceTransformer('all-MiniLM-L6-v2')     # Initialise SBERT

# Embed the text attributes of the dataset, normalisation will occur after combining with the one-hot embeddings
embeddings = model.encode(games_SBERT['Title'] +  ' ' + games_SBERT['Summary'], show_progress_bar=True, normalize_embeddings=False)

# Fit and transform separate binarisers for the list attributes -> Same binarisers are used to convert the list attributes of the queries
mlb1, mlb2, mlb3 = MultiLabelBinarizer(), MultiLabelBinarizer(), MultiLabelBinarizer()
developer_onehot, platform_onehot, genre_onehot = mlb1.fit_transform(games_SBERT['Developers']), mlb2.fit_transform(games_SBERT['Platforms']), mlb3.fit_transform(games_SBERT['Genres'])

# Concatenates the embeddings and weights each one
# Result is normalised to ready the embeddings for cosine similarity
weights = [0.5, 0.2, 0.3, 0.4]
doc_embeddings = normalize(np.hstack((
                    weights[0] * embeddings,
                    weights[1] * developer_onehot,
                    weights[2] * platform_onehot,
                    weights[3] * genre_onehot
                    )), norm='l2', axis=1)

# Creates the FAISS index according to the generated embeddings
data_index = faiss.IndexFlatIP(doc_embeddings.shape[1])
data_index.add(doc_embeddings)


Batches: 100%|██████████| 1281/1281 [00:30<00:00, 42.07it/s] 


In [6]:
# Encodes the text component of the processed queries
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(queries['Processed'], show_progress_bar=True, normalize_embeddings=False)

# One-hot encodes the list attributes of the queries
developer_onehot, platform_onehot, genre_onehot = mlb1.transform(queries['Developers']), mlb2.transform(queries['Platforms']), mlb3.transform(queries['Genres'])

# Combines the query embeddings using the same weights 
query_embeddings = normalize(np.hstack((
                    embeddings,
                    developer_onehot,
                    platform_onehot,
                    genre_onehot
                    )), norm='l2', axis=1)

Batches: 100%|██████████| 4/4 [00:00<00:00, 48.63it/s]


### Generate Results

In [7]:
def retrieve_top_k_faiss(data_index, queries, query_embeddings, doc_titles, top_k=100):
    """
    Searches the FAISS index according to the query embedding matrix
    Returns 100 document IDs per query
    """

    # Returns the distances and indexes for each of the returned documents
    distance, index = data_index.search(query_embeddings, top_k)
    
    results = []

    # Loops through each of the queries
    for i, query in zip(range(index.shape[0]), queries):

        # Loops through each of the documents in the query results and appends the results in the same format as the BM-25 results -> (query, doc_title, doc_id, score)
        for id, score in zip(index[i], distance[i]):
            results.append((query, doc_titles[id], id, 0, score))

    return results

results_faiss = retrieve_top_k_faiss(data_index, queries['Processed'], query_embeddings, games['Title'])

## Result Pooling

In [15]:
results_bm25_df = pd.DataFrame(results_bm25, columns=['Query', 'Title', 'ID', 'BM25 Score', 'SBERT Score'])
results_faiss_df = pd.DataFrame(results_faiss, columns=['Query', 'Title', 'ID', 'BM25 Score', 'SBERT Score'])
results_pooled = pd.concat([results_bm25_df, results_faiss_df], axis=0).drop_duplicates(subset=['Query','ID']).sort_values(by='Query', ignore_index=True)
results_pooled = results_pooled.rename(columns={'Query':'Processed Query'})
results_pooled = pd.merge(results_pooled, queries[['Original','Processed']], how='left', left_on='Processed Query', right_on='Processed')
results_pooled = results_pooled.drop(columns=['Processed']).reindex(columns=['Original', 'Processed Query', 'Title', 'ID', 'BM25 Score', 'SBERT Score'])
results_pooled = results_pooled.rename(columns={'Original':'Original Query'})

results_pooled.to_csv('pooled_results.csv', index=False)