In [1]:
from pprint import pprint

import sys
import os
import pandas as pd
# from dotenv import load_dotenv
import csv

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.utils import simple_preprocess
import numpy as np

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Resolve the users/games CSV relative to the notebook's working directory:
# one level up, then dataset/data_files.
CURRENT_DIR = os.getcwd()
DATA_ROOT_DIR = os.path.abspath(
    os.path.join(CURRENT_DIR, "..", "dataset", "data_files")
)
USER_GAMES_FILENAME = os.path.join(DATA_ROOT_DIR, "users_games.csv")

# Load the table and pretty-print it.
df = pd.read_csv(USER_GAMES_FILENAME)
pprint(df)

                 user_id  game_id  playtime_2weeks  playtime_forever
0      76561197962641822       10              NaN                 0
1      76561197962641822       20              NaN                 0
2      76561197962641822       30              NaN                 0
3      76561197962641822       40              NaN                 0
4      76561197962641822       50              NaN                 0
...                  ...      ...              ...               ...
33773  76561199615987653      730           3302.0              3985
33774  76561199622123452      730           1820.0              1820
33775  76561199625697401     3590            976.0               976
33776  76561199625697401  1966720            443.0               443
33777  76561199625697401  1721470             17.0                17

[33778 rows x 4 columns]


In [3]:
# Creates model from an array of phrases
def createDoc2VecModel(phrases, vector_size=100, window=2, min_count=1, workers=4, epochs=20):
    """Train a gensim Doc2Vec model on a collection of phrases.

    Every phrase is tokenised with ``simple_preprocess`` and wrapped in a
    ``TaggedDocument`` whose single tag is the phrase's position in
    ``phrases``.

    Args:
        phrases: Iterable of strings; it is consumed exactly once.
        vector_size: Forwarded to ``Doc2Vec(vector_size=...)``.
        window: Forwarded to ``Doc2Vec(window=...)``.
        min_count: Forwarded to ``Doc2Vec(min_count=...)``.
        workers: Forwarded to ``Doc2Vec(workers=...)``.
        epochs: Forwarded to ``Doc2Vec(epochs=...)``; the same value is used
            for training because ``model.epochs`` is passed to ``train``.

    Returns:
        The trained ``Doc2Vec`` model.

    Raises:
        ValueError: If ``phrases`` is empty or no phrase produces any token,
            since there would be nothing to build a vocabulary from.
    """
    token_lists = [simple_preprocess(phrase) for phrase in phrases]

    # Fail early with a clear message instead of letting training stumble over
    # an empty vocabulary.
    if not any(token_lists):
        raise ValueError("phrases must contain at least one phrase with usable tokens")

    tagged_phrases = [
        TaggedDocument(words=tokens, tags=[i])
        for i, tokens in enumerate(token_lists)
    ]

    model = Doc2Vec(
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
        epochs=epochs,
    )
    model.build_vocab(tagged_phrases)
    model.train(tagged_phrases, total_examples=model.corpus_count, epochs=model.epochs)

    return model

# Infers an embedding for a single phrase using an already trained model
def predict_from_model(model, phrase):
    """Return ``model.infer_vector`` applied to the preprocessed ``phrase``.

    The phrase is tokenised with ``simple_preprocess`` before being handed to
    the model.
    """
    return model.infer_vector(simple_preprocess(phrase))




In [4]:
SAVE_FILE_NAME = "test1"

# Placeholder training corpus
phrases = [
    "This is true",
    "This is false",
    "Another phrase here",
    "Yet another phrase",
]

# Placeholder phrase to embed with the trained model
p1 = "A new phrase"

# Fit a model on the corpus, then infer a vector for the unseen phrase.
train_model = createDoc2VecModel(phrases)
new_vector = predict_from_model(train_model, p1)
pprint(new_vector)

# Persist the trained model under SAVE_FILE_NAME.
train_model.save(SAVE_FILE_NAME)


array([-4.2955405e-03, -1.2566516e-03, -1.1358461e-03,  2.8091623e-03,
        4.8042769e-03, -3.4050839e-03, -1.5851003e-03, -7.8252732e-04,
        1.1932040e-03,  2.6794416e-03,  3.5121440e-05, -3.1282031e-03,
        8.9966477e-04, -1.0092604e-03, -6.0295017e-04,  1.1950815e-03,
        7.7892182e-04, -1.8898797e-03, -3.9974065e-03, -2.1094640e-03,
        3.8106078e-03, -2.0799476e-03, -2.3410097e-03,  3.5194522e-03,
        5.1934959e-04,  4.8315711e-03, -2.3233932e-03, -2.2570926e-03,
        1.7017705e-03,  1.4264066e-03,  2.4114465e-03,  3.0330783e-03,
       -2.3996213e-03, -1.4328749e-04,  2.5727593e-03, -3.3479286e-03,
       -3.4885108e-04,  2.9519193e-03, -4.2942776e-03, -2.4941396e-03,
        1.4284265e-03, -5.3469482e-04,  2.2629797e-04,  2.0546687e-03,
       -2.4133262e-03,  1.7350203e-03,  4.1870181e-03,  3.6002554e-03,
       -9.9216367e-04,  3.7687409e-03, -4.3176180e-03,  4.5567439e-03,
        4.9991454e-03,  1.6506905e-03,  9.6107903e-04,  1.6420037e-03,
      

In [None]:
#### Testing with BERT

from transformers import BertTokenizer, BertModel
import torch

def generate_bert_embeddings(descriptions, model_name='bert-base-uncased'):
    """
    Generate BERT embeddings for a list of game descriptions.

    Each description is encoded on its own and represented by the final-layer
    hidden state at the first sequence position, which is the [CLS] token the
    tokenizer prepends when it is called directly.

    Args:
    descriptions (list): List of game descriptions (strings).
    model_name (str): Name of the pre-trained BERT model to use.

    Returns:
    embeddings (list): One NumPy array per description, in input order. Each
        array keeps the leading batch axis of size 1, i.e. shape
        (1, hidden_size).
    """

    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)
    # Inference only: make sure dropout is disabled. from_pretrained is
    # documented to return the model in eval mode already; this is explicit.
    model.eval()

    embeddings = []
    for description in descriptions:
        # Call the tokenizer itself instead of tokenize() +
        # convert_tokens_to_ids(): only the call adds the [CLS]/[SEP] special
        # tokens, so position 0 really is [CLS]. It also yields a valid
        # (non-empty, integer) input for an empty description, and
        # truncation=True clips text that exceeds the model's maximum length.
        encoded = tokenizer(description, return_tensors="pt", truncation=True)

        with torch.no_grad():
            outputs = model(**encoded)

        # outputs[0] is the last hidden state; slice out position 0 ([CLS])
        # while keeping the batch axis.
        embeddings.append(outputs[0][:, 0, :].numpy())

    return embeddings


# Sample descriptions used to try out the BERT embedding helper.
game_descriptions = [
    "At its core, Minecraft is a game where players place blocks and go on adventures. This includes anything from crafting simple items like containers or weapons, to building structures like houses, castles, and cities, or even making complex mechanical devices, all within the game's world.",
    "Terraria is a 2D sandbox game with gameplay that revolves around exploration, building, crafting, combat, survival, and mining, playable in both single-player and multiplayer modes. The game has a 2D sprite tile-based graphical style reminiscent of the 16-bit sprites found on the Super NES.",
    "At its core, Roblox is a gaming platform that allows you to play dozens of user-created games or create games through Roblox Studio. From role-playing games to rhythm titles and shooters, the platform is a toy box of experiences for all ages.",
    "League of Legends is a team-based strategy game where two teams of five powerful champions face off to destroy the other's base. Choose from over 140 champions to make epic plays, secure kills, and take down towers as you battle your way to victory.",
]

embeddings = generate_bert_embeddings(game_descriptions)

# One embedding is produced per description; print them in input order.
for embedding in embeddings:
    print(embedding)
