## Game Feature Recommendation System
Recommends to the user on prompting game features extracted from game database information

In [148]:
# Library imports

import numpy as np
import spacy
import math
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load("en_core_web_sm")


In [149]:
# External data imports
def loadFeatData():
    DAT = {}
    with open("../data/game_datfeat.txt", "r") as f:
        lines = [l.strip() for l in f.readlines()]
        CUR_GAME = ""
        for l in lines:
            # empty line (between entries)
            if l == "":
                continue
            #new entry
            else:
                if l[0] == "+":
                    CUR_GAME = l[2:]
                    DAT[CUR_GAME] = {"tags":[],"entities":[],"features":[]}
                elif l[0] == "#":
                    DAT[CUR_GAME]["tags"] = [t.lower() for t in l[2:].split(",")]
                elif l[0] == "@":
                    DAT[CUR_GAME]["entities"] = l[2:].split(",")
                else:
                    DAT[CUR_GAME]["features"].append(l)
    return DAT


#get all of the tags and the entities from the game data
def getTagsEntities():
    # get all of the tags and entities
    ALL_TAGS = []
    ALL_ENTITIES = []
    for g in GAME_DATA:
        ALL_TAGS += [t.lower() for t in GAME_DATA[g]["tags"]]
        ALL_ENTITIES += [e.lower() for e in GAME_DATA[g]["entities"]]

    # remove duplicates
    ALL_TAGS = list(set(ALL_TAGS))
    ALL_ENTITIES = list(set(ALL_ENTITIES))

    return ALL_TAGS, ALL_ENTITIES


GAME_DATA = loadFeatData()
ALL_TAGS, ALL_ENTITIES = getTagsEntities()


## EXPERIMENTS
* Sample input text: "_top-down multiplayer action game about knights, castles, and assassinations_"
* Target selection games: [Castle Crashers, Hammerwatch, Kingsway], Thief Gold, Skyrim, Elden Ring

_Notes:_ 
- Maybe having some not perfect predictions is necessary for the user to prevent from making cloned games? -> #5 has best predictions so far, but aren't spot on

In [150]:
# general functions
TRAIN_MODE = "TFIDF_TRANS"
SENT_MODEL = None
custom_stopwords = ["game", ",", ".", "!"]

#setup abstract function
def getClosestGames(txt,k=3):
    return [("?",0)]

#tokenizes the text
def tokenize(txt):
    raw_toks = word_tokenize(txt)
    toks = [w.lower() for w in raw_toks if w.lower() not in stopwords.words("english") and w.lower() not in custom_stopwords]
    #add the custom tag words (can be compound words)
    for t in ALL_TAGS:
        if t in txt and t not in toks:
            toks.append(t)
    return toks

### 1. VECTORIZE THE TAGS + ENTITIES
- Note: Completely wrong. Vectorizing doesn't work with unbalanced dataset - picked the matches with fewest amount of data in vector
- Result: ["The Ramp", "Batman: Arkham City", "Among Us"]

In [151]:
###   VECTOR THE TAGS AND ENTITIES   ###
if TRAIN_MODE == "VECTOR":

    # get all of the tags and entities
    ALL_TAGS = []
    ALL_ENTITIES = []
    for g in GAME_DATA:
        ALL_TAGS += [t.lower() for t in GAME_DATA[g]["tags"]]
        ALL_ENTITIES += [e.lower() for e in GAME_DATA[g]["entities"]]

    # remove duplicates
    ALL_TAGS = list(set(ALL_TAGS))
    ALL_ENTITIES = list(set(ALL_ENTITIES))

    #make a tag embedding
    def tagEmbed(tags):
        vec = [0]*len(ALL_TAGS)
        for t in tags:
            tt = t.lower()
            if tt in ALL_TAGS:
                vec[ALL_TAGS.index(tt)] = 1
        return vec

    #make a entity embedding
    def entEmbed(entities):
        vec = [0]*len(ALL_ENTITIES)
        for e in entities:
            ee = e.lower()
            if ee in ALL_ENTITIES:
                vec[ALL_ENTITIES.index(ee)] = 1
        return vec


    # make the embeddings for each game
    TAG_EMBEDDINGS = {}
    ENTITY_EMBEDDINGS = {}
    for g in GAME_DATA:
        TAG_EMBEDDINGS[g] = tagEmbed(GAME_DATA[g]["tags"])
        ENTITY_EMBEDDINGS[g] = entEmbed(GAME_DATA[g]["entities"])

    #make a combined embedding
    FULL_EMVEDDINGS = {}
    for g in GAME_DATA:
        FULL_EMVEDDINGS[g] = TAG_EMBEDDINGS[g]+ENTITY_EMBEDDINGS[g]


    ## GET CLOSEST GAME FROM TEXT BASED ON EMBEDDINGS ##
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)
        # print(toks)

        # get the embedding for the text
        txt_embed = tagEmbed(toks)+entEmbed(toks)

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            dists[g] = np.linalg.norm(np.array(txt_embed) - np.array(FULL_EMVEDDINGS[g]))
        dists = sorted(dists.items(), key=lambda x: x[1])
        # print(dists)
        return dists[:k], None
    

### 2. COUNT THE MATCHES
- Note: Doesn't get the right matches for the secondary choices - especially if the tags themselves are too general. No way to account for tie-breakers
- Result: ["Castle Crashers", "Hammerwatch", "Among Us"]

In [152]:
# count the number of tags and entities matches 
if TRAIN_MODE == "COUNT":
    
    # get the closest games using count matches
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # initialize the counts
        cts = {}
        for g in GAME_DATA:
            cts[g] = 0
        
        # count the matches for each game
        matches = {}
        for t in toks:
            for g in GAME_DATA:
                if t in GAME_DATA[g]["tags"] or t in GAME_DATA[g]["entities"]:
                    cts[g] += 1
                    if g not in matches:
                        matches[g] = []
                    matches[g].append(t)

        # sort the counts
        cts = sorted(cts.items(), key=lambda x: x[1], reverse=True)
        return cts[:k], matches

### 3. BERT SENTENCE TRANSFORMER MODEL
- Note: Ok, but places too much weight on the word semantics themselves.
- Result: ["Kingsway", "Hammerwatch", "Batman: Arkham City"]

In [153]:
if TRAIN_MODE == "TRANSFORMER":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

In [154]:
if TRAIN_MODE == "TRANSFORMER":

    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

In [155]:
if TRAIN_MODE == "TRANSFORMER":
    # find the closest game based on the encoded game data
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], None

### 4. TAG MATCH + SENTENCE TRANSFORMER SIMILARITY
- Note: Ok, but has both problems of the counter and transformer experiments. Too much weight on tags, not enough on less frequently occuring entities
- Result: ["Hammerwatch", "Binding of Isaac", "Castle Crashers"]

In [156]:
if TRAIN_MODE == "CT_TRANS":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')
    

In [157]:
if TRAIN_MODE == "CT_TRANS":
    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

    # count the number of tags found in each game's list
    def tagsFound(txt):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # initialize the counts
        cts = {}
        for g in GAME_DATA:
            cts[g] = 0
        
        # count the matches for each game
        for g in GAME_DATA:
            for t in toks:
                if t in GAME_DATA[g]["tags"]:
                    cts[g] += 1

        # sort the counts
        cts = dict(sorted(cts.items(), key=lambda x: x[1], reverse=True))
        return cts


In [158]:
if TRAIN_MODE == "CT_TRANS":
    # get the closest games using count matches and the sentence transformer distance metric
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])

        #multiply by the tag matches
        TAG_MATCHES = tagsFound(txt)
        print(TAG_MATCHES)
        for g in dists:
            dists[g] *= TAG_MATCHES[g]

        # sort by distance
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], None

### 5. TF-IDF + SENTENCE TRANSFORMER SIMILARITY
- Notes: Closer... Needs a mix of tf-idf with semantic similarity score that the transformer can offer
- Result: ["Castle Crashers", "Hammerwatch", "Among Us"]


In [170]:
# TF-IDF Code heheh

#term frequency - how often a term appears in a document / total number of terms in the document
def tf(doc):  # assume doc is a list of words already tokenized
    tf_dict = {}
    for word in doc:
        if word not in tf_dict:
            tf_dict[word] = 1
        tf_dict[word] += 1
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(doc)
    return tf_dict

#inverse document frequency - log(total number of documents / number of documents with term t in it)
def idf(documents):  # assume documents is a list of lists of words already tokenized
    df = {}
    for doc in documents:  
        for word in doc:
            if word in df:
                df[word] += 1
            else:
                df[word] = 1

    idf_dict = {}
    for word in df:
        idf_dict[word] = math.log(len(documents) / df[word])
    return idf_dict
    
# get the full tfidf score for each word in each game's dataset
def tfidf(doc_set):  # assume doc_set is a dictionary of games with word lists already tokenized
    tfidf_dict = {}
    corpuses = [d for d in doc_set.values()]
    # print(corpuses)
    idf_dat = idf(corpuses)
    for game, doc in doc_set.items():
        tf_dat = tf(doc)
        tfidf_dict[game] = {}
        for word in doc:
            tfidf_dict[game][word] = tf_dat[word] * idf_dat[word]   #note: the tf will be uniform for each word in a doc since it only saves unique words
    return tfidf_dict

In [171]:
if TRAIN_MODE == "TFIDF_TRANS":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

    # get the tfidf scores for each game
    GAME_DOCS = {}
    for g in GAME_DATA:
        GAME_DOCS[g] = GAME_DATA[g]["tags"]+GAME_DATA[g]["entities"]
    GAME_DAT_TFIDF = tfidf(GAME_DOCS)

100%|██████████| 50/50 [00:09<00:00,  5.36it/s]


In [212]:
if TRAIN_MODE == "TFIDF_TRANS":
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])

        #multiply by the tag matches
        TAG_MATCHES = tagsFound(txt)
        for g in dists:
            dists[g] *= TAG_MATCHES[g]

        # multiply by the tfidf scores
        for g in dists:
            for t in toks:
                if t in GAME_DAT_TFIDF[g]:
                    dists[g] *= GAME_DAT_TFIDF[g][t]
                else:
                    dists[g] *= 0.0001

        # sort by distance
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], sorted([(d[0],d[1][0][0]) for d in dists], key=lambda x: x[1], reverse=True)

## RUN RECOMMENDATION SYSTEM

#### Steps: 
1. Load the data (games => tags, entities, features)
2. Get a user prompt for a game and its genre
3. Recommend some random features based on the text similarity and closest tags

* Note: Keep in mind, there are over 150k games with tags, features, and entities - the search cannot be too large


In [213]:
# recommend some features to the user based on a prompt
def recommendFeatures(userPrompt):
    # get the closest game
    closestGames, other = getClosestGames(userPrompt)
    print(closestGames)

    #show more debug info
    if other:
        print(other)
        print("")

    # get the features for that game
    for i in closestGames:
        g,v = i
        print(f"{g} -> {v}")
        print(sentGame(g))
        print(GAME_DATA[g]["features"])
        print("")

In [214]:
user_txt = "top-down multiplayer action game about knights, castles, and assassinations"  #ideally should return "castle crashers" or "hammerwatch" or "thief"
print(tokenize(user_txt))
recommendFeatures(user_txt)

['top-down', 'multiplayer', 'action', 'knights', 'castles', 'assassinations']
[('CASTLE CRASHERS', array([[3.9934258e-16]], dtype=float32)), ('HAMMERWATCH', array([[1.6724053e-16]], dtype=float32)), ('AMONG US', array([[2.7277695e-18]], dtype=float32))]
[('CASTLE CRASHERS', 3.9934258e-16), ('HAMMERWATCH', 1.6724053e-16), ('AMONG US', 2.7277695e-18), ('THE BINDING OF ISAAC', 4.2837838e-19), ('HYPER LIGHT DRIFTER', 3.2989625e-19), ('NIDHOGG', 1.9977411e-19), ('CUPHEAD', 1.4612603e-19), ('SCRIBBLENAUTS UNLIMITED', 1.3349024e-19), ('BATTLEBLOCK THEATER', 1.01988694e-19), ('MINI NINJAS', 8.5278806e-20), ('OVERCOOKED! 2', 7.959631e-20), ('GRAND THEFT AUTO V', 6.35636e-20), ('BABA IS YOU', 7.867711e-22), ('SUPER FANCY PANTS ADVENTURE', 5.4668944e-22), ('BATMAN: ARKHAM CITY', 4.4090236e-22), ('STREETS OF ROGUE', 3.1294016e-22), ('STARDEW VALLEY', 2.473096e-22), ('SPELUNKY', 1.7162156e-22), ('THE ELDER SCROLLS V: SKYRIM', 1.595181e-22), ('DOWNWELL', 1.413041e-22), ('GOAT SIMULATOR', 1.3032527e-

In [215]:
for gt in ["CASTLE CRASHERS", "HYPER LIGHT DRIFTER", "THIEF GOLD"]:
    print(gt)
    print(sentGame(gt))
    print(GAME_DATA[gt]["features"])
    print("")



CASTLE CRASHERS
action casual singleplayer adventure rpg 2d multiplayer cute arcade funny comedy co-op cartoony medieval online co-op local multiplayer hack and slash local co-op 4 player local beat 'em up adventure arcade award castles characters crashers friends hack hand hi kingdom princess res slash victory visuals way
['defend your kingdom', 'play locally or online to save your princess', 'smash your way to victory', 'crash some castles']

HYPER LIGHT DRIFTER
indie action singleplayer adventure rpg 2d atmospheric pixel graphics fantasy colorful exploration sci-fi difficult top-down great soundtrack action rpg hack and slash post-apocalyptic metroidvania souls-like action adventure bit blood classics collectors designs disease drifter drifters echoes grander histories illness knowledge land lands mechanics past resonate rpg scale technologies treasure vein way world
['quiet the vicious disease']

THIEF GOLD
action singleplayer adventure atmospheric story rich fantasy exploration fi

In [216]:
GAME_DAT_TFIDF["AMONG US"]

{'colorful': 0.15589076619017514,
 'multiplayer': 0.1035849348353059,
 'funny': 0.07059352631809057,
 'sci-fi': 0.16659831488621002,
 'survival': 0.25576461061454875,
 'top-down': 0.20932591754491328,
 'co-op': 0.12246124072423722,
 'pvp': 0.22961169493711414,
 'cartoony': 0.25576461061454875,
 'space': 0.22961169493711414,
 'online co-op': 0.15589076619017514,
 'local multiplayer': 0.14631253749400913,
 'aliens': 0.3556384550389224,
 'psychological': 0.3556384550389224,
 'party game': 0.29262507498801826,
 'minigames': 0.3556384550389224,
 'social deduction': 0.3556384550389224,
 'departure': 0.3556384550389224,
 'impostors': 0.3556384550389224,
 'player': 0.12246124072423722,
 'players': 0.10945207312053964,
 'spaceship': 0.3556384550389224}