## Game Feature Recommendation System
Recommends to the user on prompting game features extracted from game database information

In [33]:
# Library imports

import numpy as np
import spacy
import math
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

nlp = spacy.load("en_core_web_sm")


In [34]:
# External data imports
def loadFeatData():
    DAT = {}
    with open("../data/game_datfeat.txt", "r") as f:
    # with open("../data/game_datfeat_FULL.txt", "r") as f:
        lines = [l.strip() for l in f.readlines()]
        CUR_GAME = ""
        for l in lines:
            # empty line (between entries)
            if l == "":
                continue
            #new entry
            else:
                if l[0] == "+":
                    CUR_GAME = l[2:]
                    DAT[CUR_GAME] = {"tags":[],"entities":[],"features":[]}
                elif l[0] == "#":
                    DAT[CUR_GAME]["tags"] = [t.lower() for t in l[2:].split(",")]
                elif l[0] == "@":
                    DAT[CUR_GAME]["entities"] = l[2:].split(",")
                elif l[0] == "-":
                    DAT[CUR_GAME]["features"].append(l[2:])
    return DAT


#get all of the tags and the entities from the game data
def getTagsEntities():
    # get all of the tags and entities
    ALL_TAGS = []
    ALL_ENTITIES = []
    for g in GAME_DATA:
        ALL_TAGS += [t.lower() for t in GAME_DATA[g]["tags"]]
        ALL_ENTITIES += [e.lower() for e in GAME_DATA[g]["entities"]]

    # remove duplicates
    ALL_TAGS = list(set(ALL_TAGS))
    ALL_ENTITIES = list(set(ALL_ENTITIES))

    return ALL_TAGS, ALL_ENTITIES


GAME_DATA = loadFeatData()
ALL_TAGS, ALL_ENTITIES = getTagsEntities()


## EXPERIMENTS
* Sample input text: "_top-down multiplayer action game about knights, castles, and assassinations_"
* Target selection games: [Castle Crashers, Hammerwatch, Kingsway], Thief Gold, Skyrim, Elden Ring

_Notes:_ 
- Maybe having some not perfect predictions is necessary for the user to prevent from making cloned games? -> #5 has best predictions so far, but aren't spot on

In [35]:
# general functions
TRAIN_MODE = "WORD_EMB"
SENT_MODEL = None
custom_stopwords = ["game", ",", ".", "!"]

#setup abstract function
def getClosestGames(txt,k=3):
    return [("?",0)]

#tokenizes the text
def tokenize(txt):
    raw_toks = word_tokenize(txt)
    toks = [w.lower() for w in raw_toks if w.lower() not in stopwords.words("english") and w.lower() not in custom_stopwords]
    #add the custom tag words (can be compound words)
    for t in ALL_TAGS:
        if t in txt and t not in toks:
            toks.append(t)
    return toks

### 1. VECTORIZE THE TAGS + ENTITIES
- Note: Completely wrong. Vectorizing doesn't work with unbalanced dataset - picked the matches with fewest amount of data in vector
- Result: ["The Ramp", "Batman: Arkham City", "Among Us"]

In [36]:
###   VECTOR THE TAGS AND ENTITIES   ###
if TRAIN_MODE == "VECTOR":

    # get all of the tags and entities
    ALL_TAGS = []
    ALL_ENTITIES = []
    for g in GAME_DATA:
        ALL_TAGS += [t.lower() for t in GAME_DATA[g]["tags"]]
        ALL_ENTITIES += [e.lower() for e in GAME_DATA[g]["entities"]]

    # remove duplicates
    ALL_TAGS = list(set(ALL_TAGS))
    ALL_ENTITIES = list(set(ALL_ENTITIES))

    #make a tag embedding
    def tagEmbed(tags):
        vec = [0]*len(ALL_TAGS)
        for t in tags:
            tt = t.lower()
            if tt in ALL_TAGS:
                vec[ALL_TAGS.index(tt)] = 1
        return vec

    #make a entity embedding
    def entEmbed(entities):
        vec = [0]*len(ALL_ENTITIES)
        for e in entities:
            ee = e.lower()
            if ee in ALL_ENTITIES:
                vec[ALL_ENTITIES.index(ee)] = 1
        return vec


    # make the embeddings for each game
    TAG_EMBEDDINGS = {}
    ENTITY_EMBEDDINGS = {}
    for g in GAME_DATA:
        TAG_EMBEDDINGS[g] = tagEmbed(GAME_DATA[g]["tags"])
        ENTITY_EMBEDDINGS[g] = entEmbed(GAME_DATA[g]["entities"])

    #make a combined embedding
    FULL_EMVEDDINGS = {}
    for g in GAME_DATA:
        FULL_EMVEDDINGS[g] = TAG_EMBEDDINGS[g]+ENTITY_EMBEDDINGS[g]


    ## GET CLOSEST GAME FROM TEXT BASED ON EMBEDDINGS ##
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)
        # print(toks)

        # get the embedding for the text
        txt_embed = tagEmbed(toks)+entEmbed(toks)

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            dists[g] = np.linalg.norm(np.array(txt_embed) - np.array(FULL_EMVEDDINGS[g]))
        dists = sorted(dists.items(), key=lambda x: x[1])
        # print(dists)
        return dists[:k], None
    

### 2. COUNT THE MATCHES
- Note: Doesn't get the right matches for the secondary choices - especially if the tags themselves are too general. No way to account for tie-breakers
- Result: ["Castle Crashers", "Hammerwatch", "Among Us"]

In [37]:
# count the number of tags and entities matches 
if TRAIN_MODE == "COUNT":
    
    # get the closest games using count matches
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # initialize the counts
        cts = {}
        for g in GAME_DATA:
            cts[g] = 0
        
        # count the matches for each game
        matches = {}
        for t in toks:
            for g in GAME_DATA:
                if t in GAME_DATA[g]["tags"] or t in GAME_DATA[g]["entities"]:
                    cts[g] += 1
                    if g not in matches:
                        matches[g] = []
                    matches[g].append(t)

        # sort the counts
        cts = sorted(cts.items(), key=lambda x: x[1], reverse=True)
        return cts[:k], matches

### 3. BERT SENTENCE TRANSFORMER MODEL
- Note: Ok, but places too much weight on the word semantics themselves.
- Result: ["Kingsway", "Hammerwatch", "Batman: Arkham City"]

In [38]:
if TRAIN_MODE == "TRANSFORMER":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

In [39]:
if TRAIN_MODE == "TRANSFORMER":

    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

In [40]:
if TRAIN_MODE == "TRANSFORMER":
    # find the closest game based on the encoded game data
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], None

### 4. TAG MATCH + SENTENCE TRANSFORMER SIMILARITY
- Note: Ok, but has both problems of the counter and transformer experiments. Too much weight on tags, not enough on less frequently occuring entities
- Result: ["Hammerwatch", "Binding of Isaac", "Castle Crashers"]

In [41]:
if TRAIN_MODE == "CT_TRANS":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')
    

In [42]:
if TRAIN_MODE == "CT_TRANS":
    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

    # count the number of tags found in each game's list
    def tagsFound(txt):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # initialize the counts
        cts = {}
        for g in GAME_DATA:
            cts[g] = 0
        
        # count the matches for each game
        for g in GAME_DATA:
            for t in toks:
                if t in GAME_DATA[g]["tags"]:
                    cts[g] += 1

        # sort the counts
        cts = dict(sorted(cts.items(), key=lambda x: x[1], reverse=True))
        return cts


In [43]:
if TRAIN_MODE == "CT_TRANS":
    # get the closest games using count matches and the sentence transformer distance metric
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])

        #multiply by the tag matches
        TAG_MATCHES = tagsFound(txt)
        print(TAG_MATCHES)
        for g in dists:
            dists[g] *= TAG_MATCHES[g]

        # sort by distance
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], None

### 5. TF-IDF + SENTENCE TRANSFORMER SIMILARITY
- Notes: Closer... Needs a mix of tf-idf with semantic similarity score that the transformer can offer
- Result: ["Castle Crashers", "Hammerwatch", "Among Us"]


In [44]:
# TF-IDF Code heheh

#term frequency - how often a term appears in a document / total number of terms in the document
def tf(doc):  # assume doc is a list of words already tokenized
    tf_dict = {}
    for word in doc:
        if word not in tf_dict:
            tf_dict[word] = 1
        tf_dict[word] += 1
    for word in tf_dict:
        tf_dict[word] = tf_dict[word] / len(doc)
    return tf_dict

#inverse document frequency - log(total number of documents / number of documents with term t in it)
def idf(documents):  # assume documents is a list of lists of words already tokenized
    df = {}
    for doc in documents:  
        for word in doc:
            if word in df:
                df[word] += 1
            else:
                df[word] = 1

    idf_dict = {}
    for word in df:
        idf_dict[word] = math.log(len(documents) / df[word])
    return idf_dict
    
# get the full tfidf score for each word in each game's dataset
def tfidf(doc_set):  # assume doc_set is a dictionary of games with word lists already tokenized
    tfidf_dict = {}
    corpuses = [d for d in doc_set.values()]
    # print(corpuses)
    idf_dat = idf(corpuses)
    for game, doc in doc_set.items():
        tf_dat = tf(doc)
        tfidf_dict[game] = {}
        for word in doc:
            # tfidf_dict[game][word] = tf_dat[word] * idf_dat[word]   #note: the tf will be uniform for each word in a doc since it only saves unique words
            tfidf_dict[game][word] = idf_dat[word]
    return tfidf_dict

In [45]:
if TRAIN_MODE == "TFIDF_TRANS":
    # load the model
    if SENT_MODEL == None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # encode the game data
    def encodeGames():
        emb_dat = {}
        with tqdm(total=len(GAME_DATA)) as pbar:
            for g in GAME_DATA:
                emb_dat[g] = SENT_MODEL.encode(sentGame(g))
                pbar.update(1)
        return emb_dat

    GAME_DAT_EMB = encodeGames()

    # get the tfidf scores for each game
    GAME_DOCS = {}
    for g in GAME_DATA:
        GAME_DOCS[g] = list(set(GAME_DATA[g]["tags"]+GAME_DATA[g]["entities"]))
    GAME_DAT_TFIDF = tfidf(GAME_DOCS)

In [46]:
if TRAIN_MODE == "TFIDF_TRANS":
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text
        txt_embed = SENT_MODEL.encode([" ".join(toks)])

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            temb = np.array(txt_embed)
            gemb = np.array(GAME_DAT_EMB[g])
            dists[g] = cosine_similarity(temb,[gemb])

        #multiply by the tag matches
        # TAG_MATCHES = tagsFound(txt)
        # for g in dists:
        #     dists[g] *= TAG_MATCHES[g]

        # multiply by the tfidf scores
        for g in dists:
            for t in toks:
                if t in GAME_DAT_TFIDF[g]:
                    dists[g] += GAME_DAT_TFIDF[g][t]*10
                # else:
                #     dists[g] *= 0.0001

        # sort by distance
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        # print(dists)
        return dists[:k], sorted([(d[0],d[1][0][0]) for d in dists], key=lambda x: x[1], reverse=True)

### 6. WORD EMBEDDINGS
* Notes: 
  * Use 50d - no result change with increase except longer execution time. 
  * Getting the average didn't work as intended. 
  * Getting the max of each row got much better results. 
  * Using tf-idf gets about the same results. 
  * Using without tag multiplier but including in word embedding gets more accurate top 5. 
  * Only using sum gets even better. 
  * Adding TF-IDF to the end gets even better results.
* Results: ["Hammerwatch", "Castle Crashers", "Among Us"]

In [47]:
if TRAIN_MODE == "WORD_EMB":
    #import the word embedding data from the GloVe dataset
    GLOVE_DAT = {}
    with open("../data/glove.6B/glove.6B.50d.txt", "r", encoding="utf-8") as f:
        lines = f.readlines()
        with tqdm(total=len(lines)) as pbar:
            for line in lines:
                line = line.split()
                GLOVE_DAT[line[0]] = np.array([float(x) for x in line[1:]])
                pbar.update(1)

    # get the tfidf scores for each game
    GAME_DOCS = {}
    for g in GAME_DATA:
        GAME_DOCS[g] = list(set(GAME_DATA[g]["tags"]+GAME_DATA[g]["entities"]))
    GAME_DAT_TFIDF = tfidf(GAME_DOCS)

100%|██████████| 400000/400000 [00:06<00:00, 65053.99it/s]


In [48]:
if TRAIN_MODE == "WORD_EMB":
    def sentGame(g):
        return " ".join(GAME_DATA[g]["tags"])+" "+" ".join(GAME_DATA[g]["entities"])

    # return the distance between 2 game theme word sets
    def gameThemeDist(g1,g2): 
        # both game has a set of theme words of varying lengths
        d = cosine_similarity([GLOVE_DAT[w] for w in g1 if w in GLOVE_DAT],[GLOVE_DAT[w] for w in g2 if w in GLOVE_DAT if w in GLOVE_DAT])

        #average the distance
        # return sum([sum(x) for x in d])/(len(g1)*len(g2))

        #return the average of the max for each word in g1
        return sum([max(x) for x in d])

    # return the distance between 2 game entity word sets with tf-idf weighting
    def gameThemeDistTFIDF(g1,g2,game):
        # both game has a set of theme words of varying lengths
        d = []
        for w1 in g1:
            di = []
            for w2 in g2:
                if w1 in GLOVE_DAT and w2 in GLOVE_DAT:
                    di.append(cosine_similarity([GLOVE_DAT[w1]],[GLOVE_DAT[w2]])[0][0]*GAME_DAT_TFIDF[game][w2])
            if len(di) > 0:
                d.append(di)

        return sum([max(x) for x in d])

    # get the closest games using the TF-IDF and word embedding distances
    def getClosestGames(txt,k=3):
        # tokenize and remove all stop words
        toks = tokenize(txt)
        entities = [t for t in toks if t not in ALL_TAGS]
        # print(f"ENTITIES: {entities}")

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            dists[g] = gameThemeDist(toks,GAME_DATA[g]["tags"]+GAME_DATA[g]["entities"])   #use the theme words (entities + tags)
            # dists[g] = gameThemeDist(entities,GAME_DATA[g]["entities"])   #only use entities

        #multiply by the tag matches
        # TAG_MATCHES = tagsFound(txt)
        # for g in dists:
        #     dists[g] *= TAG_MATCHES[g]

        #add tf-idf scores where found
        for g in dists:
            for t in toks:
                if t in GAME_DAT_TFIDF[g]:
                    dists[g] += GAME_DAT_TFIDF[g][t]

        # sort by distance
        dists = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        return dists[:k], sorted([(d[0],round(d[1],4)) for d in dists], key=lambda x: x[1], reverse=True)
    

## RUN RECOMMENDATION SYSTEM

#### Steps: 
1. Load the data (games => tags, entities, features)
2. Get a user prompt for a game and its genre
3. Recommend some random features based on the text similarity and closest tags

* Note: Keep in mind, there are over 150k games with tags, features, and entities - the search cannot be too large


In [49]:
# recommend some features to the user based on a prompt
def recommendFeatures(userPrompt):
    # get the closest game
    closestGames, other = getClosestGames(userPrompt)
    print(closestGames)

    #show more debug info
    if other:
        print(other)
        print("")

    # get the features for that game
    for i in closestGames:
        g,v = i
        print(f"{g} -> {v}")
        print(sentGame(g))
        print(GAME_DATA[g]["features"])
        print("")

In [50]:
user_txt = "top-down multiplayer action game about knights, castles, and assassinations"  #ideally should return "castle crashers" or "hammerwatch" or "thief"
print(tokenize(user_txt))
print("")
print(f"----- {TRAIN_MODE} -----")
print("")
recommendFeatures(user_txt)

['top-down', 'multiplayer', 'action', 'knights', 'castles', 'assassinations', 'assassin']

----- WORD_EMB -----

[('HOARD', 25.266873260011234), ('ROAD REDEMPTION', 23.061376032894227), ('ASSASSIN BLUE', 20.398141089807915)]

HOARD -> 25.266873260011234
indie action casual singleplayer strategy fantasy multiplayer arcade top-down co-op dragons action strategy - abilities archers base carts castles co crops dragon drive extinction fantasy fiery fire game gasps gold hoard home income kidnapping kingdom knights limit map maps merchant modes mound mouth name number one op parameters particularities pile player players preying princess princesses pursuit ransom ransoms selections sorts speed stream target targets thieves thumbsticks time treasure user wake waste windmills
['direct a stream', 'maximise income', 'heal and upgrade one of four abilities', 'decorate hoard', 'fly and one']

ROAD REDEMPTION -> 23.061376032894227
indie action singleplayer adventure simulation multiplayer arcade fun

## Test prompts
* Notes: Needs hyper-specific adjectives to get right on the money with the suggestions - otherwise can get a little too off.

In [51]:
prompts = [
    "a pixel skateboarding game in a cyberpunk future",                  #recs: the ramp, jet set radio, cyberpunk 2077
    "a cute, relaxing, atmospheric farming rpg with animal characters",  #recs: stardew valley, night in the woods, a short hike
    "a physics-based shopping simulator in a 90s mall",                   #recs: goat simulator, retrowave, donut county
    # "a first-person goat skateboarding game"
    ]  

for p in prompts:
    print(f"----- {p} -----")
    print("")
    recommendFeatures(p)
    print("")
    print("")

----- a pixel skateboarding game in a cyberpunk future -----

[('SKATEBOARD JOUST', 14.851314018215888), ("BLACK FUTURE '88", 12.140778781467823), ('ZEPHYR', 12.136567061172428)]

SKATEBOARD JOUST -> 14.851314018215888
action bonus chamber coins enemies future objective obstacles place player powers score shooting skateboard skateboarding sport stage waves
['dispose of your enemies']

BLACK FUTURE '88 -> 12.140778781467823
2d pixel graphics sci-fi platformer difficult co-op great soundtrack side scroller 1980s rogue-like rogue-lite local co-op action roguelike cyberpunk action ' action colossal dash explodes future heart homicidal junkies owner punk shoot slash top tower traps version waves way
['reach the top', 'climb an always evolving procedural tower', 'upgrade yourself to survive the endless waves', 'kill its insane owner']

ZEPHYR -> 12.136567061172428
3d shooter sci-fi combat old school fps racing futuristic 1990's cyberpunk robots aliens driving dark humor dystopian destruction

KeyboardInterrupt: 

## TEST AREA`

In [None]:
for gt in ["CASTLE CRASHERS", "HYPER LIGHT DRIFTER", "THIEF GOLD", "THE ELDER SCROLLS V: SKYRIM", "ELDEN RING", "KINGSWAY"]:
    print(gt)
    print(sentGame(gt))
    for f in GAME_DATA[gt]["features"]:
        print(f"- {f}")
    # print(GAME_DATA[gt]["features"])
    print("")



CASTLE CRASHERS
action casual singleplayer adventure rpg 2d multiplayer cute arcade funny comedy co-op cartoony medieval online co-op local multiplayer hack and slash local co-op 4 player local beat 'em up action adventure role-playing beat 'em up adventure arcade award castles characters crashers friends hack hand hi kingdom princess res slash victory visuals way
- defend your kingdom
- smash your way to victory
- play locally or online to save your princess
- crash some castles

HYPER LIGHT DRIFTER
indie action singleplayer adventure rpg 2d atmospheric pixel graphics fantasy colorful exploration sci-fi difficult top-down great soundtrack action rpg hack and slash post-apocalyptic metroidvania souls-like action role-playing action adventure bit blood classics collectors designs disease drifter drifters echoes grander histories illness knowledge land lands mechanics past resonate rpg scale technologies treasure vein way world
- quiet the vicious disease

THIEF GOLD
action singleplayer 

KeyError: 'ELDEN RING'

In [None]:
GAME_DAT_TFIDF["CYBERPUNK 2077"]

{'character': 1.9661128563728327,
 'rockerboy': 3.912023005428146,
 'exploration': 1.3470736479666094,
 'key': 3.912023005428146,
 'futuristic': 2.8134107167600364,
 'body': 3.912023005428146,
 'players': 1.2039728043259361,
 'open world': 1.4271163556401458,
 'immersive sim': 2.8134107167600364,
 'world': 0.8209805520698303,
 'immortality': 3.912023005428146,
 'corporation': 3.912023005428146,
 'v': 2.8134107167600364,
 'nudity': 3.2188758248682006,
 'great soundtrack': 0.5108256237659907,
 'first-person': 1.7147984280919266,
 'mature': 3.2188758248682006,
 'cyberware': 3.912023005428146,
 'sci-fi': 1.8325814637483102,
 'city': 2.120263536200091,
 'modification': 3.912023005428146,
 'story rich': 1.2039728043259361,
 'characters': 1.6094379124341003,
 'character customization': 2.302585092994046,
 'cast': 2.8134107167600364,
 'cyberpunk': 2.302585092994046,
 'ability': 3.912023005428146,
 'role': 3.2188758248682006,
 'technology': 3.2188758248682006,
 'action rpg': 1.9661128563728327,

In [None]:
# show the distances between the word sets as a table
def showSim(arr2d,x,y):
    # print the table
    z = "X"
    print(f"{z.ljust(10)}",end="")
    for i in x:
        print(f"{i[:8].ljust(10)}",end="")
    print("")
    for i in range(len(arr2d)):
        print(f"{y[i][:8].ljust(10):10}",end="")
        for j in range(len(arr2d[i])):
            print(f"{str(arr2d[i][j]).ljust(10)}",end="")
        print("")


# tokenize and remove all stop words
entities = ["knights", "castle", "assassination"]
ideal_games = ["CASTLE CRASHERS", "HAMMERWATCH", "THIEF GOLD", "THE ELDER SCROLLS V: SKYRIM", "ELDEN RING", "KINGSWAY"]
# actual_games = ["DOWNWELL", "VAMPIRE SURVIVORS", "BATMAN: ARKHAM CITY"]
actual_games = ["MINI NINJAS", "THE BINDING OF ISAAC", "NIDHOGG"]

# get the closest games using distance metrics
i_dists = {}
i_break = {}
for g in ideal_games:
    i_dists[g] = gameThemeDist(entities,GAME_DATA[g]["entities"])   #only use entities
    i_break[g] = []  
    for i in entities:
        a = []
        for j in GAME_DATA[g]["entities"]:
            if i in GLOVE_DAT and j in GLOVE_DAT:
                a.append(cosine_similarity([GLOVE_DAT[i]],[GLOVE_DAT[j]])[0][0]*GAME_DAT_TFIDF[g][j])
            else:
                a.append(0)
        a.append(np.max(a))
        i_break[g].append(a)
            
    # i_break[g] = cosine_similarity([GLOVE_DAT[w] for w in entities if w in GLOVE_DAT],[GLOVE_DAT[w] for w in GAME_DATA[g]["entities"] if w in GLOVE_DAT if w in GLOVE_DAT])

a_dists = {}
a_break = {}
for g in actual_games:
    a_dists[g] = gameThemeDist(entities,GAME_DATA[g]["entities"])   #only use entities
    a_break[g] = []  
    for i in entities:
        a = []
        for j in GAME_DATA[g]["entities"]:
            if i in GLOVE_DAT and j in GLOVE_DAT:
                a.append(cosine_similarity([GLOVE_DAT[i]],[GLOVE_DAT[j]])[0][0]*GAME_DAT_TFIDF[g][j])
            else:
                a.append(0)
        a.append(np.max(a))
        a_break[g].append(a)

    # a_break[g] = cosine_similarity([GLOVE_DAT[w] for w in entities if w in GLOVE_DAT],[GLOVE_DAT[w] for w in GAME_DATA[g]["entities"] if w in GLOVE_DAT if w in GLOVE_DAT])


#show distances
print(entities)
print("")

print("\nIDEAL GAMES")
for g in i_dists:
    print(f"{g} -> {i_dists[g]}")
    x = GAME_DATA[g]["entities"] + ["MAX"]
    showSim(np.around(i_break[g],4),x,entities)
    print("")
    # print(i_dists[g])
    # print(np.around(i_break[g],2))
    # print("")

print("\nACTUAL GAMES")
for g in a_dists:
    print(f"{g} -> {a_dists[g]}")
    x = GAME_DATA[g]["entities"] + ["MAX"]
    showSim(np.around(a_break[g],4),x,entities)
    print("")
    # print(a_dists[g])
    # print(np.around(a_break[g],2))
    # print("")

['knights', 'castle', 'assassination']


IDEAL GAMES
CASTLE CRASHERS -> 1.4532353806595277
X         adventur  arcade    award     castles   characte  crashers  friends   hack      hand      hi        kingdom   princess  res       slash     victory   visuals   way       MAX       
knights   0.2184    0.7287    1.0496    1.8414    0.4665    0.0716    0.475     -0.0219   0.3712    -0.1035   1.7534    0.9923    -0.5047   -0.0834   1.3338    -0.2149   0.2118    1.8414    
castle    0.2242    0.8608    0.3959    2.7652    0.494     0.686     0.5149    -0.1383   0.5172    -0.173    2.0129    1.8139    -0.8778   -0.5066   0.8286    -0.004    0.3824    2.7652    
assassin  0.1358    0.0138    0.5769    -0.2865   0.3647    0.3698    0.4884    0.1759    0.6348    -0.7147   0.995     0.8659    0.1804    -0.1924   1.0482    -0.2528   0.3202    1.0482    

HAMMERWATCH -> 1.2761834602584552
X         -         action    adventur  art       bottom    classes   co        customiz  developm  difficul  

44