## Game Feature Recommendation System
Given a user prompt, recommends game features extracted from game database information

In [78]:
# Library imports

import numpy as np
import spacy
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load("en_core_web_sm")


In [79]:
# External data imports
def loadFeatData(path="../data/game_datfeat.txt"):
    """Parse the game feature file into a dict keyed by game name.

    The first character of each non-empty (stripped) line decides how the
    line is interpreted:

    * ``+``  starts a new game entry; the name is everything from the
      third character on
    * ``#``  comma-separated tags for the current game (stored lowercased)
    * ``@``  comma-separated entities for the current game
    * anything else is appended to the current game's feature list

    Args:
        path: Location of the feature file. Defaults to the path this
            notebook has always read, so ``loadFeatData()`` is unchanged.

    Returns:
        dict mapping game name -> {"tags": [...], "entities": [...],
        "features": [...]}.

    Raises:
        KeyError: if a tag, entity or feature line appears before any
            ``+`` line (there is no current game to attach it to).
    """
    def splitItems(raw):
        # Trim every comma-separated item and drop blanks: an empty tag
        # would otherwise end up in the vocabulary, and "" is a substring
        # of every prompt.
        return [item.strip() for item in raw.split(",") if item.strip()]

    DAT = {}
    CUR_GAME = ""
    with open(path, "r") as f:
        # Iterate the file directly instead of materialising readlines().
        for raw_line in f:
            l = raw_line.strip()
            # empty line (between entries)
            if l == "":
                continue
            marker = l[0]
            if marker == "+":
                # new entry
                CUR_GAME = l[2:]
                DAT[CUR_GAME] = {"tags":[],"entities":[],"features":[]}
            elif marker == "#":
                DAT[CUR_GAME]["tags"] = [t.lower() for t in splitItems(l[2:])]
            elif marker == "@":
                DAT[CUR_GAME]["entities"] = splitItems(l[2:])
            else:
                DAT[CUR_GAME]["features"].append(l)
    return DAT


#get all of the tags and the entities from the game data
def getTagsEntities(game_data=None):
    """Collect the distinct lowercase tags and entities across all games.

    Args:
        game_data: Optional dict of game name -> {"tags": [...],
            "entities": [...], ...}. When omitted, the module-level
            GAME_DATA is used, so existing no-argument calls still work.

    Returns:
        (tags, entities): two lists of unique lowercase strings, each
        sorted alphabetically so the order is the same on every run
        (a bare list(set(...)) order can change between kernel restarts).
    """
    if game_data is None:
        game_data = GAME_DATA

    tag_vocab = set()
    entity_vocab = set()
    for info in game_data.values():
        tag_vocab.update(t.lower() for t in info["tags"])
        entity_vocab.update(e.lower() for e in info["entities"])

    return sorted(tag_vocab), sorted(entity_vocab)


# Parse the feature file once, then derive the lowercase, de-duplicated
# tag and entity vocabularies that the cells below read as globals.
GAME_DATA = loadFeatData()
ALL_TAGS, ALL_ENTITIES = getTagsEntities()


## EXPERIMENTS
* Sample input text: "_top-down multiplayer action game about knights, castles, and assassinations_"
* Target selection games: [Castle Crashers, Hammerwatch, Kingsway], Thief Gold, Skyrim, Elden Ring

In [134]:
# general functions
# Selects which experiment cell below is active; the cells in this notebook
# check for "VECTOR", "COUNT", "TRANSFORMER" and "CT_TRANS".
TRAIN_MODE = "CT_TRANS"
# Sentence-transformer instance; stays None until a transformer-based cell loads it.
SENT_MODEL = None
# Extra tokens that tokenize() drops in addition to NLTK's English stopwords.
custom_stopwords = ["game", ",", ".", "!"]

#setup abstract function
def getClosestGames(txt,k=3):
    """Fallback used when no experiment cell has replaced this function.

    Every real implementation in this notebook returns a pair
    ``(ranked_games, extra_info)`` and recommendFeatures() unpacks exactly
    two values, so the fallback returns the same shape: no games and no
    extra info.

    Args:
        txt: The user prompt (unused here).
        k: Number of games requested (unused here).

    Returns:
        ([], None)
    """
    return [], None

#tokenizes the text
def tokenize(txt):
    """Turn a prompt into lowercase keyword tokens.

    Steps:
      1. Split with NLTK's word_tokenize and lowercase each token.
      2. Drop English stopwords and the entries of custom_stopwords.
      3. Append every tag from ALL_TAGS that occurs as a substring of the
         prompt and is not already a token (this is how compound tags are
         picked up).

    Args:
        txt: The raw user prompt.

    Returns:
        List of lowercase tokens, followed by any extra matched tags.
    """
    # Build the exclusion set once per call; the previous version rebuilt
    # the stopword list for every single token.
    blocked = set(stopwords.words("english"))
    blocked.update(custom_stopwords)

    toks = []
    for w in word_tokenize(txt):
        lw = w.lower()
        if lw not in blocked:
            toks.append(lw)

    #add the custom tag words (can be compound words)
    # Tags are stored lowercase, so compare against a lowercased prompt;
    # otherwise a capitalised prompt never matches a compound tag.
    lowered_txt = txt.lower()
    for t in ALL_TAGS:
        if t in lowered_txt and t not in toks:
            toks.append(t)
    return toks

### VECTORIZE THE TAGS + ENTITIES
- Note: Completely wrong. Vectorizing doesn't work with an unbalanced dataset - it picked the matches with the least amount of data in their vectors
- Result: ["The Ramp", "Batman: Arkham City", "Among Us"]

In [135]:
###   VECTOR THE TAGS AND ENTITIES   ###
if TRAIN_MODE == "VECTOR":

    # get all of the tags and entities (lowercased, duplicates removed)
    # via the shared helper rather than repeating its loop in this cell
    ALL_TAGS, ALL_ENTITIES = getTagsEntities()

    # word -> vector position, built once so each lookup is a dict hit
    # instead of a list scan
    TAG_POS = {t: i for i, t in enumerate(ALL_TAGS)}
    ENTITY_POS = {e: i for i, e in enumerate(ALL_ENTITIES)}

    def _multiHot(items, positions):
        """Return a 0/1 list with a 1 at the position of every known item.

        Items are lowercased before lookup; items missing from
        ``positions`` are ignored.
        """
        vec = [0]*len(positions)
        for item in items:
            idx = positions.get(item.lower())
            if idx is not None:
                vec[idx] = 1
        return vec

    #make a tag embedding
    def tagEmbed(tags):
        """0/1 vector over ALL_TAGS marking which of ``tags`` are known tags."""
        return _multiHot(tags, TAG_POS)

    #make a entity embedding
    def entEmbed(entities):
        """0/1 vector over ALL_ENTITIES marking which of ``entities`` are known."""
        return _multiHot(entities, ENTITY_POS)


    # make the embeddings for each game
    TAG_EMBEDDINGS = {g: tagEmbed(GAME_DATA[g]["tags"]) for g in GAME_DATA}
    ENTITY_EMBEDDINGS = {g: entEmbed(GAME_DATA[g]["entities"]) for g in GAME_DATA}

    #make a combined embedding (tag vector followed by entity vector)
    # NOTE: the variable name keeps its original spelling so any other
    # reference to it keeps working.
    FULL_EMVEDDINGS = {g: TAG_EMBEDDINGS[g]+ENTITY_EMBEDDINGS[g] for g in GAME_DATA}


    ## GET CLOSEST GAME FROM TEXT BASED ON EMBEDDINGS ##
    def getClosestGames(txt,k=3):
        """Rank games by Euclidean distance between 0/1 tag+entity vectors.

        Args:
            txt: The user prompt.
            k: How many games to return.

        Returns:
            (ranked, None) where ranked is a list of up to k
            (game, distance) pairs, smallest distance first.
        """
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # get the embedding for the text (converted to an array once,
        # not once per game)
        txt_embed = np.array(tagEmbed(toks)+entEmbed(toks))

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            dists[g] = np.linalg.norm(txt_embed - np.array(FULL_EMVEDDINGS[g]))
        ranked = sorted(dists.items(), key=lambda x: x[1])
        return ranked[:k], None
    

### COUNT THE MATCHES
- Note: Doesn't get the right matches for the secondary choices - especially if the tags themselves are too general. No way to account for tie-breakers
- Result: ["Castle Crashers", "Hammerwatch", "Among Us"]

In [136]:
# count the number of tags and entities matches
if TRAIN_MODE == "COUNT":

    # get the closest games using count matches
    def getClosestGames(txt,k=3):
        """Rank games by how many prompt tokens appear in their tags/entities.

        Args:
            txt: The user prompt.
            k: How many games to return.

        Returns:
            (ranked, matches): ranked is a list of up to k (game, count)
            pairs, highest count first; matches maps each game that had at
            least one hit to the list of tokens that matched.
        """
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # Tokens are lowercase, but entities are stored with their original
        # casing, so compare against a lowercased keyword set per game.
        keywords = {}
        for g, info in GAME_DATA.items():
            keywords[g] = {w.lower() for w in info["tags"]} | {w.lower() for w in info["entities"]}

        # initialize the counts
        cts = {g: 0 for g in GAME_DATA}

        # count the matches for each game
        matches = {}
        for t in toks:
            for g in GAME_DATA:
                if t in keywords[g]:
                    cts[g] += 1
                    matches.setdefault(g, []).append(t)

        # sort the counts
        ranked = sorted(cts.items(), key=lambda x: x[1], reverse=True)
        return ranked[:k], matches

### BERT SENTENCE TRANSFORMER MODEL
- Note: Ok, but places too much weight on the word semantics themselves.
- Result: ["Kingsway", "Hammerwatch", "Batman: Arkham City"]

In [137]:
if TRAIN_MODE == "TRANSFORMER":
    # load the model only once; re-running this cell reuses the instance
    # (identity check is the idiomatic test for the None sentinel)
    if SENT_MODEL is None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')

In [138]:
if TRAIN_MODE == "TRANSFORMER":

    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        """Join a game's tags, then its entities, into one space-separated string."""
        tag_part = " ".join(GAME_DATA[g]["tags"])
        entity_part = " ".join(GAME_DATA[g]["entities"])
        return tag_part+" "+entity_part

    # encode the game data
    def encodeGames():
        """Encode every game's sentence with SENT_MODEL, showing a progress bar.

        Returns:
            dict mapping game name -> the value returned by SENT_MODEL.encode
            for that game's sentence.
        """
        encoded = {}
        for name in tqdm(GAME_DATA, total=len(GAME_DATA)):
            encoded[name] = SENT_MODEL.encode(sentGame(name))
        return encoded

    GAME_DAT_EMB = encodeGames()

In [139]:
if TRAIN_MODE == "TRANSFORMER":
    # find the closest game based on the encoded game data
    def getClosestGames(txt,k=3):
        """Rank games by cosine similarity between prompt and game embeddings.

        Args:
            txt: The user prompt.
            k: How many games to return.

        Returns:
            (ranked, None) where ranked is a list of up to k
            (game, similarity) pairs, highest similarity first. Each
            similarity is a plain float.
        """
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text (a one-item batch); converted to an array once
        # rather than once per game
        txt_embed = np.array(SENT_MODEL.encode([" ".join(toks)]))

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            gemb = np.array(GAME_DAT_EMB[g])
            # One row compared with one row yields a 1x1 matrix; keep the
            # scalar so sorting and printing deal with plain numbers.
            dists[g] = float(cosine_similarity(txt_embed,[gemb])[0, 0])
        ranked = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        return ranked[:k], None

### TAG MATCH + SENTENCE TRANSFORMER SIMILARITY
- Note: Ok, but has both problems of the counter and transformer experiments. Too much weight on tags, not enough on less frequently occurring entities
- Result: ["Hammerwatch", "Binding of Isaac", "Castle Crashers"]

In [140]:
if TRAIN_MODE == "CT_TRANS":
    # load the model only once; re-running this cell reuses the instance
    # (identity check is the idiomatic test for the None sentinel)
    if SENT_MODEL is None:
        SENT_MODEL = SentenceTransformer('bert-base-nli-mean-tokens')
    

In [141]:
if TRAIN_MODE == "CT_TRANS":
    # turn game data into a "sentence" for the sentence-transformer
    def sentGame(g):
        """Join a game's tags, then its entities, into one space-separated string."""
        tag_part = " ".join(GAME_DATA[g]["tags"])
        entity_part = " ".join(GAME_DATA[g]["entities"])
        return tag_part+" "+entity_part

    # encode the game data
    def encodeGames():
        """Encode every game's sentence with SENT_MODEL, showing a progress bar.

        Returns:
            dict mapping game name -> the value returned by SENT_MODEL.encode
            for that game's sentence.
        """
        encoded = {}
        for name in tqdm(GAME_DATA, total=len(GAME_DATA)):
            encoded[name] = SENT_MODEL.encode(sentGame(name))
        return encoded

    GAME_DAT_EMB = encodeGames()

    # count the number of tags found in each game's list
    def tagsFound(txt):
        """Count, per game, how many prompt tokens are among that game's tags.

        Returns:
            dict of game name -> match count, ordered from most to fewest
            matches (every game is present, including those with 0).
        """
        # tokenize and remove all stop words
        prompt_toks = tokenize(txt)

        # one pass per game: tally the tokens that appear in its tag list
        tallies = {}
        for name in GAME_DATA:
            game_tags = GAME_DATA[name]["tags"]
            tallies[name] = sum(1 for tok in prompt_toks if tok in game_tags)

        # highest counts first
        ordered = sorted(tallies.items(), key=lambda pair: pair[1], reverse=True)
        return dict(ordered)


100%|██████████| 50/50 [00:10<00:00,  4.87it/s]


In [142]:
if TRAIN_MODE == "CT_TRANS":
    # get the closest games using count matches and the sentence transformer distance metric
    def getClosestGames(txt,k=3):
        """Rank games by cosine similarity scaled by the number of tag matches.

        Args:
            txt: The user prompt.
            k: How many games to return.

        Returns:
            (ranked, None) where ranked is a list of up to k
            (game, score) pairs, highest score first. Each score is a plain
            float: similarity * tag-match count, so games with no matching
            tag score 0.
        """
        # tokenize and remove all stop words
        toks = tokenize(txt)

        # encode the text (a one-item batch); converted to an array once
        # rather than once per game
        txt_embed = np.array(SENT_MODEL.encode([" ".join(toks)]))

        # get the closest games using distance metrics
        dists = {}
        for g in GAME_DATA:
            gemb = np.array(GAME_DAT_EMB[g])
            # One row compared with one row yields a 1x1 matrix; keep the
            # scalar so sorting and printing deal with plain numbers.
            dists[g] = float(cosine_similarity(txt_embed,[gemb])[0, 0])

        #multiply by the tag matches
        TAG_MATCHES = tagsFound(txt)
        print(TAG_MATCHES)
        for g in dists:
            dists[g] *= TAG_MATCHES[g]

        # sort by score
        ranked = sorted(dists.items(), key=lambda x: x[1], reverse=True)
        return ranked[:k], None

## RUN RECOMMENDATION SYSTEM

#### Steps: 
1. Load the data (games => tags, entities, features)
2. Get a user prompt for a game and its genre
3. Recommend some random features based on the text similarity and closest tags

* Note: Keep in mind, there are over 150k games with tags, features, and entities - the search cannot be too large


In [143]:
# recommend some features to the user based on a prompt
def recommendFeatures(userPrompt):
    """Print the closest games for a prompt together with their features.

    Calls getClosestGames(userPrompt), which must return a pair
    ``(ranked_games, extra_info)``. Prints the ranked list, the extra info
    when it is truthy, and then for each ranked game its score, its
    tag/entity summary and its stored feature list.

    Args:
        userPrompt: Free-text description of the desired game.

    Returns:
        None. All output goes to stdout.
    """
    # get the closest game
    closestGames, other = getClosestGames(userPrompt)
    print(closestGames)

    #show more debug info
    if other:
        print(other)
        print("")

    # get the features for that game
    for g, v in closestGames:
        info = GAME_DATA[g]
        print(f"{g} -> {v}")
        # Build the tag/entity summary here instead of calling sentGame(),
        # which only exists when TRAIN_MODE is "TRANSFORMER" or "CT_TRANS".
        print(" ".join(info["tags"])+" "+" ".join(info["entities"]))
        print(info["features"])
        print("")

In [144]:
# Run the sample prompt from the EXPERIMENTS section: show its tokens first,
# then the recommendations from whichever getClosestGames is currently defined.
user_txt = "top-down multiplayer action game about knights, castles, and assassinations"  #ideally should return "castle crashers" or "hammerwatch" or "thief"
print(tokenize(user_txt))
recommendFeatures(user_txt)

['top-down', 'multiplayer', 'action', 'knights', 'castles', 'assassinations']
{'HAMMERWATCH': 3, 'AMONG US': 2, 'BATTLEBLOCK THEATER': 2, 'THE BINDING OF ISAAC': 2, 'CASTLE CRASHERS': 2, 'CUPHEAD': 2, 'GRAND THEFT AUTO V': 2, 'HYPER LIGHT DRIFTER': 2, 'MINI NINJAS': 2, 'NIDHOGG': 2, 'SCRIBBLENAUTS UNLIMITED': 2, 'BABA IS YOU': 1, 'BATMAN: ARKHAM CITY': 1, 'DOWNWELL': 1, 'THE ELDER SCROLLS V: SKYRIM': 1, 'GOAT SIMULATOR': 1, 'JET SET RADIO': 1, "MIRROR'S EDGE": 1, 'MONSTER PROM': 1, 'OVERCOOKED! 2': 1, 'PORTAL': 1, 'QUADRILATERAL COWBOY': 1, 'SPELUNKY': 1, 'STARDEW VALLEY': 1, 'STREETS OF ROGUE': 1, 'SUPER FANCY PANTS ADVENTURE': 1, 'SUPER MEAT BOY': 1, 'THIEF GOLD': 1, 'CAVEBLAZERS': 0, 'CHROMA SQUAD': 0, 'CYBERPUNK 2077': 0, 'DONUT COUNTY': 0, 'ELDEN RING': 0, 'EMILY IS AWAY TOO': 0, 'ESCAPE SIMULATOR': 0, 'FEZ': 0, 'HADES': 0, 'KINDERGARTEN': 0, 'KINGSWAY': 0, 'LAST CALL BBS': 0, 'NIGHT IN THE WOODS': 0, 'OMORI': 0, 'OUTER WILDS': 0, 'THE RAMP': 0, 'RETROWAVE': 0, 'A SHORT HIKE': 0, 

In [145]:
# Inspect the stored tags/entities and features for a few hand-picked games.
for gt in ["CASTLE CRASHERS", "HYPER LIGHT DRIFTER", "THIEF GOLD"]:
    print(gt)
    # Join tags and entities inline: sentGame() is only defined when
    # TRAIN_MODE is "TRANSFORMER" or "CT_TRANS".
    print(" ".join(GAME_DATA[gt]["tags"])+" "+" ".join(GAME_DATA[gt]["entities"]))
    print(GAME_DATA[gt]["features"])
    print("")



CASTLE CRASHERS
action casual singleplayer adventure rpg 2d multiplayer cute arcade funny comedy co-op cartoony medieval online co-op local multiplayer hack and slash local co-op 4 player local beat 'em up adventure arcade award castles characters crashers friends hack hand hi kingdom princess res slash victory visuals way
['defend your kingdom', 'play locally or online to save your princess', 'smash your way to victory', 'crash some castles']

HYPER LIGHT DRIFTER
indie action singleplayer adventure rpg 2d atmospheric pixel graphics fantasy colorful exploration sci-fi difficult top-down great soundtrack action rpg hack and slash post-apocalyptic metroidvania souls-like action adventure bit blood classics collectors designs disease drifter drifters echoes grander histories illness knowledge land lands mechanics past resonate rpg scale technologies treasure vein way world
['quiet the vicious disease']

THIEF GOLD
action singleplayer adventure atmospheric story rich fantasy exploration fi