# LSH TEST 
Uses locality-sensitive hashing on a large dataset to create subsets and find similar games for recommendations.

### Imports

In [1]:
# Library imports
import numpy as np
import spacy
import math
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
import gensim

nlp = spacy.load("en_core_web_sm")

# External file imports
import sys  
sys.path.insert(0, '../')
from Python.lsh import VanillaLSH

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# import the dataset
FEAT_FILE = "../data/game_datfeat.txt"

def loadFeatData(path=None):
    """Parse a game feature file into a dict keyed by game name.

    Each non-blank line is classified by its first character; the first two
    characters (the marker and the character after it) are dropped from the
    payload:
        "+"  starts a new game entry named by the payload
        "#"  comma-separated tags, stored lower-cased
        "@"  comma-separated entities, stored as written
        "-"  one feature string; may occur several times per game
    Blank lines and lines with any other marker are ignored.

    Args:
        path: file to read. Defaults to the module-level FEAT_FILE.

    Returns:
        dict mapping game name -> {"tags": [...], "entities": [...], "features": [...]}.

    Raises:
        KeyError: if a "#", "@" or "-" line appears before the first "+" line.
    """
    if path is None:
        path = FEAT_FILE

    DAT = {}
    CUR_GAME = ""
    with open(path, "r") as f:
        # iterate the file directly so the whole file is not held in memory twice
        for raw_line in f:
            line = raw_line.strip()
            # empty line (between entries)
            if not line:
                continue

            marker, payload = line[0], line[2:]
            if marker == "+":
                # new entry
                CUR_GAME = payload
                DAT[CUR_GAME] = {"tags": [], "entities": [], "features": []}
            elif marker == "#":
                DAT[CUR_GAME]["tags"] = [t.lower() for t in payload.split(",")]
            elif marker == "@":
                DAT[CUR_GAME]["entities"] = payload.split(",")
            elif marker == "-":
                DAT[CUR_GAME]["features"].append(payload)
    return DAT


#get all of the tags and the entities from the game data
def getTagsEntities(game_data=None):
    """Collect the unique, lower-cased tags and entities across all games.

    Args:
        game_data: dict of game name -> {"tags": [...], "entities": [...], ...}.
            Defaults to the module-level GAME_DATA.

    Returns:
        (tags, entities): two lists of unique lower-cased strings, each sorted
        so the order is the same on every run (plain set iteration order for
        strings can differ between interpreter runs).
    """
    if game_data is None:
        game_data = GAME_DATA

    tags = set()
    entities = set()
    for info in game_data.values():
        tags.update(t.lower() for t in info["tags"])
        entities.update(e.lower() for e in info["entities"])

    return sorted(tags), sorted(entities)


# parse the feature file once, then derive the tag / entity vocabularies from it
GAME_DATA = loadFeatData()
ALL_TAGS, ALL_ENTITIES = getTagsEntities()

# number of games loaded
print(len(GAME_DATA))

59771


### Encoding

In [7]:
#tokenizes the text
custom_stopwords = ["game", ",", ".", "!"]
_ENGLISH_STOPWORDS = None  # filled on first use by tokenize()

def tokenize(txt):
    """Split *txt* into lower-cased tokens, dropping stopwords, then add matching tags.

    Steps:
      1. word_tokenize the text and lower-case every token.
      2. Drop NLTK English stopwords and anything in custom_stopwords.
      3. Append every entry of ALL_TAGS that occurs in the lower-cased text
         and is not already a token (this is how multi-word tags get in).

    Args:
        txt: the raw text to tokenize.

    Returns:
        list of token strings; tokens from the text first, matched tags after.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # fetch the stopword list once and keep it as a set, instead of
        # asking NLTK for it again on every token
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))

    toks = []
    for raw_tok in word_tokenize(txt):
        w = raw_tok.lower()
        if w not in _ENGLISH_STOPWORDS and w not in custom_stopwords:
            toks.append(w)

    #add the custom tag words (can be compound words)
    # ALL_TAGS entries are lower-case, so compare against lower-cased text;
    # note this is a substring test, so a tag also matches inside a longer word
    lowered = txt.lower()
    present = set(toks)
    for t in ALL_TAGS:
        if t in lowered and t not in present:
            toks.append(t)
            present.add(t)
    return toks

# simplifies the prompt by tokenizing and turn into one string
def simplify(prompt):
    """Tokenize *prompt* and return its tokens joined by single spaces."""
    return " ".join(tokenize(prompt))

# encode all of the games for doc2vec
def encodeGames():
    """Build the doc2vec training corpus from GAME_DATA.

    Returns:
        (doc_games, GAMES):
            doc_games - one gensim TaggedDocument per game, tagged with the
                        game's position in GAME_DATA iteration order
            GAMES     - the simplified (tokenized and re-joined) text per game,
                        in the same order
    """
    # simplified text for every game: tags followed by entities
    GAMES = []
    for info in tqdm(GAME_DATA.values(), total=len(GAME_DATA), desc="Tokenzing games"):
        combined = " ".join(info["tags"] + info["entities"])
        GAMES.append(simplify(combined))

    # preprocess for doc2vec; the integer tag is the game's index in GAMES
    doc_games = [
        gensim.models.doc2vec.TaggedDocument(gensim.utils.simple_preprocess(text), [idx])
        for idx, text in enumerate(tqdm(GAMES, desc="Preprocessing games for doc2vec"))
    ]
    return doc_games, GAMES
    

In [4]:
# switch for the cell below: False trains and exports a doc2vec model, True loads saved files
IMPORT = False

In [6]:
# create from scratch
if not IMPORT:

    ### TRAIN MODEL ###

    # train_corpus: one TaggedDocument per game; TOK_GAMES: simplified text per game
    train_corpus, TOK_GAMES = encodeGames()

    #train doc2vec model for encoding
    doc2vec = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
    doc2vec.build_vocab(train_corpus)
    # build_vocab() only prepares the vocabulary; train() is the step that
    # fits the model on the corpus, so infer_vector() is meaningful afterwards
    doc2vec.train(train_corpus, total_examples=doc2vec.corpus_count, epochs=doc2vec.epochs)

    # test the model
    print(doc2vec.infer_vector(['top-down', 'multiplayer', 'action', 'knights', 'castles', 'assassinations']))

    ### VECTORIZE ###

    #get vector for each game
    # TOK_GAMES was built by iterating GAME_DATA, so it lines up with GAME_NAMES
    GAME_NAMES = list(GAME_DATA.keys())
    VEC_GAMES = []
    for name, text in tqdm(zip(GAME_NAMES, TOK_GAMES), total=len(GAME_NAMES), desc="Getting vectors for games"):
        VEC_GAMES.append({"name": name, "vector": doc2vec.infer_vector(text.split(" "))})

    ### EXPORT ###

    #export the model
    doc2vec.save("../models/doc2vec/game_doc2vec_model_mini")

    #export the game name, tokens, and vectors to a file
    # record layout: name line, token line, space-separated vector line, blank line
    with open("../models/doc2vec/game_doc2vec_dat_mini.txt", "w") as f:
        for g, text in zip(VEC_GAMES, TOK_GAMES):
            vstr = " ".join(str(v) for v in g["vector"])
            f.write(f"{g['name']}\n{text}\n{vstr}\n\n")

    print("> MODEL AND DATA EXPORTED!")

# otherwise import everything
else:
    #import the trained model
    doc2vec = gensim.models.doc2vec.Doc2Vec.load("../models/doc2vec/game_doc2vec_model_mini")

    #import the games and their vectors
    # NOTE(review): the export branch writes "game_doc2vec_dat_mini.txt" but this
    # reads "game_doc2vec_dat.txt" while loading the "_mini" model - confirm the
    # data file and the model are meant to come from the same run
    with open("../models/doc2vec/game_doc2vec_dat.txt", "r") as f:
        lines = [l.strip() for l in f.readlines()]
        VEC_GAMES = []
        TOK_GAMES = []
        # four lines per record: name, tokens, vector, blank separator
        for i in range(0, len(lines), 4):
            VEC_GAMES.append({"name":lines[i], "vector":np.array([float(v) for v in lines[i+2].split(" ")])})
            TOK_GAMES.append(lines[i+1])

    print("> MODEL AND DATA IMPORTED!")
            


> MODEL AND DATA IMPORTED!


### USE LSH WITH DATASET

In [10]:
# setup the vanilla lsh and hash
# vector length is read from the first game - assumes every game vector has the same length
vec_dim = len(VEC_GAMES[0]["vector"])
# NOTE(review): the meaning of the first three arguments (250, 50, 1) is defined
# by VanillaLSH in Python.lsh, which is not visible here - confirm before tuning
van_lsh = VanillaLSH(250, 50, 1, vec_dim)
# games are hashed in VEC_GAMES order; getClosestGames later uses the values
# returned by query() as positions in VEC_GAMES
for game in tqdm(VEC_GAMES, desc="Hashing games"):
    van_lsh.hash(game["vector"])


Hashing games: 100%|██████████| 59771/59771 [03:36<00:00, 276.19it/s] 


In [11]:
# return games that are close to the prompt
def getClosestGames(prompt):
    """Return the names of the games the LSH index reports as close to *prompt*.

    The prompt is tokenized, turned into a doc2vec vector, and used to query
    van_lsh; each value in the query result is used as a position in VEC_GAMES.
    """
    query_vec = doc2vec.infer_vector(tokenize(prompt))
    return [VEC_GAMES[idx]["name"] for idx in van_lsh.query(query_vec)]

In [14]:
# example prompts; the trailing "#recs:" notes presumably list the games the
# author hoped to see recommended - they are not used by the code
prompts = [
    "a pixel skateboarding game in a cyberpunk future",                  #recs: the ramp, jet set radio, cyberpunk 2077
    "a cute, relaxing, atmospheric farming rpg with animal characters",  #recs: stardew valley, night in the woods, a short hike
    "a physics-based shopping simulator in a 90s mall",                   #recs: goat simulator, retrowave, donut county
    # "a first-person goat skateboarding game"
    ]  

# for every prompt, print it, the number of games returned, and their names
for p in prompts:
    print(f"> {p}")
    recs = getClosestGames(p)
    print(f"GAMES: {len(recs)}")
    print(f"RECOMMENDATIONS: {recs}")
    print("")

> a pixel skateboarding game in a cyberpunk future
GAMES: 52
RECOMMENDATIONS: ["PENNY ARCADE'S ON THE RAIN-SLICK PRECIPICE OF DARKNESS 3", 'PATRICIAN IV', 'KUNG-FU MASTER', 'RAT HOTEL', 'BURNOUT: CHAMPIONSHIP DRAG RACING', 'THE UNSOLVED: HYPER SCIENCE ADVENTURE', 'HELLO KITTY NO MAGICAL MUSEUM', 'KNUCKLE BASH 2', 'WHITE BREATH: PERFECT EDITION', 'BYTEPATH', 'SPIRIT OF ADVENTURE', 'HOLLYWOOD HERO: COMEBACK', 'CHOPPER DROP', 'ESTRANGED: ACT II', 'THE GREAT BATTLE IV', 'MEGA MAN 5', "MEGA MAN 8: ANNIVERSARY COLLECTOR'S EDITION", 'SUPER GODZILLA', 'MICKEY TO DONALD: MAGICAL ADVENTURE 3', 'RIDICULOUS REALITY', 'GRAND PRIX RALLY II', 'HAVE A N.I.C.E. DAY!', 'INITIAL D', 'ALIEN BASH II', 'DANCE DANCE REVOLUTION: HOTTEST PARTY 5', 'DEAD REEFS', 'MOTHER 25TH ANNIVERSARY EDITION', 'TAKEN SOULS: BLOOD RITUAL', 'JOHN DEERE: HARVEST IN THE HEARTLAND', 'ACA NEOGEO BURNING FIGHT', 'POCOYO PARTY', 'FARNHAM FABLES', 'THE GREAT ACE ATTORNEY CHRONICLES', 'AVOID THE AWFUL THING THAT VAGUELY RESEMBLES A BA

In [16]:
BASE_GAME = "STARDEW VALLEY"
SANITY_GAMES = ["MEGA MAN 5", "HARVEST MOON GBC", "BABA IS YOU"]

# look up the vectors by name in a single pass over VEC_GAMES,
# keeping the first match when a name occurs more than once
wanted = {BASE_GAME, *SANITY_GAMES}
found_vecs = {}
for g in VEC_GAMES:
    if g["name"] in wanted and g["name"] not in found_vecs:
        found_vecs[g["name"]] = g["vector"]

# get the vector for the base game (None when it is not in VEC_GAMES)
base_vec = found_vecs.get(BASE_GAME)

# get the vectors for the sanity games, in SANITY_GAMES order so that each
# printed name is paired with its own vector (None when a game is missing)
sanity_vecs = [found_vecs.get(name) for name in SANITY_GAMES]

# print the euclidean distance between the base game and each sanity game
# (smaller value = closer vectors)
print(f"> EUCLIDEAN DISTANCE BETWEEN {BASE_GAME} AND SANITY GAMES")
if base_vec is None:
    print(f"{BASE_GAME}: not found in VEC_GAMES")
else:
    for name, vec in zip(SANITY_GAMES, sanity_vecs):
        if vec is None:
            print(f"{name}: not found in VEC_GAMES")
            continue
        dist = euclidean_distances(base_vec.reshape(1, -1), vec.reshape(1, -1))[0][0]
        print(f"{name}: {dist}")

> COSINE SIMILARITY BETWEEN STARDEW VALLEY AND SANITY GAMES
DOOM, THE ROGUELIKE: 0.05775723614468901
HARVEST MOON GBC: 0.04661868511385689
BABA IS YOU: 0.06437132771768969


### Bag of Words

In [3]:
# create the encoding format for the bag of words
def makeBag():
    """Map every unique tag and entity to its position in the sorted vocabulary.

    Returns:
        dict of token -> index, where indices follow the sorted order of the
        union of ALL_TAGS and ALL_ENTITIES.
    """
    vocabulary = sorted(set(ALL_TAGS) | set(ALL_ENTITIES))
    return {token: position for position, token in enumerate(vocabulary)}

# return encoding for a bag of words
def getBag(bag_idx, words):
    """Encode *words* as a 0/1 presence vector over the vocabulary in *bag_idx*.

    Args:
        bag_idx: dict mapping token -> position in the vector.
        words: iterable of tokens to mark as present.

    Returns:
        list of ints of length len(bag_idx); position bag_idx[w] is 1 for every
        w in words that is in the vocabulary, all other positions are 0.
        Words missing from bag_idx are skipped, so free-text input (e.g. prompt
        tokens) does not raise KeyError.
    """
    bag = [0] * len(bag_idx)
    for w in words:
        idx = bag_idx.get(w)
        if idx is not None:
            bag[idx] = 1
    return bag


# encode each game as a bag of words set
def bagGames(bag_idx):
    """Return a dict of game name -> bag-of-words vector for every game in GAME_DATA.

    Each game's words are its tags followed by its entities, encoded with getBag.
    """
    return {
        name: getBag(bag_idx, GAME_DATA[name]["tags"] + GAME_DATA[name]["entities"])
        for name in tqdm(list(GAME_DATA.keys()), desc="Encoding games as bag of words")
    }


In [4]:
# token -> vector position lookup, then one 0/1 vector per game built with it
BAG_IDX = makeBag()
BAG_GAMES = bagGames(BAG_IDX)

Encoding games as bag of words: 100%|██████████| 59771/59771 [02:24<00:00, 413.73it/s]


In [22]:
### LSH ###

# setup the vanilla lsh and hash
# every bag produced by getBag has one slot per BAG_IDX entry, so the vector
# size comes from the vocabulary instead of from one hard-coded game name
vec_dim = len(BAG_IDX)
# NOTE(review): the meaning of the first three arguments (2, 2, 1) is defined
# by VanillaLSH in Python.lsh, which is not visible here - confirm before tuning
van_lsh = VanillaLSH(2, 2, 1, vec_dim)

# hash and save the index
# game_idx[i] is the name of the i-th game hashed; getClosestGames uses the
# values returned by query() as positions in this list
game_idx = []
for k,v in tqdm(BAG_GAMES.items(), desc="Hashing games"):
    van_lsh.hash(v)
    game_idx.append(k)

Hashing games: 100%|██████████| 59771/59771 [03:46<00:00, 264.06it/s]


In [23]:
# return games that are close to the prompt
def getClosestGames(prompt):
    """Return the names of the games the LSH index reports as close to *prompt*.

    The prompt is tokenized and encoded as a bag-of-words vector over BAG_IDX,
    then used to query van_lsh. The raw query result is printed, and each value
    in it is used as a position in game_idx.
    """
    query_bag = getBag(BAG_IDX, tokenize(prompt))

    #get the closest games
    cgi = van_lsh.query(query_bag)
    print(cgi)
    return [game_idx[i] for i in cgi]


# example prompts; the trailing "#recs:" notes presumably list the games the
# author hoped to see recommended - they are not used by the code
prompts = [
    "a pixel skateboarding game in a cyberpunk future",                  #recs: the ramp, jet set radio, cyberpunk 2077
    "a cute, relaxing, atmospheric farming rpg with animal characters",  #recs: stardew valley, night in the woods, a short hike
    "a physics shopping simulator in a 90s mall",                   #recs: goat simulator, retrowave, donut county
    # "a first-person goat skateboarding game"
    ]  

# for every prompt, print it, the number of games returned, and their names
for p in prompts:
    print(f"> {p}")
    recs = getClosestGames(p)
    print(f"GAMES: {len(recs)}")
    print(f"RECOMMENDATIONS: {recs}")
    print("")

> a pixel skateboarding game in a cyberpunk future
[439, 836, 1070, 1137, 1289, 1484, 2322, 3473, 3931, 4099, 4188, 4561, 5268, 5376, 5756, 5989, 6090, 6140, 6188, 6383, 6458, 6952, 7029, 7214, 7408, 7513, 8352, 8788, 8910, 9010, 9076, 9346, 9390, 9576, 9657, 10434, 10448, 10530, 10536, 10768, 10787, 10993, 11031, 11419, 11437, 11474, 11512, 11862, 12036, 12059, 12076, 12244, 12275, 12352, 12354, 12423, 12787, 13017, 13201, 13610, 13628, 13671, 13698, 13707, 13780, 13988, 14037, 14259, 14370, 14627, 14716, 15140, 15318, 15347, 15434, 15568, 15751, 16418, 16420, 16474, 16564, 16791, 16870, 16976, 17156, 17291, 17377, 17537, 17624, 17714, 17960, 18007, 18018, 18136, 18402, 18462, 18537, 18754, 18915, 19214, 19364, 19722, 20225, 20244, 20249, 20326, 20519, 20689, 21194, 21295, 21332, 21426, 21485, 21629, 21924, 21940, 22181, 22285, 22326, 22350, 22815, 23050, 23122, 23155, 23480, 23528, 23747, 23780, 24071, 24136, 24265, 25420, 25983, 26224, 26442, 26742, 26759, 27104, 27629, 27811, 28000

In [24]:
BASE_GAME = "STARDEW VALLEY"
SANITY_GAMES = ["DOOM, THE ROGUELIKE", "HARVEST MOON GBC", "BABA IS YOU", "ULTRAKILL", "THIEF", "FEZ"]

# get the bag-of-words vector for the base game and for each sanity game
# (a name that is not a key of BAG_GAMES raises KeyError here)
base_vec = np.array(BAG_GAMES[BASE_GAME])
sanity_vecs = np.array([BAG_GAMES[g] for g in SANITY_GAMES])

# print the euclidean distance between the base game and each sanity game
# (this is a distance, not a similarity: smaller values mean closer vectors)
print(f"> EUCLIDEAN SIMILARITY BETWEEN {BASE_GAME} AND SANITY GAMES")
for i, g in enumerate(SANITY_GAMES):
    # reshape(1, -1) turns each 1-D vector into the single-row 2-D array passed to euclidean_distances
    print(f"{g}: {euclidean_distances(base_vec.reshape(1, -1), sanity_vecs[i].reshape(1, -1))[0][0]}")
    

> EUCLIDEAN SIMILARITY BETWEEN STARDEW VALLEY AND SANITY GAMES
DOOM, THE ROGUELIKE: 9.695359714832659
HARVEST MOON GBC: 7.874007874011811
BABA IS YOU: 7.874007874011811
ULTRAKILL: 8.54400374531753
THIEF: 8.94427190999916
FEZ: 8.888194417315589
