In [16]:
import re
import nltk
import string
import hashlib
import numpy as np
import pandas as pd

from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/egiovanni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
def clean_string(row, stem=None):
    text = row[2]
    # Make lower
    text = text.lower()

    # Remove line breaks
    text = re.sub(r'\n', '', text)

    # Keep only letters
    text = re.sub('[^a-z ]+', '', text)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']
    text_filtered = [word for word in text if not word in useless_words]

    return ' '.join(text_filtered)

In [18]:
data = pd.read_csv(
    'data/switch-games-titles.txt',
    delimiter='|'
    )
data = data.dropna()
data['id'] = data.apply(lambda x: hashlib.md5(x[2].encode()).hexdigest()[:8], axis=1)
data['clean_review'] = data.apply(clean_string, axis=1)
data

Unnamed: 0,title,review_url,review,id,clean_review
0,Thymesia Review,https://www.ign.com/articles/thymesia-review,It’s always a shame when a game manages to get...,cf5ae401,always shame game manages get things right oth...
1,Cult of the Lamb Review,https://www.ign.com/articles/cult-of-the-lamb-...,"When I started Cult of the Lamb, I wasn’t expe...",ca52879c,started cult lamb wasnt expecting shoveling mu...
2,Digimon Survive Review,https://www.ign.com/articles/digimon-survive-r...,"You ever see someone with massive, toned arms ...",359455cc,ever see someone massive toned arms embarrassi...
3,Two Point Campus Review,https://www.ign.com/articles/two-point-campus-...,Two Point Campus continues in the already impr...,a4ed8c25,two point campus continues already impressive ...
4,Xenoblade Chronicles 3 Review,https://www.ign.com/articles/xenoblade-chronic...,When it comes to over-the-top roleplaying game...,f5324fe1,comes overthetop roleplaying games hard beat b...
...,...,...,...,...,...
265,Ibb & Obb Review,https://www.ign.com/articles/2013/08/07/ibb-an...,It’s strange that something so calming can som...,f1a55058,strange something calming sometimes grow frust...
266,Limbo Review,https://www.ign.com/articles/2011/08/02/limbo-...,Video games are an art form made up of visuals...,c8ecc519,video games art form made visuals sound myster...
267,Resident Evil 5: Gold Edition Review,https://www.ign.com/articles/2010/03/17/reside...,Change can be a painful process. Just ask one ...,63c47b16,change painful process ask one unfortunate cit...
268,World of Goo Review,https://www.ign.com/articles/2008/10/20/world-...,"Simple concept, executed to perfection. That's...",3769978f,simple concept executed perfection thats gamin...


In [20]:
data_titles = data.loc[:, ['id', 'title']]
data_titles.to_json('meilisearch/data.json', orient='records', force_ascii=False)
data_titles

Unnamed: 0,id,title
0,cf5ae401,Thymesia Review
1,ca52879c,Cult of the Lamb Review
2,359455cc,Digimon Survive Review
3,a4ed8c25,Two Point Campus Review
4,f5324fe1,Xenoblade Chronicles 3 Review
...,...,...
265,f1a55058,Ibb & Obb Review
266,c8ecc519,Limbo Review
267,63c47b16,Resident Evil 5: Gold Edition Review
268,3769978f,World of Goo Review


In [21]:
corpus = data['clean_review'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

array(['aaa', 'aaah', 'aang', ..., 'zubatspeaking', 'zx', 'zxthe'],
      dtype=object)

In [22]:
X.toarray().shape

(270, 20851)

In [23]:
tree = spatial.KDTree(X.toarray())

In [15]:
index = 181
game_title = data['title'].iloc[index]
game_array = X[index].toarray()[0]
games_closest = tree.query(game_array, k=11)[1]
print(game_title)
print(game_array)
print(games_closest)
for ind, game in enumerate(games_closest):
    if ind == 0: continue
    game_closest_title = data['title'].iloc[game]
    print(game_closest_title)

Super Smash Bros. Ultimate Review
[0. 0. 0. ... 0. 0. 0.]
[181 169 109  44 143 136 119 189 178 158 161]
Super Smash Bros. Ultimate: Joker Challenger Pack DLC Review
Super Mario Bros. 35 Review
Nickelodeon All-Star Brawl Review
Super Smash Bros. Ultimate - Terry Bogard DLC Review
Super Smash Bros. Ultimate - Byleth DLC Review
Super Smash Bros. Ultimate - Min Min DLC Review
Nintendo Switch Review - 2018
New Super Mario Bros. U Deluxe Review
Super Smash Bros. Ultimate: Hero DLC Review 
Marvel Ultimate Alliance 3: The Black Order Review
