In [1]:
import re
import nltk
import hashlib
import pandas as pd

# Model
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
def clean_string(row, stem=None):
    text = row[9]
    # Make lower
    text = text.lower()

    # Remove line breaks
    text = re.sub(r'\n', '', text)

    # Keep only letters
    text = re.sub('[^a-z ]+', '', text)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']
    text_filtered = [word for word in text if not word in useless_words]

    return ' '.join(text_filtered)

In [11]:
data = pd.read_csv(
    '../dataset/switch-games-id.csv',
    delimiter=',',
)
data['clean_review'] = data.apply(clean_string, axis=1)
data[['id', 'clean_review']].to_csv('../dataset/switch-games-clean-reviews.csv', index=False, sep=',')

# Model

In [8]:
def create_model(data):
    # Load data
    data = pd.read_csv(
        '../dataset/switch-games-clean-reviews.csv',
        delimiter=','
    )
    # My model will be a simple tf-idf over all reviews
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['clean_review'])
    X.shape

    # Tree to do a fast search over distances
    tree = spatial.KDTree(X.toarray())
    return X, tree


def predict(id):
    index = data.index[data['id'] == id][0]
    game_array = X[index].toarray()[0]
    games_closest = tree.query(game_array, k=21)[1][1:]
    return [data.iloc[index]['id'] for index in games_closest]

X, tree = create_model(data)
predict('0f0191a3')

['a8996117',
 'e2f10932',
 '51ff4f64',
 '4377dbf1',
 '751a7bbf',
 'dc506d3b',
 '438bc630',
 '7796d27e',
 '4e293730',
 '31409bf6',
 '97fc82c2',
 'ff64c4ba',
 'a1af38c1',
 '92d53aa6',
 '6ae91a77',
 '54ce4908',
 '889702e7',
 '95d72d02',
 '646338d6',
 '0f0191a3']