In [39]:
import re
import nltk
import hashlib
import pandas as pd

# Model
from scipy import spatial
from sklearn.feature_extraction.text import TfidfVectorizer

In [28]:
def clean_string(row, stem=None):
    text = row[3]
    # Make lower
    text = text.lower()

    # Remove line breaks
    text = re.sub(r'\n', '', text)

    # Keep only letters
    text = re.sub('[^a-z ]+', '', text)

    # Remove stop words
    text = text.split()
    useless_words = nltk.corpus.stopwords.words("english")
    useless_words = useless_words + ['hi', 'im']
    text_filtered = [word for word in text if not word in useless_words]

    return ' '.join(text_filtered)

In [31]:
data = pd.read_csv(
    '../dataset/switch-games-reviews.csv',
    delimiter=',',
    index_col='index'
)
data['clean_review'] = data.apply(clean_string, axis=1)
data[['id', 'clean_review']].to_csv('../dataset/switch-games-clean-reviews.csv', index=False, sep=',')

# Model

In [72]:
def create_model(data):
    # Load data
    data = pd.read_csv(
        '../dataset/switch-games-clean-reviews.csv',
        delimiter=','
    )
    # My model will be a simple tf-idf over all reviews
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data['clean_review'])
    X.shape

    # Tree to do a fast search over distances
    tree = spatial.KDTree(X.toarray())
    return X, tree


def predict(id):
    index = data.index[data['id'] == id][0]
    game_array = X[index].toarray()[0]
    games_closest = tree.query(game_array, k=21)[1][1:]
    return [data.iloc[index]['id'] for index in games_closest]

X, tree = create_model(data)
predict('a4ed8c25')

['ecbf9029',
 'b573e8a7',
 'a4024d6d',
 '60acdceb',
 'fda30a93',
 '5d3f714e',
 'c291b261',
 '73d2294c',
 '3399bfc5',
 '99981399',
 '4dbb9933',
 '01ca06ea',
 '4eec045a',
 'f25543e0',
 '9de7f721',
 'f5324fe1',
 '8c83f3d5',
 '32a735ea',
 '6331f8ee',
 '19915850']