In [28]:
import os
import pandas as pd
import numpy as np
import ast


In [29]:
os.chdir(os.getcwd().replace('notebooks', 'data'))

games = pd.read_csv('clean.csv')


In [30]:
games.head()


Unnamed: 0,Name,tags
0,disco-elysium-the-final-cut,"['Disco', 'Elysium', '-', 'The', 'Final', 'Cut..."
1,half-life-2,"[""[Metacritic's"", '2004', 'PC', 'Game', 'of', ..."
2,grand-theft-auto-v,"['Los', 'Santos:', 'a', 'sprawling', 'sun-soak..."
3,the-orange-box,"['Games', 'included', 'in', 'The', 'Orange', '..."
4,bioshock,"['Going', 'beyond', '""run', 'and', 'gun', 'cor..."


In [31]:
corups = games['tags'].values


In [32]:
corups.shape


(4185,)

In [33]:
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()


def convert_tags(tags):
    return " ".join([stemmer.stem(tag) for tag in ast.literal_eval(tags)]).lower()


In [34]:
convert_tags(corups[0])


"disco elysium - the final cut is the definit edit of the smash-hit rpg. pursu your polit dream in new quests, meet and question more of the city' locals, and explor a whole extra area. full voice-acting, control support, and expand languag option also included. get even more out of thi award-win open world. you'r a detect with a uniqu skill system at your dispos and a whole citi block to carv your path across. interrog unforgett characters, crack murders, or take bribes. becom a hero or an absolut disast of a human being. collaps role-play gener western-styl metascoreveryhigh userscoreposit rating_m za/um universalacclaim generallyfavorablereview singleplay release_year_2021"

In [35]:
corups = [convert_tags(tag) for tag in corups]


In [36]:
corups[:4]


["disco elysium - the final cut is the definit edit of the smash-hit rpg. pursu your polit dream in new quests, meet and question more of the city' locals, and explor a whole extra area. full voice-acting, control support, and expand languag option also included. get even more out of thi award-win open world. you'r a detect with a uniqu skill system at your dispos and a whole citi block to carv your path across. interrog unforgett characters, crack murders, or take bribes. becom a hero or an absolut disast of a human being. collaps role-play gener western-styl metascoreveryhigh userscoreposit rating_m za/um universalacclaim generallyfavorablereview singleplay release_year_2021",
 "[metacritic' 2004 pc game of the year] by take the suspense, challeng and viscer charg of the original, and ad startl new realism and responsiveness, half-lif 2 open the door to a world where the player' presenc affect everyth around him, from the physic environ to the behavior -- even the emot -- of both fri

In [37]:
from sklearn.feature_extraction.text import CountVectorizer


In [38]:
vectorizer = CountVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 2))


In [39]:
vectors = vectorizer.fit_transform(corups)


In [40]:
vectors[0]


<1x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 80 stored elements in Compressed Sparse Row format>

In [41]:
vectorizer.get_feature_names_out()[:100]


array(['000', '000 year', '07', '08', '10', '10 000', '10 differ',
       '10 new', '10 year', '100', '100 level', '100 year', '101',
       '1080p', '1080p 60fps', '10th', '10ton',
       '10ton mixedoraveragereview', '11', '11bitstudio', '11th', '12',
       '120', '13', '13 lbs', '130', '14', '140', '15', '15 new', '150',
       '16', '16 bit', '16 player', '160', '17', '1701', '17th', '18',
       '18th', '19', '1920', '1930', '1940', '1941', '1942', '1944',
       '1945', '1950', '1960', '1968', '1980', '1982', '1984', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994',
       '1995', '1996', '1997', '1998', '1999', '19th', '19th centuri',
       '19th century', '1c', '1c 1centertain', '1c company',
       '1centertain', '1st', '20', '20 differ', '20 new', '20 year',
       '200', '2000', '2001', '2002', '2003', '2004', '2005', '2006',
       '2007', '2008', '2009', '2010', '2013', '2014', '2015', '2016',
       '2017', '2018', '2019', '2020'], dtype=obj

In [42]:
from sklearn.metrics.pairwise import cosine_similarity


In [43]:
game_similarities = cosine_similarity(vectors[1], vectors)
game_similarities


array([[0.11321571, 1.        , 0.10615269, ..., 0.06786566, 0.10485249,
        0.05822225]])

In [44]:
def recomend(game, k = 30):
  index = games[games['Name'] == game].index[0]
  game_similarities = cosine_similarity(vectors[index], vectors)[0]

  games_list = sorted(list(enumerate(game_similarities)), reverse=True, key=lambda x: x[1])[1:k + 1]
  return [games.iloc[game[0]]['Name'] for game in games_list]


In [45]:
games['Name'].values


array(['disco-elysium-the-final-cut', 'half-life-2', 'grand-theft-auto-v',
       ..., 'you-suck-at-parking', 'tanuki-sunset', 'shredders'],
      dtype=object)

In [46]:
recomend('grand-theft-auto-v')


['grand-theft-auto-iv',
 'outer-wilds',
 'grand-theft-auto-san-andreas',
 'yakuza-5-remastered',
 'sleeping-dogs',
 'metal-gear-solid-v-the-phantom-pain',
 'hitman-codename-47',
 'yakuza-4-remastered',
 'lost-ember',
 'manhunt',
 'darksiders-ii-deathinitive-edition',
 'mad-max',
 'like-a-dragon-ishin!',
 'death-stranding',
 'blossom-tales-the-sleeping-king',
 'way-of-the-samurai-4',
 'grand-theft-auto-iii',
 'the-forest',
 'grand-theft-auto-vice-city',
 'marvels-spider-man',
 'everspace-stellar-edition',
 'tom-clancys-rainbow-six-vegas',
 'bully-scholarship-edition',
 'the-witcher-3-wild-hunt---complete-edition',
 'remothered-tormented-fathers',
 'cat-quest',
 'assassins-creed-valhalla-dawn-of-ragnarok',
 'hob-the-definitive-edition',
 'shakedown-hawaii',
 'sea-of-thieves']

In [47]:
import joblib


In [48]:
joblib.dump(vectors, 'game_vectors.joblib')


['game_vectors.joblib']

In [49]:
joblib.dump(games['Name'].values, 'game_Name.joblib')


['game_Name.joblib']