In [25]:
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
import pandas as pd
import pyarrow.parquet as pq
import joblib


In [14]:
steam_games = pd.read_parquet('steam_games.parquet')

In [16]:
steam_games['genres'] = steam_games['genres'].apply(lambda x: x.tolist())
steam_games['tags'] = steam_games['tags'].apply(lambda x: x.tolist())
steam_games['specs'] = steam_games['specs'].apply(lambda x: x.tolist())

In [17]:
steam_games['keys_vec'] = steam_games.apply(lambda row: list(set(row['tags'] + row['specs'] + row['genres'] )), axis=1)


In [18]:
steam_games[['title','keys_vec']]

Unnamed: 0,title,keys_vec
0,Lost Summoner Kitty,"[casual, strategy, indie, single-player, actio..."
1,Ironbound,"[pvp, indie, 2d, female protagonist, replay va..."
2,Real Pool 3D - Poolians,"[casual, free to play, multi-player, in-app pu..."
3,弹炸人2222,"[casual, single-player, action, adventure]"
5,Battle Royale Trainer,"[sniper, third person, fps, third-person shoot..."
...,...,...
32129,Kebab it Up!,"[violent, casual, steam cloud, indie, single-p..."
32130,Colony On Mars,"[casual, strategy, indie, single-player, steam..."
32131,LOGistICAL: South Africa,"[casual, steam cloud, strategy, steam leaderbo..."
32132,Russian Roads,"[racing, steam trading cards, indie, single-pl..."


In [19]:
steam_games.drop(columns=['tags','specs','genres','app_name','release_date','price','id','developer'], inplace=True)

In [35]:
df = steam_games.loc[:, ('title', 'keys_vec')]

model = Word2Vec(df['keys_vec'], vector_size=400, window=10, min_count=1, workers=5)  

def get_vector_for_list(word_list):
    vectors = [model.wv[word] for word in word_list if word in model.wv]
    return sum(vectors) / len(vectors) if vectors else None


df['vector'] = df['keys_vec'].apply(get_vector_for_list)

df = df.dropna(subset=['vector'])
df.reset_index(drop=True, inplace=True)

cosine_similarities = cosine_similarity(list(df['vector']), list(df['vector']))

def get_top_n_recommendations(game_title, n=5):
    game_index = df.query('title==@game_title').index[0]
    
    sim_scores = list(enumerate(cosine_similarities[game_index]))
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    top_n_indices = [i for i, _ in sim_scores[1:n+1]]

    recommended_games = df['title'].iloc[top_n_indices].tolist()

    return recommended_games



Unnamed: 0,title,keys_vec,vector
0,Lost Summoner Kitty,"[casual, strategy, indie, single-player, actio...","[-0.20399688, 0.0011429936, 0.13287929, -0.128..."
1,Ironbound,"[pvp, indie, 2d, female protagonist, replay va...","[-0.24126586, 0.16356538, 0.04287066, -0.12778..."
2,Real Pool 3D - Poolians,"[casual, free to play, multi-player, in-app pu...","[-0.19871001, 0.10815572, 0.091588214, -0.0626..."
3,弹炸人2222,"[casual, single-player, action, adventure]","[-0.034087174, -0.051405556, 0.013843387, -0.0..."
4,Battle Royale Trainer,"[sniper, third person, fps, third-person shoot...","[-0.07961322, 0.1712774, 0.017655522, -0.14445..."
...,...,...,...
30071,Kebab it Up!,"[violent, casual, steam cloud, indie, single-p...","[-0.15718624, 0.063142546, -0.041191652, -0.05..."
30072,Colony On Mars,"[casual, strategy, indie, single-player, steam...","[-0.2617202, 0.10598645, 0.096524775, -0.11190..."
30073,LOGistICAL: South Africa,"[casual, steam cloud, strategy, steam leaderbo...","[-0.23144446, 0.06893918, 0.09490009, -0.10388..."
30074,Russian Roads,"[racing, steam trading cards, indie, single-pl...","[-0.2757305, 0.12194395, 0.13611603, 0.0120643..."


In [36]:
get_top_n_recommendations('Ironbound')

['Tactical Genius Online',
 'Forge of Gods (RPG)',
 'The Banner Saga: Factions',
 '❂ Heroes of Hexaluga ❂',
 'Card Hunter']

In [24]:

# Guarda el modelo Word2Vec
model.save("word2vec_model.joblib")

# Guarda el DataFrame modificado
df.to_pickle("steam_games_processed.pkl")
