In [162]:
# Importaciones necesarias

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import scipy as sp

from sklearn.metrics.pairwise import cosine_similarity

In [163]:
# Load the three parquet datasets used by this notebook, in the same order
# as the names on the left-hand side.
steam_games, users_items, user_reviews = (
    pd.read_parquet(ruta)
    for ruta in (
        'datos_parquet/steam_games.parquet',
        'datos_parquet/user_items.parquet',
        'datos_parquet/user_reviews.parquet',
    )
)

In [164]:
# Build the two working frames, keeping only the columns the query needs.

# From the reviews: user_id, id and sentiment_analysis.
reviews = user_reviews.loc[:, ['user_id', 'id', 'sentiment_analysis']]

# From the user items: id and item_name, one row per distinct pair.
items = users_items.loc[:, ['id', 'item_name']].drop_duplicates()

In [165]:
# Inner-join the de-duplicated items with the reviews on their shared 'id'
# column, then expose that column as 'item_id' so its meaning is explicit.
item_item = (
    items
    .merge(reviews, on='id', how='inner')
    .rename(columns={'id': 'item_id'})
)

# Keep only the first 40000 rows to bound the size of the dataset.
item_recortado = item_item.head(40000)

item_recortado

Unnamed: 0,item_id,item_name,user_id,sentiment_analysis
0,10,Counter-Strike,Bennysaputra,2
1,10,Counter-Strike,Monsta45,1
2,10,Counter-Strike,76561198040188061,2
3,10,Counter-Strike,mayshowganmore,2
4,10,Counter-Strike,BestinTheWorldThund3r,2
...,...,...,...,...
39995,386360,SMITE,wadiyatalknabeet,2
39996,386360,SMITE,poofcorn,2
39997,386360,SMITE,76561198111978125,2
39998,386360,SMITE,AlphaPodbury,2


In [166]:
# Utility matrix: one row per user_id, one column per item_name, and each cell
# holding the aggregated sentiment_analysis value for that (user, item) pair
# (pivot_table aggregates with the mean by default).
# Pairs with no review come out as NaN and are replaced with 0 so the result
# can later be stored as a sparse matrix.
# NOTE(review): if 0 is also a legitimate sentiment_analysis score, it becomes
# indistinguishable from "no review" after this fill -- confirm.
matriz_item = (
    item_recortado
    .pivot_table(index=['user_id'], columns='item_name', values='sentiment_analysis')
    .fillna(0)
)

matriz_item

item_name,100% Orange Juice,12 Labours of Hercules II: The Cretan Bull,16bit Trader,3079 -- Block Action RPG,3089 -- Futuristic Action RPG,3D Ultra Minigolf Adventures Deluxe,3DMark,99 Levels To Hell,A New Beginning - Final Cut,A Virus Named TOM,...,XCOM: Enemy Unknown,Xpand Rally Xtreme,Yet Another Zombie Defense,Zeno Clash,Zeno Clash 2,Zombie Army Trilogy,Zombie Driver,Zombies Monsters Robots,Zuma's Revenge,resident evil 4 / biohazard 4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ace--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ionex--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2SV-vuLB-Kg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Azsael-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zuzuga2003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zv_odd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zvanik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zynxgameth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [167]:
# Hold the utility matrix in CSR form so that only the non-zero cells are
# stored, which saves memory and speeds up the similarity computation.
matriz_sparse = sp.sparse.csr_matrix(matriz_item.to_numpy())

# cosine_similarity compares rows. The matrix is transposed first so that each
# row is an item; the result is therefore an item-by-item similarity matrix
# (despite the "user" in the variable names). Calling it on matriz_sparse
# without the transpose would compare users instead.
user_similarity = cosine_similarity(matriz_sparse.transpose())

# Label both axes with the item names taken from the utility-matrix columns.
user_sim = pd.DataFrame(
    data=user_similarity,
    index=matriz_item.columns,
    columns=matriz_item.columns,
)

In [168]:
# Inspect which container type cosine_similarity returned.
type(user_similarity)

numpy.ndarray

In [169]:
# Display the labelled similarity frame (rows and columns are both item_name).
user_sim

item_name,100% Orange Juice,12 Labours of Hercules II: The Cretan Bull,16bit Trader,3079 -- Block Action RPG,3089 -- Futuristic Action RPG,3D Ultra Minigolf Adventures Deluxe,3DMark,99 Levels To Hell,A New Beginning - Final Cut,A Virus Named TOM,...,XCOM: Enemy Unknown,Xpand Rally Xtreme,Yet Another Zombie Defense,Zeno Clash,Zeno Clash 2,Zombie Army Trilogy,Zombie Driver,Zombies Monsters Robots,Zuma's Revenge,resident evil 4 / biohazard 4
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12 Labours of Hercules II: The Cretan Bull,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16bit Trader,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3079 -- Block Action RPG,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3089 -- Futuristic Action RPG,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zombie Army Trilogy,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.050395,0.0,0.0,1.0,0.0,0.0,0.0,0.0
Zombie Driver,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Zombies Monsters Robots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0
Zuma's Revenge,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [170]:
#item_sim_df = pd.DataFrame(item_similarity, index=matriz_item.index, columns=matriz_item.index)
#item_sim_df

In [171]:
def cinco_recomendaciones2(item_id):
    """Print the five games most similar to the game identified by ``item_id``.

    The function reads two module-level frames:

    * ``item_recortado`` -- used to translate ``item_id`` into an ``item_name``
      (columns ``item_id`` and ``item_name``).
    * ``user_sim`` -- square similarity frame whose index and columns are both
      item names; the column for the requested game is ranked.

    Parameters
    ----------
    item_id : int or str
        Identifier of the game. Anything ``int()`` can convert is accepted, and
        the converted value is used for both the existence check and the lookup.

    Returns
    -------
    None or str
        ``None`` when recommendations were printed to stdout; otherwise an
        error message string (unknown id, or no similarity data for the game).
    """
    mensaje_no_encontrado = f"Error: El ID de juego {item_id} no se encuentra en el DataFrame."
    mensaje_sin_datos = "Error: No hay suficientes datos para calcular recomendaciones."

    # Normalise the id once so the check and the lookup use the same value.
    try:
        id_juego = int(item_id)
    except (TypeError, ValueError):
        return mensaje_no_encontrado

    nombres = item_recortado.loc[item_recortado['item_id'] == id_juego, 'item_name']
    if nombres.empty:
        return mensaje_no_encontrado

    item_name = nombres.iloc[0]

    # The similarity frame is labelled by item name, not by id, so "enough
    # data" means the name is present there (comparing the frame length with
    # the numeric id rejected every valid id larger than the number of games).
    if item_name not in user_sim.columns:
        return mensaje_sin_datos

    # Remove the game itself by label instead of assuming it sorts first:
    # with ties (e.g. an all-zero column) it is not guaranteed to be on top.
    # A stable sort keeps tied games in their original order.
    similares = (
        user_sim[item_name]
        .drop(labels=item_name, errors='ignore')
        .sort_values(ascending=False, kind='mergesort')
        .head(5)
    )
    if similares.empty:
        return mensaje_sin_datos

    encabezado = f'Juegos similares a {item_name}:'
    for item in similares.index:
        encabezado += f'\n-{item}'
    print(encabezado)

In [172]:
# Example call: item_id 10 maps to Counter-Strike in item_recortado.
cinco_recomendaciones2(10)

Juegos similares a Counter-Strike:
-Half-Life Deathmatch: Source
-Serious Sam Classic: The First Encounter
-Half-Life: Blue Shift
-Day of Defeat
-Counter-Strike: Source


In [173]:
# Se convierte el DataFrame a una tabla de Arrow para posteriormente comprimirlo en parquet
table = pa.Table.from_pandas(user_sim)

# Se especifica el nombre del archivo Parquet y comprime con snappy
parquet_file = 'datos_parquet/user_sim.parquet'
pq.write_table(table, parquet_file, compression='snappy')

In [174]:
# Se convierte el DataFrame a una tabla de Arrow para posteriormente comprimirlo en parquet
table = pa.Table.from_pandas(item_recortado)

# Se especifica el nombre del archivo Parquet y comprime con snappy
parquet_file = 'datos_parquet/item_chunk.parquet'
pq.write_table(table, parquet_file, compression='snappy')

In [181]:
# Load the user_items parquet file and display it.
# NOTE(review): this rebinds item_recortado, which earlier held the trimmed
# merge result. The frame displayed below has an 'id' column and no 'item_id'
# column, while cinco_recomendaciones2 reads item_recortado['item_id'], so
# calling that function after this cell will not work as before. A distinct
# variable name is probably intended -- confirm.
item_recortado = pd.read_parquet('datos_parquet/user_items.parquet')

item_recortado

Unnamed: 0,user_id,items_count,steam_id,id,item_name,playtime_forever,playtime_2weeks
0,76561197970982479,277,76561197970982479,10,Counter-Strike,6.0,0.0
1,76561197970982479,277,76561197970982479,20,Team Fortress Classic,0.0,0.0
2,76561197970982479,277,76561197970982479,30,Day of Defeat,7.0,0.0
3,76561197970982479,277,76561197970982479,40,Deathmatch Classic,0.0,0.0
4,76561197970982479,277,76561197970982479,50,Half-Life: Opposing Force,0.0,0.0
...,...,...,...,...,...,...,...
5170009,76561198329548331,7,76561198329548331,346330,BrainBread 2,0.0,0.0
5170010,76561198329548331,7,76561198329548331,373330,All Is Dust,0.0,0.0
5170011,76561198329548331,7,76561198329548331,388490,One Way To Die: Steam Edition,3.0,3.0
5170012,76561198329548331,7,76561198329548331,521570,You Have 10 Seconds 2,4.0,4.0


In [182]:
# Load the user_items2 parquet file and display it.
# NOTE(review): the displayed index is not in order, so this is presumably a
# shuffled/sampled subset of user_items -- confirm where the file is produced.
items_items_ML = pd.read_parquet('datos_parquet/user_items2.parquet')

items_items_ML

Unnamed: 0,user_id,items_count,steam_id,id,item_name,playtime_forever,playtime_2weeks
4205846,ThatKidRanga,681,76561198035551917,227080,Papo & Yo,9.0,0.0
977603,76561197981425218,109,76561197981425218,228200,Company of Heroes (New Steam Version),4312.0,0.0
3170639,76561197970733576,61,76561197970733576,43110,Metro 2033,14.0,0.0
1343254,chloejacobsx,1003,76561198041310702,205950,Jet Set Radio,28.0,0.0
3001052,76561198050051046,66,76561198050051046,291550,Brawlhalla,1354.0,692.0
...,...,...,...,...,...,...,...
2671627,76561198055712539,49,76561198055712539,65800,Dungeon Defenders,584.0,0.0
455985,GoldenCookies,86,76561198087478253,246280,Happy Wars,40.0,0.0
4680548,76561198065886270,99,76561198065886270,349700,Aftermath,0.0,0.0
1365084,GilfSlayer,27,76561198096628949,273110,Counter-Strike Nexon: Zombies,0.0,0.0
