In [1]:
# Importaciones necesarias

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import scipy as sp

from sklearn.metrics.pairwise import cosine_similarity

In [4]:
# Load the three parquet datasets this notebook works with.
steam_games = pd.read_parquet(path='datos_parquet/steam_games.parquet')
users_items = pd.read_parquet(path='datos_parquet/user_items_chunk.parquet')
user_reviews = pd.read_parquet(path='datos_parquet/user_reviews.parquet')

In [5]:
# Narrow both source frames down to the columns the recommender needs.

# One row per review: the reviewer, the reviewed item and the sentiment score.
reviews = user_reviews[['user_id', 'id', 'sentiment_analysis']]

# One row per distinct (id, item_name) pair.
items = users_items[['id', 'item_name']].drop_duplicates()

In [6]:
# Inner-join the distinct items with the reviews on their shared 'id' key and
# expose that key as 'item_id' so the merged frame is easier to read.
item_item = (
    items
    .merge(reviews, on='id', how='inner')
    .rename(columns={'id': 'item_id'})
)

# Cap the working set at the first 40000 rows of the merged frame.
item_recortado = item_item.head(40000)

item_recortado

Unnamed: 0,item_id,item_name,user_id,sentiment_analysis
0,227080,Papo & Yo,76561197999005150,2
1,227080,Papo & Yo,henryle,1
2,227080,Papo & Yo,Xasion,2
3,43110,Metro 2033,76561197970982479,2
4,43110,Metro 2033,Denied72,2
...,...,...,...,...
39995,389570,Mitos.is: The Game,Lumaxious,2
39996,389570,Mitos.is: The Game,76561198062236203,2
39997,389570,Mitos.is: The Game,76561198077207405,2
39998,389570,Mitos.is: The Game,Dankest-Meme,2


In [7]:
# Utility matrix: one row per user_id, one column per item_name, each cell
# holding the sentiment_analysis value (pivot_table aggregates duplicate
# user/item pairs with its default aggfunc, the mean).
# User/item pairs without a review come out as NaN and are replaced by 0 so the
# frame can later be stored as a sparse matrix.
# NOTE(review): if 0 is also a genuine sentiment score, it becomes
# indistinguishable from "no review" after the fill -- confirm this is intended.
matriz_item = pd.pivot_table(
    item_recortado,
    index=['user_id'],
    columns='item_name',
    values='sentiment_analysis',
).fillna(0)

matriz_item

item_name,140,18 Wheels of Steel: Extreme Trucker,7 Days to Die,A Virus Named TOM,A.V.A - Alliance of Valiant Arms,ACE - Arena: Cyber Evolution,APB Reloaded,ARK: Survival Evolved,ARK: Survival Of The Fittest,Absolute Drift,...,XCOM: Enemy Unknown,Yargis - Space Melee,You Have to Win the Game,Zombies Monsters Robots,bit Dungeon II,how do you Do It?,iBomber Defense Pacific,resident evil 4 / biohazard 4,the static speaks my name,theHunter
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--000--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ace--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--ionex--,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2SV-vuLB-Kg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-Beave-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zv_odd,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zvanik,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zynxgameth,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zyr0n1c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Keep the utility matrix in CSR sparse form so only non-zero cells are stored.
matriz_sparse = sp.sparse.csr_matrix(matriz_item.to_numpy())

# The matrix is transposed before the call, so items (not users) are the rows
# being compared: the result is an item-by-item cosine similarity matrix,
# despite the "user" in the variable names.
user_similarity = cosine_similarity(matriz_sparse.transpose())

# Label both axes with the item names taken from the utility matrix columns.
user_sim = pd.DataFrame(
    data=user_similarity,
    index=matriz_item.columns,
    columns=matriz_item.columns,
)

In [10]:
# Display the similarity matrix; its rows and columns are both labelled by item_name.
user_sim

item_name,140,18 Wheels of Steel: Extreme Trucker,7 Days to Die,A Virus Named TOM,A.V.A - Alliance of Valiant Arms,ACE - Arena: Cyber Evolution,APB Reloaded,ARK: Survival Evolved,ARK: Survival Of The Fittest,Absolute Drift,...,XCOM: Enemy Unknown,Yargis - Space Melee,You Have to Win the Game,Zombies Monsters Robots,bit Dungeon II,how do you Do It?,iBomber Defense Pacific,resident evil 4 / biohazard 4,the static speaks my name,theHunter
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
140,1.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
18 Wheels of Steel: Extreme Trucker,0.0,1.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7 Days to Die,0.0,0.0,1.000000,0.0,0.0,0.0,0.0,0.018684,0.0,0.0,...,0.011898,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016027
A Virus Named TOM,0.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
A.V.A - Alliance of Valiant Arms,0.0,0.0,0.000000,0.0,1.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
how do you Do It?,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000
iBomber Defense Pacific,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000000
resident evil 4 / biohazard 4,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.008505,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.000000
the static speaks my name,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.000000


In [19]:
def cinco_recomendaciones(item_id):
    """Print the (up to) five games most similar to the game ``item_id``.

    The id is resolved to a game name through the module-level
    ``item_recortado`` frame, and the other games are ranked by their score in
    that game's column of the module-level ``user_sim`` similarity matrix.

    Parameters
    ----------
    item_id : int or str
        Game identifier; any value accepted by ``int()``.

    Returns
    -------
    str or None
        An error message when the id is unknown (or not convertible to an
        integer) or when the similarity matrix cannot rank the game.
        Otherwise ``None``: the recommendation list is printed, not returned.
    """
    not_found = f"Error: El ID de juego {item_id} no se encuentra en el DataFrame."

    # Normalise once so the existence check and the lookup use the same value
    # (a string id previously passed the first check and then broke the rest).
    try:
        game_id = int(item_id)
    except (TypeError, ValueError):
        return not_found

    # Resolve the id to its game name; no matching rows means the id is unknown.
    id_to_name = item_recortado.loc[item_recortado['item_id'] == game_id, 'item_name']
    if id_to_name.empty:
        return not_found

    item_name = id_to_name.values[0]

    # Recommendations need the game to be present in the similarity matrix and
    # at least one other game to compare against. The size of the matrix is
    # unrelated to the numeric value of the id, so it is not compared to it.
    if item_name not in user_sim.columns or len(user_sim) < 2:
        return "Error: No hay suficientes datos para calcular recomendaciones."

    # Drop the game itself explicitly rather than assuming it sorts first:
    # other games can tie with it at the top of the ranking.
    similares = user_sim[item_name].drop(labels=item_name).nlargest(5)

    lineas = [f'Juegos similares a {item_name}:']
    lineas.extend(f'-{item}' for item in similares.index)
    print('\n'.join(lineas))

In [21]:
# Example: request recommendations for the game whose item_id is 20.
cinco_recomendaciones(20)

Juegos similares a Team Fortress Classic:
-They Bleed Pixels
-MicroVolts Surge
-Day of Defeat: Source
-Super Meat Boy
-Call of Duty: World at War


In [173]:
# Destination of the parquet copy of the similarity matrix.
parquet_file = 'datos_parquet/user_sim.parquet'

# pyarrow writes Arrow tables, so the DataFrame is converted first and then
# written with snappy compression.
table = pa.Table.from_pandas(user_sim)
pq.write_table(table, parquet_file, compression='snappy')

In [174]:
# Destination of the parquet copy of the truncated item/review frame.
parquet_file = 'datos_parquet/item_chunk.parquet'

# pyarrow writes Arrow tables, so the DataFrame is converted first and then
# written with snappy compression.
table = pa.Table.from_pandas(item_recortado)
pq.write_table(table, parquet_file, compression='snappy')