# ML similitud del coseno

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

## EDA preliminar

En primer lugar se realizará un análisis exploratorio para poder reducir los datos debido a los recursos del entorno donde se hace el despliegue de la aplicación 

In [2]:
#Se cargan los datos
df_games=pd.read_parquet('Datasets/data_ML.parquet')

In [3]:
df_users=pd.read_parquet('Datasets/users_items_clean.parquet')

In [4]:
df_games['release_year']=df_games['release_year'].astype('int16')

In [5]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27442 entries, 0 to 27441
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        27442 non-null  object
 1   app_name      27442 non-null  object
 2   tags          27442 non-null  object
 3   id            27442 non-null  int32 
 4   release_year  27442 non-null  int16 
dtypes: int16(1), int32(1), object(3)
memory usage: 804.1+ KB


In [6]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865427 entries, 0 to 3865426
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int16  
 2   item_id           int32  
 3   playtime_forever  float16
dtypes: float16(1), int16(1), int32(1), object(1)
memory usage: 59.0+ MB


In [7]:
df_most_popular=pd.merge(df_games,df_users[['item_id','playtime_forever']],left_on='id',right_on='item_id',how='inner')

In [8]:
df_most_popular.drop(columns=['item_id','id'],inplace=True)

Filtro los juegos por los que se lanzaron a partir del 2014.

In [9]:
df_most_popular=df_most_popular[df_most_popular['release_year']>=2014]

In [10]:
df_playtime=df_most_popular.groupby('app_name')['playtime_forever'].sum().reset_index()

In [11]:
df_playtime.sort_values(by='playtime_forever',ascending=False)

Unnamed: 0,app_name,playtime_forever
5373,Unturned,1786035.0
2774,Loadout,1084753.0
3973,Robocraft,884821.0
2235,Heroes & Generals,882531.0
2035,Goat Simulator,816752.0
...,...,...
5250,Trawl,0.0
5251,Trebuchet,0.0
3975,Robot Exploration Squad,0.0
5256,Tri Original Soundtrack + Artbook,0.0


In [12]:
df_playtime=df_playtime[df_playtime['playtime_forever']>0]

In [13]:
df_playtime

Unnamed: 0,app_name,playtime_forever
0,! That Bastard Is Trying To Steal Our Gold !,43.0
1,"""Glow Ball"" - The Billiard Puzzle Game",21.0
2,$1 Ride,1522.0
4,//N.P.P.D. Rush//- The Milk Of Ultraviolet,14722.0
5,//Snowflake Tattoo//,5034.0
...,...,...
5735,[The Sequence],174.0
5736,Астролорды: Облако Оорта,310.0
5738,侠客风云传(Tale Of Wuxia),92.0
5739,神明的一天世界(God'S One Day World),249.0


In [14]:
data=pd.merge(df_playtime['app_name'],df_games,left_on='app_name',right_on='app_name',how='inner')

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5211 entries, 0 to 5210
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   app_name      5211 non-null   object
 1   genres        5211 non-null   object
 2   tags          5211 non-null   object
 3   id            5211 non-null   int32 
 4   release_year  5211 non-null   int16 
dtypes: int16(1), int32(1), object(3)
memory usage: 152.8+ KB


In [16]:
data.drop('release_year',axis=1,inplace=True)

In [17]:
data.to_parquet('Datasets/data.parquet')

## Modelo de la similitud del coseno

In [18]:
df=pd.read_parquet('Datasets/data.parquet')

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5211 entries, 0 to 5210
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  5211 non-null   object
 1   genres    5211 non-null   object
 2   tags      5211 non-null   object
 3   id        5211 non-null   int32 
dtypes: int32(1), object(3)
memory usage: 142.6+ KB


In [20]:
df.reset_index(drop=True, inplace=True)

In [21]:
#Vectorizar el nombre de los productos
vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(df[['app_name','genres','tags']].apply(lambda x: ' '.join(x),axis=1))

In [22]:
# Calcular la cantidad de memoria en bytes
memory_in_bytes = (tfidf_matrix.data.nbytes + tfidf_matrix.indices.nbytes + tfidf_matrix.indptr.nbytes)

# Convertir a megabytes
memory_in_megabytes = memory_in_bytes / (1024 * 1024)

print(f"Memory: {memory_in_megabytes} MB")

Memory: 0.2952156066894531 MB


In [23]:
#Combinar características
#features=np.column_stack([tfidf_matrix.toarray(),df['genres'],df['tags']])

In [24]:
#Calcular la matriz de similitud del coseno
similarity_matrix=cosine_similarity(tfidf_matrix)

In [25]:

# Calcular la cantidad de memoria en bytes
memory_in_bytes = similarity_matrix.nbytes

# Convertir a megabytes
memory_in_megabytes = memory_in_bytes / (1024 * 1024)

print(f"Memory: {memory_in_megabytes} MB")

Memory: 207.17255401611328 MB


In [26]:
df

Unnamed: 0,app_name,genres,tags,id
0,! That Bastard Is Trying To Steal Our Gold !,Action,Action,449940
1,"""Glow Ball"" - The Billiard Puzzle Game",Casual,Strategy,388390
2,$1 Ride,Action,Casual,508290
3,//N.P.P.D. Rush//- The Milk Of Ultraviolet,Action,Indie,270090
4,//Snowflake Tattoo//,Action,Adventure,355430
...,...,...,...,...
5206,[The Sequence],Indie,Indie,454320
5207,Астролорды: Облако Оорта,Free to Play,Free to Play,385530
5208,侠客风云传(Tale Of Wuxia),Adventure,RPG,377530
5209,神明的一天世界(God'S One Day World),Adventure,Adventure,490390


In [27]:

item_id=508290
if df[df['id']==item_id].empty:
    print('No existe el item')
else:
    producto_index=df[df['id']==item_id].index[0]
    product_similarities=similarity_matrix[producto_index]
    most_similar_products_index=np.argsort(-product_similarities)[1:6]
    most_similar_products=df.loc[most_similar_products_index,'app_name']
    print(most_similar_products)

3592             Ride The Bullet
3591                        Ride
3925           Slide Ride Arcade
1764    Funfair Ride Simulator 3
3458                   R.O.O.T.S
Name: app_name, dtype: object


In [28]:
most_similar_products.values.tolist()

['Ride The Bullet',
 'Ride',
 'Slide Ride Arcade',
 'Funfair Ride Simulator 3',
 'R.O.O.T.S']

In [29]:
most_similar_products.to_json(orient='records')

'["Ride The Bullet","Ride","Slide Ride Arcade","Funfair Ride Simulator 3","R.O.O.T.S"]'

In [30]:
{"Recomendación": most_similar_products.to_json(orient='records')}

{'Recomendación': '["Ride The Bullet","Ride","Slide Ride Arcade","Funfair Ride Simulator 3","R.O.O.T.S"]'}