# ML similitud del coseno

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import LabelEncoder

## EDA preliminar

En primer lugar se realizará un análisis exploratorio para poder reducir los datos debido a los recursos del entorno donde se hace el despliegue de la aplicación 

In [6]:
#Se cargan los datos
df_games=pd.read_parquet('Datasets/data_ML.parquet')

In [3]:
df_users=pd.read_parquet('Datasets/users_items_clean.parquet')

In [11]:
df_games['release_year']=df_games['release_year'].astype('int16')

In [13]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27442 entries, 0 to 27441
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   genres        27442 non-null  object
 1   app_name      27442 non-null  object
 2   tags          27442 non-null  object
 3   id            27442 non-null  int32 
 4   release_year  27442 non-null  int16 
dtypes: int16(1), int32(1), object(3)
memory usage: 804.1+ KB


In [12]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3865427 entries, 0 to 3865426
Data columns (total 4 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   items_count       int16  
 2   item_id           int32  
 3   playtime_forever  float16
dtypes: float16(1), int16(1), int32(1), object(1)
memory usage: 59.0+ MB


In [71]:
df_most_popular=pd.merge(df_games,df_users[['item_id','playtime_forever']],left_on='id',right_on='item_id',how='inner')

In [73]:
df_most_popular.drop(columns=['item_id','id'],inplace=True)

In [74]:
df_most_popular=df_most_popular[df_most_popular['release_year']>=2014]

In [76]:
df_playtime=df_most_popular.groupby('app_name')['playtime_forever'].sum().reset_index()

In [77]:
df_playtime.sort_values(by='playtime_forever',ascending=False)

Unnamed: 0,app_name,playtime_forever
5373,Unturned,1786035.0
2774,Loadout,1084753.0
3973,Robocraft,884821.0
2235,Heroes & Generals,882531.0
2035,Goat Simulator,816752.0
...,...,...
5250,Trawl,0.0
5251,Trebuchet,0.0
3975,Robot Exploration Squad,0.0
5256,Tri Original Soundtrack + Artbook,0.0


In [78]:
df_playtime=df_playtime[df_playtime['playtime_forever']>0]

In [79]:
df_playtime

Unnamed: 0,app_name,playtime_forever
0,! That Bastard Is Trying To Steal Our Gold !,43.0
1,"""Glow Ball"" - The Billiard Puzzle Game",21.0
2,$1 Ride,1522.0
4,//N.P.P.D. Rush//- The Milk Of Ultraviolet,14722.0
5,//Snowflake Tattoo//,5034.0
...,...,...
5735,[The Sequence],174.0
5736,Астролорды: Облако Оорта,310.0
5738,侠客风云传(Tale Of Wuxia),92.0
5739,神明的一天世界(God'S One Day World),249.0


In [41]:
data=pd.merge(df_playtime['app_name'],df_games,left_on='app_name',right_on='app_name',how='inner')

In [47]:
data.drop('release_year',axis=1,inplace=True)

In [48]:
data.to_parquet('Datasets/data.parquet')

## Modelo de la similitud del coseno

In [81]:
df=pd.read_parquet('Datasets/data.parquet')

In [105]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5211 entries, 0 to 5210
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  5211 non-null   object
 1   genres    5211 non-null   int32 
 2   tags      5211 non-null   int32 
 3   id        5211 non-null   int32 
dtypes: int32(3), object(1)
memory usage: 101.9+ KB


In [97]:
le = LabelEncoder()
df['genres'] = le.fit_transform(df['genres'])
df['tags'] = le.fit_transform(df['tags'])

In [55]:
df.reset_index(drop=True, inplace=True)

In [56]:
#Vectorizar el nombre de los productos
vectorizer=TfidfVectorizer()
tfidf_matrix=vectorizer.fit_transform(df['app_name'])

In [57]:
# Calcular la cantidad de memoria en bytes
memory_in_bytes = (tfidf_matrix.data.nbytes + tfidf_matrix.indices.nbytes + tfidf_matrix.indptr.nbytes)

# Convertir a megabytes
memory_in_megabytes = memory_in_bytes / (1024 * 1024)

print(f"Memory: {memory_in_megabytes} MB")

Memory: 0.18424224853515625 MB


In [98]:
#Combinar características
features=np.column_stack([tfidf_matrix.toarray(),df['genres'],df['tags']])

In [99]:
#Calcular la matriz de similitud del coseno
similarity_matrix=cosine_similarity(features)

In [104]:

# Calcular la cantidad de memoria en bytes
memory_in_bytes = similarity_matrix.nbytes

# Convertir a megabytes
memory_in_megabytes = memory_in_bytes / (1024 * 1024)

print(f"Memory: {memory_in_megabytes} MB")

Memory: 207.17255401611328 MB


In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5211 entries, 0 to 5210
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   app_name  5211 non-null   object
 1   genres    5211 non-null   int32 
 2   tags      5211 non-null   int32 
 3   id        5211 non-null   int32 
dtypes: int32(3), object(1)
memory usage: 101.9+ KB


In [102]:
df[df['app_name'].str.startswith('Co',na=False)]

Unnamed: 0,app_name,genres,tags,id
896,Co-Op : Decrypted,1,64,315130
897,Coast Guard,1,6,361200
898,Cobalt,0,4,357340
899,Code Of Princess,0,4,408640
900,Codename Cure,0,52,355180
...,...,...,...,...
968,Counter Spell,0,4,351230
969,Counter-Strike Nexon: Zombies,0,52,273110
970,Counterattack,0,42,451600
971,Countless Rooms Of Death,1,64,341380


In [112]:

item_id=273110
if df[df['id']==item_id].empty:
    print('No existe el item')
else:
    producto_index=df[df['id']==273110].index[0]
    product_similarities=similarity_matrix[producto_index]
    most_similar_products_index=np.argsort(-product_similarities)[1:6]
    most_similar_products=df.loc[most_similar_products_index,'app_name']
    print(most_similar_products)

3075                                         Omg Zombies!
2232                                      Infinium Strike
1028                          Ctu: Counter Terrorism Unit
5162                                                    Z
3561    Resident Evil Revelations 2 / Biohazard Revela...
Name: app_name, dtype: object


In [116]:
most_similar_products.to_json(orient='records')

'["Omg Zombies!","Infinium Strike","Ctu: Counter Terrorism Unit","Z","Resident Evil Revelations 2 \\/ Biohazard Revelations 2"]'