## Machine Learning

Para trabajar con sistemas de recomendación, emplearemos la librería [Surprise](https://surprise.readthedocs.io/en/stable/); su documentación está disponible en ese enlace.

Tendremos que llevar nuestro dataset al formato con el que trabaja esta librería.

In [1]:
# Mount Google Drive inside the Colab runtime so the dataset files can be read from it.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Instalamos surprise

!pip install scikit-surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 KB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp39-cp39-linux_x86_64.whl size=3193636 sha256=6c30847584b00844cf2c208bc47b891ccd3a9ad46609bc57057795fad3eaf6a7
  Stored in directory: /root/.cache/pip/wheels/c6/3a/46/9b17b3512bdf283c6cb84f59929cdd5199d4e754d596d22784
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.3


In [3]:
# Importamos librerias

import pandas as pd
import sys
import surprise
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split

In [4]:
# Reader tells Surprise which rating scale to assume; (1, 5) is the library default, spelled out here.
# NOTE(review): confirm that the ratings in score.csv really span 1-5; widen the scale if lower values exist.
reader = Reader(rating_scale=(1, 5))

In [5]:


# Load the ratings file and the title catalogue from Drive using the same parsing options.
# NOTE(review): the merged preview further down shows garbled apostrophes in one title, so
# 'latin-1' may not be the right encoding for the catalogue file - confirm against the source data.
csv_options = {'sep': ',', 'encoding': 'latin-1'}

data_score = pd.read_csv('/content/drive/MyDrive/MyLops/score.csv', **csv_options)
data_movies = pd.read_csv('/content/drive/MyDrive/MyLops/plataformas_prom.csv', **csv_options)

In [6]:
# Keep only the three columns used downstream, then preview the first rows.
score_columns = ['userId', 'rating', 'movieId']
data_score = data_score.loc[:, score_columns]
data_score.head(3)

Unnamed: 0,userId,rating,movieId
0,1,1.0,as680
1,1,4.5,ns2186
2,1,5.0,hs2381


In [7]:
# Keep only the identifier and the title of each catalogue entry, then preview the first rows.
movie_columns = ['Id', 'title']
data_movies = data_movies.loc[:, movie_columns]
data_movies.head(3)

Unnamed: 0,Id,title
0,as1,the grand seduction
1,as2,take care good night
2,as3,secrets of deception


In [8]:
# Left-join the ratings onto the catalogue (movieId -> Id) so every rating row carries its title.

data_movies_score = data_score.merge(
    data_movies,
    how='left',
    left_on='movieId',
    right_on='Id',
)
data_movies_score.head()

Unnamed: 0,userId,rating,movieId,Id,title
0,1,1.0,as680,as680,the english civil war
1,1,4.5,ns2186,ns2186,latte and the magic waterstone
2,1,5.0,hs2381,hs2381,la diosa coronada
3,1,5.0,ns3663,ns3663,"frankensteinÃ¢ÂÂs monsterÃ¢ÂÂs monster, fr..."
4,1,5.0,as9500,as9500,kept woman


In [9]:
N_filas = 100000  # Upper bound on the number of rating rows handed to Surprise

# Columns are passed in user, item, rating order; only the first N_filas rows are used.
ratings_subset = data_movies_score.loc[:, ['userId', 'movieId', 'rating']].iloc[:N_filas]
data = Dataset.load_from_df(ratings_subset, reader)

In [10]:
# Hold out 25% of the ratings for evaluation.
# The seed is fixed so the split, and every metric computed from it, is the same on each run.

trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [11]:
# Model: Surprise's SVD (Singular Value Decomposition) algorithm.
# random_state fixes the random initialisation so repeated runs produce the same model.

from surprise import SVD
model = SVD(random_state=42)

In [12]:
# Fit the model on the training split

model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fe226869d90>

In [13]:
# Run the fitted model over the test split to obtain its predictions

predictions = model.test(testset)

In [14]:
# Inspect a single prediction from the test split (the element at index 1)
predictions[1]

Prediction(uid=805, iid='hs1737', r_ui=4.0, est=3.17828487936829, details={'was_impossible': False})

In [15]:
# Ask the model for the estimate of one explicit (user, item) pair.
# In the raw ratings preview, user 1 has an observed rating of 1.0 for 'as680'.

model.predict(uid=1, iid='as680')

Prediction(uid=1, iid='as680', r_ui=None, est=3.320836486578552, details={'was_impossible': False})

In [16]:
# Pick the user we will build recommendations for

usuario = 682
rating = 4   # Minimum rating for a title to count as liked (keeps 4.0 and above, including 4.5)

# The mask is built from data_movies_score itself. A mask taken from data_score only selects
# the right rows while both frames happen to share exactly the same index.
mask_liked = (data_movies_score['userId'] == usuario) & (data_movies_score['rating'] >= rating)
df_user = data_movies_score[mask_liked].reset_index(drop=True)
df_user

Unnamed: 0,userId,rating,movieId,Id,title
0,682,4.0,as8174,as8174,the trough
1,682,4.0,ns1632,ns1632,rust creek
2,682,5.0,as6216,as6216,sorority row
3,682,4.0,as6436,as6436,all at sea
4,682,4.0,ds1450,ds1450,captain sparky vs. the flying saucers
...,...,...,...,...,...
267,682,4.5,as2347,as2347,gina yashere: skinny b*tch
268,682,4.0,as5843,as5843,ruben guthrie
269,682,4.0,as8480,as8480,bidhilipi
270,682,4.0,ds767,ds767,buffalo dreams


In [17]:
# Candidate pool to be scored for the user: the leading rows of the catalogue.
# NOTE(review): 4499 is an unexplained cut-off - confirm why only this slice of the catalogue is considered.
n_candidatas = 4499
recomendaciones_usuario = data_movies.head(n_candidatas).copy()
print(recomendaciones_usuario.shape)
recomendaciones_usuario.head()

(4499, 2)


Unnamed: 0,Id,title
0,as1,the grand seduction
1,as2,take care good night
2,as3,secrets of deception
3,as4,pink: staying true
4,as5,monster maker


In [18]:
# Collect every rating given by the selected user, i.e. the titles they have already seen

es_del_usuario = data_score['userId'].eq(usuario)
usuario_vistas = data_score.loc[es_del_usuario]
print(usuario_vistas.shape)
usuario_vistas.head()

(823, 3)


Unnamed: 0,userId,rating,movieId
63307,682,4.0,as8174
63308,682,3.0,as1957
63309,682,2.0,as8521
63310,682,3.0,ns2818
63311,682,3.0,hs2555


In [19]:
# Recommend: estimate the user's rating for every candidate title they have not rated yet.
# Titles present in usuario_vistas are dropped first; without this step, titles the user
# has already seen can show up among the recommendations.

ya_vistas = recomendaciones_usuario['Id'].isin(usuario_vistas['movieId'])
recomendaciones_usuario = recomendaciones_usuario[~ya_vistas].copy()
recomendaciones_usuario['Estimate_Score'] = recomendaciones_usuario['Id'].apply(lambda x: model.predict(usuario, x).est)

In [20]:
# Rank the candidates from highest to lowest estimated rating and show the top rows.
recomendaciones_usuario = recomendaciones_usuario.sort_values(
    by='Estimate_Score',
    ascending=False,
)
recomendaciones_usuario.head()

Unnamed: 0,Id,title,Estimate_Score
3330,as3331,alien crash retrievals,3.993491
674,as675,the fabulous allan carr,3.962793
591,as592,the keeping room,3.930976
3712,as3713,tumhari sulu,3.930443
833,as834,sword oratoria: is it wrong to try to pick up ...,3.916719


In [21]:
# Evaluation: root-mean-square error of the test-split predictions.

from surprise import accuracy

accuracy.rmse(predictions, verbose=True)

RMSE: 0.9580


0.9579576012094069