<a href="https://colab.research.google.com/github/Javsk891/PowerBI-Project/blob/main/recsys_YelpReviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sistema de recomendación con reseñas de Yelp
> Actualizado 2024-06-10
> - Incluye las reseñas desde Google Drive (copia propia).

In [None]:
# Obtener reseñas desde Google Drive
!curl -L -o recsys_YelpReviews.zip "https://drive.usercontent.google.com/download?id=1nwPzvTnMXPGSxnKY0Kfues_C7USCGKcV&export=download&confirm=yes"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 95.8M  100 95.8M    0     0  36.4M      0  0:00:02  0:00:02 --:--:-- 36.4M


In [None]:
# Descomprimir > recsys_YelpReviews.csv
!unzip -q 'recsys_YelpReviews.zip'

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Exploración de los datos

In [None]:
# Only the header is needed to list the available columns, so read zero data
# rows here instead of parsing the whole file; the review rows themselves are
# loaded by the later read_csv call that selects the needed columns.
yelp_review = pd.read_csv('recsys_YelpReviews.csv', nrows=0)
yelp_review.columns

Index(['Unnamed: 0', 'business_blank', 'business_categories', 'business_city',
       'business_full_address', 'business_id', 'business_latitude',
       'business_longitude', 'business_name', 'business_neighborhoods',
       'business_open', 'business_review_count', 'business_stars',
       'business_state', 'business_type', 'cool', 'date', 'funny', 'review_id',
       'reviewer_average_stars', 'reviewer_blank', 'reviewer_cool',
       'reviewer_funny', 'reviewer_name', 'reviewer_review_count',
       'reviewer_type', 'reviewer_useful', 'stars', 'text', 'type', 'useful',
       'user_id'],
      dtype='object')

In [None]:
# Load only the identifiers, the rating and the business name. The name stays
# in `yelp_review` for display and is dropped from the modelling frame `data`.
cols = ['user_id', 'business_id', 'business_name', 'stars']
yelp_review = pd.read_csv('recsys_YelpReviews.csv', usecols=cols)
data = yelp_review.drop(columns='business_name')

# Dimensions of the user x business matrix and how many of its cells are filled.
nUsers, nItems = (data[key].nunique() for key in ('user_id', 'business_id'))
nReviews = data.shape[0]

summary_lines = [
    f'Número de usuarios: {nUsers}',
    f'Número de negocios: {nItems}',
    f'Número de posibles reseñas: {nUsers * nItems}',
    f'Número de reseñas: {nReviews} ({100*nReviews/(nUsers * nItems):0.2f}%)',
]
print('\n'.join(summary_lines))

Número de usuarios: 45981
Número de negocios: 11537
Número de posibles reseñas: 530482797
Número de reseñas: 229907 (0.04%)


In [None]:
# Print the column dtypes, non-null counts and memory usage of the modelling frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229907 entries, 0 to 229906
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  229907 non-null  object
 1   stars        229907 non-null  int64 
 2   user_id      229907 non-null  object
dtypes: int64(1), object(2)
memory usage: 5.3+ MB


## Modelo 1: **Basado en ranking**

In [None]:
# Per-business mean rating and number of ratings, computed from one shared
# groupby and placed side by side in a frame indexed by business_id.
stars_by_business = data.groupby('business_id')['stars']
average_ratings = stars_by_business.mean()
count_rating = stars_by_business.count()
final_rating = pd.DataFrame({'avg': average_ratings, 'count': count_rating})

final_rating.head()

Unnamed: 0_level_0,avg,count
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1
--5jkZ3-nUPZxUvtcbr8Uw,4.545455,11
--BlvDO_RG2yElKu9XA1_g,4.162162,37
-0D_CYhlD2ILkmLR0pBmnA,4.0,5
-0QBrNvhrPQCaeo7mTo0zQ,4.333333,3
-0bUDim5OGuv8R0Qqq6J4A,2.333333,6


In [None]:
def top_n_businesses(groupby_data, n, min_interactions=100):
    """Return the index labels of the `n` best-rated rows with enough ratings.

    Parameters
    ----------
    groupby_data : pandas.DataFrame
        Frame with an ``'avg'`` column and a ``'count'`` column; its index
        labels are what gets returned (business ids in this notebook).
    n : int
        How many labels to return. Applied as the slice ``[:n]``, so fewer
        labels come back when fewer rows qualify.
    min_interactions : int, default 100
        Rows whose ``'count'`` is below this threshold are excluded.

    Returns
    -------
    pandas.Index
        Labels ordered by ``'avg'`` descending; rows with the same ``'avg'``
        are ordered by ``'count'`` descending.
    """
    eligible = groupby_data[groupby_data['count'] >= min_interactions]
    # Sorting on 'avg' alone leaves tied rows in an arbitrary order. Break ties
    # with 'count' so that, at equal average, the row backed by more ratings
    # ranks first and the result is deterministic.
    ranked = eligible.sort_values(by=['avg', 'count'], ascending=[False, False])
    return ranked.index[:n]

In [None]:
top5 = list(top_n_businesses(final_rating, n=5, min_interactions=100))

# Look the names up by business id so they are displayed in ranking order
# (best first). Filtering with isin() and taking unique() would list them in
# file order and merge distinct businesses that share a name.
business_names = (
    yelp_review
    .drop_duplicates(subset='business_id')
    .set_index('business_id')['business_name']
)
business_names.loc[top5].to_numpy()

array(['Yogurtland', 'Short Leash Dogs', 'Café Monarch',
       'Changing Hands Bookstore', 'Paletas Betty'], dtype=object)

## Modelo 2: **Filtro colaborativo ítem a ítem**

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357275 sha256=c3561c499fbfe1d3a27a2830eaadfcc953b44a3d296612e8166200dcc79ea6ae
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Install

In [None]:
from surprise import accuracy
from surprise.reader import Reader
from surprise.dataset import Dataset
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split

from surprise.prediction_algorithms.knns import KNNBasic
from surprise.prediction_algorithms.matrix_factorization import SVD

## Modelo 3: **Factores latentes**