# Modelo de recomendación

In [1]:
import numpy as np
import pandas as pd
import pandas.util.testing as tm
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise import SVD, SVDpp, NMF, SlopeOne, CoClustering, KNNBaseline, KNNWithZScore, KNNWithMeans, KNNBasic, BaselineOnly, NormalPredictor
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
import matplotlib.pyplot as plt
import seaborn as sns
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

  This is separate from the ipykernel package so we can avoid doing imports until


In [2]:
# Start (or reuse) the Spark session for this notebook under the application name 'CADS'.
spark = (
    SparkSession.builder
    .appName('CADS')
    .getOrCreate()
)

In [3]:
# Read the reviews JSON data from the shared data directory into a Spark DataFrame.
dfspark = spark.read.json('../common/data/reviews_clean')

In [4]:
# Keep only the three columns used by the recommender: reviewer, product and rating.
data = dfspark.select('reviewerId', 'productId', 'rating')

In [5]:
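# Disabled: sorting by review time is not applied. As written the line below could not run:
# `modelo` is only defined in a later cell, `col` is never imported (F.col is available),
# and `reviewTime` is not among the columns selected into `data`.
#modelo = modelo.sort(col('reviewTime').desc())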

In [6]:
# Rename the id columns to the short names used by the rest of the notebook.
# 'rating' already has its final name, so the former no-op rename of it is dropped.
data = (
    data
    .withColumnRenamed('reviewerId', 'user')
    .withColumnRenamed('productId', 'item')
)

In [7]:
# Sanity check: print the column names and types, then preview the first rows.
data.printSchema()
data.show()

root
 |-- user: string (nullable = true)
 |-- item: string (nullable = true)
 |-- rating: long (nullable = true)

+--------------+----------+------+
|          user|      item|rating|
+--------------+----------+------+
|A16EGPYX5X2GWT|0006353282|     5|
|A3DJZNG4TJIIJK|0006353282|     4|
|A10F2RGD2D36ZI|0006353282|     4|
| AP9ZRBLZ5CRAO|0006353282|     5|
|A2BIK37ZMXKW91|0006353282|     4|
|A2EF9S6J4ZSR1O|0006353282|     4|
| AUY7L0SJNUM95|0006353282|     4|
| ATM5S2UD11UPI|0006353282|     5|
|A14EQQTXMRWS8J|0006353282|     3|
| AHK6ASC7RULUB|0006353282|     5|
| A74CX68WG4UQN|0006353282|     5|
| AT34AL3XSVYEP|0006353282|     5|
|A1V8NBMG0PDVK1|0006353282|     5|
|A1KZX8NQGEEQJD|0006353282|     5|
|   AFVQZQ8PW0L|0006353282|     5|
|A27FP0S5QH7GNE|0006353282|     5|
|A104ZQFWRTN5R8|0006353282|     5|
|A1PZQKPTXUG81N|0006353282|     5|
| AVKDFF2BHS17J|0006353282|     5|
|A3A6GXILVT9JNW|0006353282|     5|
+--------------+----------+------+
only showing top 20 rows



In [8]:
# Convert the rating column (inferred as long when the JSON was read) to float.
data = data.withColumn("rating", F.col("rating").cast("Float"))

In [9]:
# Reference product id: the users who rated this item define the subset of ratings used below.
producto = '0006353282'

In [10]:
# Ratings given to the reference product (one row per user/rating of that item).
usuarios = data.where(F.col('item') == producto)

In [11]:
# Keep every rating made by a user who also rated the reference product.
# The join is expressed on the column *name* rather than `data.user == usuarios.user`:
# `usuarios` is derived from `data`, so both sides of that comparison point at the same
# underlying column and the condition is an ambiguous self-join.
# A left-semi join only filters `data`; no columns from the right side are added.
modelo = data.join(usuarios.select('user'), on='user', how='leftsemi')

In [12]:
# Distinct user ids present in the filtered ratings (still a Spark DataFrame here).
users = modelo.select('user').dropDuplicates()

In [13]:
# Distinct item ids present in the filtered ratings (still a Spark DataFrame here).
items = modelo.select('item').dropDuplicates()

In [14]:
# Materialise the distinct user ids as a plain Python list on the driver.
# Built directly from `modelo` so the cell is idempotent: the previous version called
# `.select` on `users` itself, which fails on a re-run once `users` is already a list.
users = [fila['user'] for fila in modelo.select('user').distinct().collect()]

In [15]:
# Number of distinct users who rated the reference product.
len(users)

46

In [16]:
# Materialise the distinct item ids as a plain Python list on the driver.
# Built directly from `modelo` so the cell is idempotent: the previous version called
# `.select` on `items` itself, which fails on a re-run once `items` is already a list.
items = [fila['item'] for fila in modelo.select('item').distinct().collect()]

In [17]:
# Number of distinct items rated by those users (the candidate items for prediction).
len(items)

21516

In [18]:
# Total number of rating rows kept after the semi-join.
modelo.count()

22198

In [19]:
# Turn on the Arrow setting for PySpark before the toPandas() conversion below.
# NOTE(review): this configuration key is presumably version-dependent — confirm against the Spark version in use.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [20]:
# Bring the filtered ratings to the driver as a pandas DataFrame, capped at 600000 rows.
# NOTE(review): no ordering is applied before limit(), so if the cap were ever reached the
# retained subset would presumably be arbitrary — confirm whether that matters.
pandas_df = modelo.limit(600000).toPandas()

In [21]:
# Distinct item ids in the pandas copy; missing values are counted too (dropna=False),
# exactly as len(unique()) would.
pandas_df["item"].nunique(dropna=False)

21516

In [22]:
# Similarity settings: cosine similarity computed between items (user_based=False).
sim_options = dict(name="cosine", user_based=False)

# KNNWithMeans predictor configured with the similarity settings above.
algoritmo = KNNWithMeans(sim_options=sim_options)

In [23]:
# Reader declaring that ratings lie on a 1-to-5 scale.
reader = Reader(rating_scale=(1, 5))

In [24]:
# Build the surprise Dataset from the (user, item, rating) columns, in that order.
data_n = Dataset.load_from_df(pandas_df.loc[:, ["user", "item", "rating"]], reader)

In [25]:
# Use every available rating for training (no hold-out split is made here).
trainingSet = data_n.build_full_trainset()

In [None]:
# Train the KNN model on the full training set.
algoritmo.fit(trainingSet)

In [31]:
# Target user for whom ratings are predicted.
usuario = 'A16EGPYX5X2GWT'

# Predict this user's rating for every candidate item (lazily, one item at a time).
predicciones = (algoritmo.predict(usuario, item[:50]) for item in items)

# Keep [estimate, item id] pairs whose estimate is above 1, sorted ascending by estimate
# and then by item id, so the best-scored items sit at the end of the list.
recomend = sorted([pred.est, pred.iid] for pred in predicciones if pred.est > 1)

# Show the five highest-scored pairs.
recomend[-5:]

[[5, 'B008M2Q9C6'],
 [5, 'B008M4C1BC'],
 [5, 'B008M4J9B2'],
 [5, 'B008M4KB0K'],
 [5, 'B008M4RMJ8']]

In [32]:
# Read the products JSON data from the shared data directory; it is used to look up product titles.
dfspark_p = spark.read.json('../common/data/products_etl')

In [33]:
# Preview the first rows of the products table.
dfspark_p.show()

+--------------------+------+----------+--------------------+--------------------+
|          Categories| price| productId|             related|               title|
+--------------------+------+----------+--------------------+--------------------+
|Sports & Outdoors...| 29.98|B00IO9H7DM|, B00IO9H84U, B00...|Nike Girls (8-16)...|
|Clothing, Shoes &...|265.68|B00IO93358|, B00IHSSRKW, B00...|Coach 27348 Peyto...|
|Apps for Android,...|  null|B00IO9EWB2|B00LBH2UMM, B00GY...|                null|
|Health & Personal...| 17.99|B00IO9JF2I|, B004DR7YPK, B00...|Nutri Vitae Plus ...|
|Clothing, Shoes &...|  11.2|B00IO9H3YA|, B00JHDGW8U, B00...|Hanes Women`s Cla...|
|Sports & Outdoors...|  null|B00IO9HGUG|B00ACIFF0A, B00C5...|Hanes Adult X-Tem...|
|Automotive, Perfo...|  96.1|B00IO9CKW0|                null|Max KT001831 Fron...|
|Books, Humor & En...|  null|B00IO9JVJU|                null|                null|
|Cell Phones & Acc...|  20.1|B00IO91L0C|, B0098FKL1Q, B00...|Incipio Stashback...|
|Hea

In [34]:
def item_id_to_name(id):
    """Return the title of the product whose ``productId`` equals ``id``.

    The lookup runs against the module-level ``dfspark_p`` products DataFrame.

    Args:
        id: Product identifier to look up (compared against the ``productId`` column).

    Returns:
        The value of the ``title`` column for the first matching product. This is
        ``None`` when the product has no title or when no product matches ``id``.
    """
    # Select the column by name rather than by position so the lookup does not depend
    # on column order, and fetch only the first match instead of collecting every row.
    fila = (
        dfspark_p
        .filter(dfspark_p.productId == id)
        .select('title')
        .first()
    )
    # first() yields None when the filter matches nothing; report that as "no title"
    # instead of failing with an IndexError.
    return None if fila is None else fila['title']
                            


In [35]:
# Print the product title for each of the (up to) five best-scored recommendations,
# in ascending score order.
# Iterating over the slice directly, rather than indexing positions 0..4, also works when
# fewer than five recommendations exist and avoids re-slicing the list on every pass.
for _calification, prod_id in recomend[-5:]:
    print(f'Para el usuario {usuario} la recomendacion es : {item_id_to_name(prod_id)}')

Para el usuario A1SXASF6GYG96I la recomendacion es : None
Para el usuario A1SXASF6GYG96I la recomendacion es : None
Para el usuario A1SXASF6GYG96I la recomendacion es : None
Para el usuario A1SXASF6GYG96I la recomendacion es : None
Para el usuario A1SXASF6GYG96I la recomendacion es : Sterling Silver Diamond Fashion Ring (0.1 Cttw, H-I Color, I3 Clarity), Size 8


In [37]:
# Spot-check the title lookup for a single recommended product id.
print(item_id_to_name('B008M2Q9C6'))

None
