# 1 - Install

In [None]:
!pip install pyspark
!pip install recommenders

# 2 - Imports

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import drive

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql.functions import explode, col

from pyspark.ml.recommendation import ALS

from recommenders.utils.timer import Timer
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation, SparkRatingEvaluation

# 3 - Google Drive

In [None]:
drive.mount(r'/content/drive/')
!ls "/content/drive/MyDrive/Dataset/"

# 4 - Sessão Spark

In [4]:
spark = SparkSession.builder.appName('SMDI').getOrCreate()

 # 5 - Preliminaries

In [5]:
# Contruindo o schema da tabela fato.
COL_USER = "user_id"
COL_ITEM = "item_id"
COL_RATING = "rating"
COL_TIMESTAMP = "timestamp"
COL_PREDICTION = "prediction"
TOP_K = 10

schema = StructType(
    [
      StructField(COL_USER, IntegerType(), True),
      StructField(COL_ITEM, IntegerType( ), True),
      StructField(COL_RATING, IntegerType(), True)
      # StructField(COL_TIMESTAMP, IntegerType(), True)
    ]
  )

# 6 - Dataset

In [7]:
# Carregando a tabela de dimensão dos itens
SMDI_items = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI_items.csv", inferSchema=True, header=True)

# Carregando a tabela de dimensão dos usuários
SMDI_users = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI_users.csv", inferSchema=True, header=True)

In [8]:
# Dividindo a base entre treino e teste
train = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI-500E_train.csv", schema=schema)
test = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI-500E_test.csv", schema=schema)

# 7 - Treinando

In [12]:
# Instanciado o modelo de recomendação ALS
als = ALS(
        userCol=COL_USER,
        itemCol=COL_ITEM,
        ratingCol=COL_RATING,
        coldStartStrategy="drop",
        regParam=0.1,
        maxIter=15,
        rank=128,
        seed=2023,
        nonnegative=True
)

with Timer() as train_time:
    model = als.fit(train)
print("Took {} seconds for training.".format(train_time.interval))

Took 146.716975069 seconds for training.


# 8 - Avaliando

In [13]:
# Obtenha a junção cruzada de todos os pares de item de usuário e pontue-os.
users = train.select(COL_USER).distinct()
items = train.select(COL_ITEM).distinct()
user_item = users.crossJoin(items)
dfs_pred = model.transform(user_item)

# Remova itens vistos pelo treinamento, ou seja, esta ficando só com as cobinações de usuário e item que não foram impostas ao treinamento do modelo.
from pyspark.sql.functions import col
dfs_pred_exclude_train_old = dfs_pred.alias("pred").join(train.alias("train"), (col(f"pred.{COL_USER}") == col(f"train.{COL_USER}")) & (col(f"pred.{COL_ITEM}") == col(f"train.{COL_ITEM}")), how='outer' )
dfs_pred_exclude_train = dfs_pred.alias("pred").join(train.alias("train"), [COL_USER, COL_ITEM], how="outer")

top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f"train.{COL_RATING}"].isNull()).select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + "prediction")

In [14]:
rank_eval = SparkRankingEvaluation(
    test, #observado
    top_all, #predito
    k = TOP_K, 
    col_user=COL_USER, 
    col_item=COL_ITEM,
    col_rating=COL_RATING,
    col_prediction="prediction",
    relevancy_method="top_k"
)

print(
    "Model:\tALS",
    "Top K:\t%d" % rank_eval.k,
    "MAP:\t%f" % rank_eval.map_at_k(), 
    "NDCG:\t%f" % rank_eval.ndcg_at_k(),
    "Precision@K:\t%f" % rank_eval.precision_at_k(),
    "Recall@K:\t%f" % rank_eval.recall_at_k(), 
    sep='\n'
)



Model:	ALS
Top K:	10
MAP:	0.000257
NDCG:	0.002032
Precision@K:	0.002195
Recall@K:	0.001184


# 8.1 - Recomendações

In [18]:
# Gerando a quantidade de recomendações por usuário
recommendations = model.recommendForAllUsers(TOP_K)

In [19]:
# Explodindo as informações em novas colunas para melhor visualizar
recommendations = recommendations\
    .withColumn("rec_exp", explode("recommendations"))\
    .select('user_id', col("rec_exp.item_id"), col("rec_exp.rating"))
recommendations.limit(15).show()

+-------+-------+----------+
|user_id|item_id|    rating|
+-------+-------+----------+
|      1|   5279| 0.8998934|
|      1|    108| 0.8998916|
|      1|   5116|0.89989156|
|      1|   5100|0.89989156|
|      1|   3408|0.89989156|
|      1|     80|0.89989156|
|      1|   3091|0.89989156|
|      1|     20| 0.8998915|
|      1|     30| 0.8998915|
|      1|     40| 0.8998915|
|      5|   5279| 0.8998934|
|      5|    108| 0.8998916|
|      5|   3408|0.89989156|
|      5|     80|0.89989156|
|      5|   5100|0.89989156|
+-------+-------+----------+



# 8.2 - Teste usuário 17

In [16]:
user17 = train.filter(train.user_id == 17).select(train.user_id)

In [17]:
(model.recommendForUserSubset(dataset=user17, numItems=10)
    .withColumn("rec_exp", explode("recommendations"))
    .select('user_id', col("rec_exp.item_id"), col("rec_exp.rating")).show())

+-------+-------+----------+
|user_id|item_id|    rating|
+-------+-------+----------+
|     17|   5279| 0.8998934|
|     17|    108| 0.8998916|
|     17|   3408|0.89989156|
|     17|     80|0.89989156|
|     17|   5100|0.89989156|
|     17|   5116|0.89989156|
|     17|     60| 0.8998915|
|     17|     20| 0.8998915|
|     17|     30| 0.8998915|
|     17|     40| 0.8998915|
+-------+-------+----------+



### 8.2.1 Análise

In [20]:
user_id = '17'

In [21]:
# Recomendações feitas para o usuário
recommendations.join(SMDI_items, on='item_id').filter(f'user_id = {user_id}').show()

+-------+-------+----------+----------+--------+---------+---------+---------+---------+-------+
|item_id|user_id|    rating|section_id|brand_id|ref_price|avg_price|min_price|max_price| amount|
+-------+-------+----------+----------+--------+---------+---------+---------+---------+-------+
|   5279|     17| 0.8998934|        39|      10|      4.9|      4.9|      4.9|      4.9|   29.4|
|    108|     17| 0.8998916|        45|       7|     4.99|     7.85|     4.99|     7.91|  408.3|
|   3408|     17|0.89989156|        49|      47|     11.9|     11.9|     11.9|     11.9|   35.7|
|     80|     17|0.89989156|        45|       7|     4.99|      5.9|     3.99|      7.9| 724.58|
|   5100|     17|0.89989156|        47|    1373|     13.9|     13.9|     13.9|     13.9|   41.7|
|   5116|     17|0.89989156|        37|     950|      9.9|      9.9|      9.9|      9.9|   19.8|
|     60|     17| 0.8998915|        45|       7|     3.99|     2.09|     1.19|     3.99|1901.08|
|     20|     17| 0.8998915|  

In [22]:
# Aqui foi a base de teste, onde ele usou para prever quais vão ser os itens que serão comprados
test.join(SMDI_items, on='item_id').filter(f'user_id = {user_id}').sort('rating', ascending=False).show()

+-------+-------+------+----------+--------+---------+---------+---------+---------+-------+
|item_id|user_id|rating|section_id|brand_id|ref_price|avg_price|min_price|max_price| amount|
+-------+-------+------+----------+--------+---------+---------+---------+---------+-------+
|   3231|     17|     1|        29|     678|     3.59|      3.6|     3.59|     3.98| 734.73|
|    656|     17|     1|        47|     125|     4.29|     3.94|      2.8|     4.29| 255.99|
|     91|     17|     1|        45|       7|     5.99|     4.25|     1.99|      6.0|4914.92|
|    750|     17|     1|        47|     238|     5.49|     5.49|     3.99|     5.99| 609.39|
|   1072|     17|     1|        46|     159|     2.79|      2.9|     1.99|     3.41|2260.32|
|    871|     17|     1|        47|     105|     2.49|     2.69|     2.69|     2.69|2014.81|
|   4009|     17|     1|        39|      10|      5.9|      5.9|      5.9|      5.9|  643.1|
|   1150|     17|     1|        48|      14|      1.9|     1.92|      

In [23]:
# Aqui na base de treino foi a utilizada para entender o "comportamento" do usuário
train.join(SMDI_items, on='item_id').filter(f'user_id = {user_id}').sort('rating', ascending=False).show()

+-------+-------+------+----------+--------+---------+---------+---------+---------+--------+
|item_id|user_id|rating|section_id|brand_id|ref_price|avg_price|min_price|max_price|  amount|
+-------+-------+------+----------+--------+---------+---------+---------+---------+--------+
|   4523|     17|     1|        47|     450|     2.99|     2.86|     2.09|     2.99|  163.23|
|   2507|     17|     1|        48|      14|     19.9|    19.64|    16.99|    19.99| 2561.27|
|     15|     17|     1|        41|       1|    21.99|    25.97|    17.89|     34.9| 2392.58|
|    706|     17|     1|        47|     227|     2.99|     2.91|     2.19|     2.99| 2609.23|
|    585|     17|     1|        46|     150|     1.79|     1.81|     1.79|     1.99|  692.42|
|   3245|     17|     1|        47|     587|     3.99|     4.04|     3.99|     4.39|  286.49|
|     97|     17|     1|        45|       7|     4.99|     4.32|     3.98|      6.0| 1767.65|
|   1433|     17|     1|        49|      40|     1.79|     1