# 1 - Install

In [None]:
!pip install pyspark
!pip install recommenders

# 2 - Imports

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from google.colab import drive

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import explode, col

from pyspark.ml.recommendation import ALS

from recommenders.utils.timer import Timer
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation

# 3 - Google Drive

In [None]:
drive.mount(r'/content/drive/')
!ls "/content/drive/MyDrive/Dataset/"

# 4 - Sessão Spark

In [4]:
# Create the Spark session named 'SMDI', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('SMDI')
    .getOrCreate()
)

# 5 - Preliminaries

In [5]:
# Column names of the ratings (fact) table and the evaluation cut-off.
COL_USER = "user_id"
COL_ITEM = "item_id"
COL_RATING = "rating"
COL_TIMESTAMP = "timestamp"
COL_PREDICTION = "prediction"
TOP_K = 10

# Schema of the ratings table: each loaded column is a nullable integer.
# COL_TIMESTAMP is not part of the schema, so no timestamp column is read.
schema = StructType(
    [
        StructField(field_name, IntegerType(), True)
        for field_name in (COL_USER, COL_ITEM, COL_RATING)
    ]
)

# 6 - Dataset

In [8]:
# Dividindo a base entre treino e teste
train = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI-500E_train.csv", schema=schema)
test = spark.read.csv("/content/drive/MyDrive/Dataset/SMDI-500E_test.csv", schema=schema)

# 7 - Treinando

In [12]:
# Instanciado o modelo de recomendação ALS
als = ALS(
        userCol=COL_USER,
        itemCol=COL_ITEM,
        ratingCol=COL_RATING,
        coldStartStrategy="drop",
        regParam=0.1,
        maxIter=15,
        rank=128,
        seed=2023,
        nonnegative=True
)

with Timer() as train_time:
    model = als.fit(train)
print("Took {} seconds for training.".format(train_time.interval))

Took 146.716975069 seconds for training.


# 8 - Avaliando

In [13]:
# Obtenha a junção cruzada de todos os pares de item de usuário e pontue-os.
users = train.select(COL_USER).distinct()
items = train.select(COL_ITEM).distinct()
user_item = users.crossJoin(items)
dfs_pred = model.transform(user_item)

# Remova itens vistos pelo treinamento, ou seja, esta ficando só com as cobinações de usuário e item que não foram impostas ao treinamento do modelo.
from pyspark.sql.functions import col
dfs_pred_exclude_train_old = dfs_pred.alias("pred").join(train.alias("train"), (col(f"pred.{COL_USER}") == col(f"train.{COL_USER}")) & (col(f"pred.{COL_ITEM}") == col(f"train.{COL_ITEM}")), how='outer' )
dfs_pred_exclude_train = dfs_pred.alias("pred").join(train.alias("train"), [COL_USER, COL_ITEM], how="outer")

top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f"train.{COL_RATING}"].isNull()).select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + "prediction")

In [14]:
# Ranking evaluation of the unseen-item predictions against the test set.
# All column names come from the shared COL_* constants so the evaluator
# stays in sync with the schema and the prediction column.
rank_eval = SparkRankingEvaluation(
    test,      # observed interactions (ground truth)
    top_all,   # predicted scores for pairs not seen in training
    k=TOP_K,
    col_user=COL_USER,
    col_item=COL_ITEM,
    col_rating=COL_RATING,
    col_prediction=COL_PREDICTION,
    relevancy_method="top_k"
)

# Print one metric per line, each computed at the evaluator's cut-off k.
print(
    "Model:\tALS",
    "Top K:\t%d" % rank_eval.k,
    "MAP:\t%f" % rank_eval.map_at_k(),
    "NDCG:\t%f" % rank_eval.ndcg_at_k(),
    "Precision@K:\t%f" % rank_eval.precision_at_k(),
    "Recall@K:\t%f" % rank_eval.recall_at_k(),
    sep='\n'
)



Model:	ALS
Top K:	10
MAP:	0.000257
NDCG:	0.002032
Precision@K:	0.002195
Recall@K:	0.001184


# 8.1 - Teste usuário 17

In [16]:
# Keep only the user-id column of the training rows that belong to user 17;
# this DataFrame is the user subset passed to the recommender below.
user17 = train.filter(train[COL_USER] == 17).select(train[COL_USER])

In [17]:
# Ask the model for the top 10 items for the users in `user17`.
user17_recs = model.recommendForUserSubset(dataset=user17, numItems=10)

# `recommendations` is an array column: explode it into one row per
# recommended item, then flatten the struct fields into plain columns.
user17_recs_flat = user17_recs.withColumn("rec_exp", explode("recommendations"))
user17_recs_flat.select(
    'user_id',
    col("rec_exp.item_id"),
    col("rec_exp.rating"),
).show()

+-------+-------+----------+
|user_id|item_id|    rating|
+-------+-------+----------+
|     17|   5279| 0.8998934|
|     17|    108| 0.8998916|
|     17|   3408|0.89989156|
|     17|     80|0.89989156|
|     17|   5100|0.89989156|
|     17|   5116|0.89989156|
|     17|     60| 0.8998915|
|     17|     20| 0.8998915|
|     17|     30| 0.8998915|
|     17|     40| 0.8998915|
+-------+-------+----------+

