In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [2]:
# creiamo la sessione
conf = SparkConf().set("spark.ui.port", "4050")

# creiamo il contesto
sc = pyspark.SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

# I dati

In questo notebook utilizzeremo il dataset [MovieLens](https://grouplens.org/datasets/movielens/), in particolare il dataset 100K contenente circa 100000 ratings, da 1000 utenti di cierca 1700 film.

Caricheremo i rating nelle percentuali 80%-20% training/test, mentre il dataframe items contiene i il titolo del film e ulteriori informazioni.



In [3]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# autenticazione e creazione del client PyDrive
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [4]:
id='1yQh7-j1Iw_C39dJq17yLv89uSiD8mcsq'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('MovieLens.training')


id='1t_8KZGUNgJGwxhvTzvs0CwmIoaTqGHbl'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('MovieLens.test')

id='1Kp09G2Iw0mSN-1xZJLWInSNG7v8TM81S'
downloaded = drive.CreateFile({'id': id})
downloaded.GetContentFile('MovieLens.item')

In [5]:
schema_ratings = StructType([
    StructField("user_id", IntegerType(), False),
    StructField("item_id", IntegerType(), False),
    StructField("rating", IntegerType(), False),
    StructField("timestamp", IntegerType(), False)])

schema_items = StructType([
    StructField("item_id", IntegerType(), False),
    StructField("movie", StringType(), False)])

training = spark.read.option("sep", "\t").csv("MovieLens.training", header=False, schema=schema_ratings)
test = spark.read.option("sep", "\t").csv("MovieLens.test", header=False, schema=schema_ratings)
items = spark.read.option("sep", "|").csv("MovieLens.item", header=False, schema=schema_items)

In [6]:
training.printSchema()
test.printSchema()
training.take(3)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



[Row(user_id=1, item_id=1, rating=5, timestamp=874965758),
 Row(user_id=1, item_id=2, rating=3, timestamp=876893171),
 Row(user_id=1, item_id=3, rating=4, timestamp=878542960)]

In [7]:
items.printSchema()
items.take(3)

root
 |-- item_id: integer (nullable = true)
 |-- movie: string (nullable = true)



[Row(item_id=1, movie='Toy Story (1995)'),
 Row(item_id=2, movie='GoldenEye (1995)'),
 Row(item_id=3, movie='Four Rooms (1995)')]

# Esercizio 1 (Consegna 5 maggio)

- calcolare alcune statistiche sui dati. Numero di revisioni per ogni film. Rating medio per ogni film, rating medio per ogni utente
- Usando il training set, addestrare un modello con  Alternating Least Squares method disponibile in Spark MLlib: [https://spark.apache.org/docs/latest/ml-collaborative-filtering.html](https://spark.apache.org/docs/latest/ml-collaborative-filtering.html)
- calcolare il RMSE sul test set
- produrre le top k raccomandazioni per ogni utente (k=10)




In [8]:
ratings = training.union(test)
ratings.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- item_id: integer (nullable = true)
 |-- rating: integer (nullable = true)
 |-- timestamp: integer (nullable = true)



### Calcoliamo qualche statistica

In [9]:
n_revision_by_film = ratings.groupBy("item_id").count()
n_revision_by_film.show()

mean_by_user = ratings.groupBy("user_id").mean()
mean_by_user.show()

mean_by_film = ratings.groupBy("item_id").mean()
mean_by_film.show()

+-------+-----+
|item_id|count|
+-------+-----+
|    496|  231|
|    463|   71|
|    471|  221|
|    833|   49|
|    148|  128|
|   1088|   13|
|   1238|    8|
|   1580|    1|
|   1591|    6|
|   1645|    1|
|   1342|    2|
|    392|   68|
|    540|   43|
|    243|  132|
|    737|   59|
|    897|    2|
|    858|    3|
|   1084|   21|
|    623|   39|
|   1025|   44|
+-------+-----+
only showing top 20 rows

+-------+------------+------------------+------------------+-------------------+
|user_id|avg(user_id)|      avg(item_id)|       avg(rating)|     avg(timestamp)|
+-------+------------+------------------+------------------+-------------------+
|    148|       148.0|315.03076923076924|               4.0|8.771647431230769E8|
|    463|       463.0|  515.187969924812|2.8646616541353382|8.821840331578947E8|
|    471|       471.0| 411.1290322580645|3.3870967741935485|8.898279313225807E8|
|    496|       496.0|  433.015503875969|3.0310077519379846|8.760676046279069E8|
|    833|       833.0| 

### Addestramento modello ALS

In [18]:
from pyspark.ml.recommendation import ALS
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id", ratingCol="rating", coldStartStrategy="drop")
model = als.fit(training)

In [19]:
model

ALSModel: uid=ALS_b2f79cceb4b9, rank=10

### Calcoliamo l'errore

In [20]:
from pyspark.ml.evaluation import RegressionEvaluator
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1248474247333107


### Calcoliamo le raccomandazioni

In [21]:
k = 10
user_recs = model.recommendForAllUsers(k)
user_recs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{854, 7.351436},...|
|      3|[{1428, 9.875046}...|
|      5|[{1184, 8.137185}...|
|      6|[{1126, 6.383986}...|
|      9|[{446, 13.307565}...|
|     12|[{982, 7.3987474}...|
|     13|[{1280, 7.1823993...|
|     15|[{1062, 8.90009},...|
|     16|[{703, 8.239677},...|
|     17|[{914, 10.175524}...|
|     19|[{1174, 8.572823}...|
|     20|[{1036, 9.9008045...|
|     22|[{1434, 9.534663}...|
|     26|[{904, 5.4380465}...|
|     27|[{1428, 9.950672}...|
|     28|[{1131, 7.885643}...|
|     31|[{103, 6.409612},...|
|     34|[{1131, 15.608356...|
|     35|[{974, 9.207007},...|
|     37|[{1131, 8.719027}...|
+-------+--------------------+
only showing top 20 rows



In [28]:
exploded_recs = user_recs.select(
    col("user_id"),
    explode(col("recommendations")).alias("recommendation_item")
)

exploded_recs = exploded_recs.select(
    col("user_id"),
    col("recommendation_item.item_id").alias("item_id"),
    col("recommendation_item.rating").alias("predicted_rating")
)

joined_recs = exploded_recs.join(
    items,
    on="item_id",
    how="inner"
)

joined_recs.orderBy(
      col("user_id").asc(),
      col("predicted_rating").desc()
  ).show(truncate=False)

+-------+-------+----------------+------------------------------------------------------------------+
|item_id|user_id|predicted_rating|movie                                                             |
+-------+-------+----------------+------------------------------------------------------------------+
|854    |1      |7.351436        |Bad Taste (1987)                                                  |
|1160   |1      |7.1268516       |Love! Valour! Compassion! (1997)                                  |
|1643   |1      |6.813043        |Angel Baby (1995)                                                 |
|1192   |1      |6.4859266       |Boys of St. Vincent, The (1993)                                   |
|1368   |1      |6.3977532       |Mina Tannenbaum (1994)                                            |
|904    |1      |6.376305        |Ma vie en rose (My Life in Pink) (1997)                           |
|1631   |1      |6.3331857       |Slingshot, The (1993)                           