# Some Notes:
- This recommendation notebook follows the same approach as the file recommendation_mllib; the only difference is that ALS here comes from pyspark.ml and takes a DataFrame as input, instead of the RDD used by pyspark.mllib.
- Another difference between the two libraries is that predictions from the pyspark.ml ALS contain some nulls, while those from the pyspark.mllib ALS do not, because the random split behaves differently for RDDs and DataFrames.

## 1. Setup google colab

In [31]:
# Set up PySpark on Google Colab.
# The triple-quoted string below is an inert string literal: it keeps the
# one-time JDK install and Spark download commands for reference without
# running them.
'''
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.5/spark-2.4.5-bin-hadoop2.7.tgz
'''
# Mount Google Drive, where a copy of the Spark archive is stored.
from google.colab import drive
drive.mount('/content/gdrive')

# Copy the Spark archive from Drive into the current working directory.
!cp '/content/gdrive/My Drive/pyspark/spark-2.4.5-bin-hadoop2.7.tgz' .

# Unpack Spark and install findspark.
!tar xf spark-2.4.5-bin-hadoop2.7.tgz
!pip install -q findspark

# Set the Java home and Spark home directories for Google Colab.
# These must be set before findspark.init() is called below.
# NOTE(review): the JDK install command above is disabled, so this assumes
# Java 8 is already present at this path -- confirm.
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.5-bin-hadoop2.7"

# Locate the Spark installation, then start a local Spark session.
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()

from pyspark import SparkConf, SparkContext
from pyspark import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import *

# Create the Spark context and the SQL context used to read the CSV files.
# NOTE(review): a SparkSession was already created above, so getOrCreate()
# presumably returns that session's existing context and the app name set
# in `conf` may not take effect -- confirm.
conf = SparkConf().setAppName('sql_dataframe')
sc = SparkContext.getOrCreate(conf = conf)
sqlcontext = SQLContext(sc)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## 2. Reading data

In [2]:
# Load ratings.csv from the mounted drive into a dataframe and print the
# schema inferred from the file.
url = '/content/gdrive/My Drive/pyspark/recommendation/ratings.csv'
df_ratings = sqlcontext.read.csv(url, header=True, inferSchema=True)
df_ratings.printSchema()

# Keep only the three columns used to build the rating matrix.
df_ratings = df_ratings.select('userID', 'movieID', 'rating')
df_ratings.show(3)

# Summary of ratings.csv: number of ratings, distinct movies and distinct
# users. These are the dimensions of the matrix model.
num_ratings = df_ratings.count()
print('total number of ratings: ', num_ratings)
num_movies = df_ratings.select('movieId').distinct().count()
print('total number of movies rated: ', num_movies)
num_users = df_ratings.select('userId').distinct().count()
print('total number of users rated: ', num_users)

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+------+
|userID|movieID|rating|
+------+-------+------+
|     1|      1|   4.0|
|     1|      3|   4.0|
|     1|      6|   4.0|
+------+-------+------+
only showing top 3 rows

total number of ratings:  100836
total number of movies rated:  9724
total number of users rated:  610


In [3]:
def _load_and_preview(path):
    """Read a CSV file with a header row into a dataframe, print its
    inferred schema and its first three rows, and return the dataframe."""
    frame = sqlcontext.read.csv(path, inferSchema=True, header=True)
    frame.printSchema()
    frame.show(3)
    return frame

# movies.csv from the mounted drive
url = '/content/gdrive/My Drive/pyspark/recommendation/movies.csv'
df_movies = _load_and_preview(url)

# tags.csv from the mounted drive
url = '/content/gdrive/My Drive/pyspark/recommendation/tags.csv'
df_tags = _load_and_preview(url)

# links.csv from the mounted drive
url = '/content/gdrive/My Drive/pyspark/recommendation/links.csv'
df_links = _load_and_preview(url)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+-------+---------------+----------+
|userId|movieId|            tag| timestamp|
+------+-------+---------------+----------+
|     2|  60756|          funny|1445714994|
|     2|  60756|Highly quotable|1445714996|
|     2|  60756|   will ferrell|1445714992|
+------+-------+---------------+----------+
only showing top 3 r

## 3. Making predictions

In [30]:
# Split the ratings into a training set (80%) and a test set (20%) with a
# fixed seed, then print the sizes to check that no row was lost.
data_train, data_test = df_ratings.randomSplit([0.8, 0.2], seed = 12345)
num_train, num_test = data_train.count(), data_test.count()
print('number of data length: ', df_ratings.count())
print('number of training data: ', num_train)
print('number of test data: ', num_test)
print('num_train + num_test = ', num_train + num_test)

# ALS() is used below with its default column names ('user', 'item',
# 'rating'), so rename the id columns of the training data to match.
data_train = (data_train.withColumnRenamed('userId', 'user')
                        .withColumnRenamed('movieId', 'item'))

# Build the model input for the test set: drop the rating column and apply
# the same renaming. `data_test` itself is left untouched because it is
# joined back to the predictions later.
data_test_removed = data_test.select(['userId','movieId'])
data_test_removed = (data_test_removed.withColumnRenamed('userId', 'user')
                                      .withColumnRenamed('movieId', 'item'))
data_test_removed.show(3)

# Train the ALS (alternating least squares) model, which factorises the
# large sparse rating matrix into the product of two lower-rank matrices.
# An explicit seed is passed so that the fitted model, and therefore the
# predictions, are repeatable across kernel restarts.
from pyspark.ml.recommendation import ALS
model_als = ALS(rank = 20, maxIter = 20, coldStartStrategy = "drop", seed = 12345)
model_als_fit = model_als.fit(data_train)

# coldStartStrategy="drop" removes rows whose prediction would be nan, so
# predicted_test can contain fewer rows than data_test_removed.
predicted_test = model_als_fit.transform(data_test_removed)
print('total line of predicted_test (ALS model automatically removed those predictions is null/nan): ', predicted_test.count())

number of data length:  100836
number of training data:  80487
number of test data:  20349
num_train + num_test =  100836
+----+----+
|user|item|
+----+----+
|   1|  50|
|   1| 110|
|   1| 151|
+----+----+
only showing top 3 rows

total line of predicted_test (ALS model automatically removed those predictions is null/nan):  19536


In [28]:
# Attach each prediction to its actual rating: left-join the test set
# (which holds the actual ratings) with the predictions on the
# (user, movie) pair.
same_user = data_test['userID'] == predicted_test['user']
same_movie = data_test['movieID'] == predicted_test['item']
data_test_actual_predicted = data_test.join(predicted_test, same_user & same_movie, how = 'left')

# The joined rows do not follow the order of the original dataframe.
data_test_actual_predicted.show(3)

# Show the original test set for comparison, then sort the joined result
# by user and movie and drop the duplicated key columns so that it looks
# like the original dataframe with one extra `prediction` column.
data_test.show(3)
data_test_actual_predicted = (data_test_actual_predicted
                              .orderBy('userID', 'movieID', ascending = True)
                              .drop('user', 'item'))
data_test_actual_predicted.show(3)
                                                        

+------+-------+------+----+----+----------+
|userID|movieID|rating|user|item|prediction|
+------+-------+------+----+----+----------+
|     1|   1208|   4.0|   1|1208| 4.9291854|
|     9|   5481|   5.0|   9|5481| 2.6461997|
|    42|    434|   4.0|  42| 434| 2.9978712|
+------+-------+------+----+----+----------+
only showing top 3 rows

+------+-------+------+
|userID|movieID|rating|
+------+-------+------+
|     1|     50|   5.0|
|     1|    110|   4.0|
|     1|    151|   5.0|
+------+-------+------+
only showing top 3 rows

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|     1|     50|   5.0| 4.7725115|
|     1|    110|   4.0| 4.6414976|
|     1|    151|   5.0| 3.6864564|
+------+-------+------+----------+
only showing top 3 rows



## 4. Some evaluations

In [0]:
# Count the test rows whose prediction is unusable: either outside the
# rating scale (not between 0 and 5) or missing (null/nan).
# `number_fasle_positive` holds the rows that DO have an in-range
# prediction; the reported figure is the total minus that count.
in_range = data_test_actual_predicted['prediction'].between(0, 5)
number_fasle_positive = data_test_actual_predicted.where(in_range)
num_total = data_test_actual_predicted.count()
num_in_range = number_fasle_positive.count()
print('number of false positive and null or nan in the recommendation: ', num_total - num_in_range)

number of false positive and null or nan in the recommendation:  852


In [0]:
# Evaluate the model by hand with the root mean square error (RMSE) and
# the mean absolute error (MAE) of the predictions against actual ratings.
from pyspark.sql.functions import when, count, isnan, isnull

# Before evaluating, check how many nan values and how many null values
# exist in each column of the joined actual/predicted dataframe.
data_test_actual_predicted.show(3)
data_test_actual_predicted.printSchema()
data_test_actual_predicted.select([count(when(isnan(c), 1)).alias(c) for c in data_test_actual_predicted.columns]).show()
data_test_actual_predicted.select([count(when(isnull(c), 1)).alias(c) for c in data_test_actual_predicted.columns]).show()

# Remove rows with null/nan so that only rows with a prediction are scored.
df_evaluate = data_test_actual_predicted.dropna()
num_nonna = df_evaluate.count()
print('number of lines without null: ', num_nonna)

# Root Mean Square Error
# Despite its name, the 'rmse' column holds the per-row squared error; the
# RMSE itself is the square root of the mean of that column.
import math
df_evaluate = df_evaluate.withColumn('rmse', ((df_evaluate.rating
                                              -df_evaluate.prediction)**2))
df_evaluate.show(3)
val = df_evaluate.agg({'rmse': 'sum'}).collect()[0][0]
rmse = math.sqrt(val/num_nonna)
print('root mean square error of recommendation system is: ', rmse)

# Mean Absolute Error
# Note: this import shadows Python's built-in abs() from here on.
from pyspark.sql.functions import abs
df_evaluate = df_evaluate.withColumn('mae', abs(df_evaluate.rating
                                               -df_evaluate.prediction))

# The MAE must be derived from the sum of absolute errors, not from `val`,
# which is the sum of squared errors used for the RMSE above.
abs_err_sum = df_evaluate.agg({'mae': 'sum'}).collect()[0][0]
mae = abs_err_sum/num_nonna
print('mean absolute error of recommendation system is: ', mae)
df_evaluate.printSchema()

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|     1|     50|   5.0|  4.599513|
|     1|    110|   4.0| 4.6269245|
|     1|    151|   5.0| 3.5044563|
+------+-------+------+----------+
only showing top 3 rows

root
 |-- userID: integer (nullable = true)
 |-- movieID: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- prediction: float (nullable = true)

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|     0|      0|     0|         0|
+------+-------+------+----------+

+------+-------+------+----------+
|userID|movieID|rating|prediction|
+------+-------+------+----------+
|     0|      0|     0|       813|
+------+-------+------+----------+

number of lines without null:  19536
+------+-------+------+----------+-------------------+
|userID|movieID|rating|prediction|               rmse|
+------+-------+------+----------+-------------------+
|     1|    

In [0]:
# Cross-check: compute the RMSE again with Spark's built-in
# RegressionEvaluator instead of by hand.
from pyspark.ml.evaluation import RegressionEvaluator
evaluator = RegressionEvaluator(
    predictionCol='prediction',
    labelCol='rating',
    metricName='rmse',
)
rmse = evaluator.evaluate(df_evaluate)
print("Root-mean-square error: ",rmse)

Root-mean-square error:  0.8823978310311149
