### Objective

Understand the recommender system (ALS collaborative filtering) and the frequent-pattern (FP) mining models covered in this notebook.

In [1]:
#Starting with import of pyspark and related modules

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Build (or reuse) the Spark session, registering the PostgreSQL JDBC driver jar
sparkSQL = (
    SparkSession.builder
    .appName('Spark SQL')
    .config('spark.jars', "/usr/share/java/postgresql-42.2.26.jar")
    .getOrCreate()
)

22/11/29 15:10:33 WARN Utils: Your hostname, codeStation resolves to a loopback address: 127.0.1.1; using 192.168.224.83 instead (on interface wlo1)
22/11/29 15:10:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
22/11/29 15:10:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
# Data locations used by the notebook
mllibPath = "mllib/"  # relative folder the MovieLens sample is read from and the CSV is written to
externalData = "externalData/"  # NOTE(review): not referenced in the cells shown here
ytDE = "/home/solverbot/Desktop/ytDE/csvfiles"  # NOTE(review): absolute, machine-specific path — consider making it configurable

In [4]:
# Shorthand handles: the session's DataFrame reader and its underlying SparkContext
sparkread = sparkSQL.read
sparkcont = sparkSQL.sparkContext

In [6]:
# Read the MovieLens sample file as an RDD of raw text lines ("user::movie::rating")

movielens = sparkcont.textFile(mllibPath+"sample_movielens_data.txt")

In [7]:
# Peek at the first two raw lines of the file
for raw_line in movielens.take(2):
    print(raw_line)

[Stage 1:>                                                          (0 + 1) / 1]

0::2::3
0::3::1


                                                                                

In [11]:
# Split each "a::b::c" line on "::" and cast every field to int
movieTable = movielens.map(lambda line: list(map(int, line.split("::"))))

In [12]:
# Peek at the first two parsed records
for record in movieTable.take(2):
    print(record)

[0, 2, 3]
[0, 3, 1]


In [18]:
# Turn the parsed RDD into a DataFrame with named columns
moviedf = sparkSQL.createDataFrame(
    movieTable,
    schema=["user_id", "movie_id", "rating"],
)

In [19]:
# Persist the ratings as CSV, replacing any previous output at that path
moviedf.write.mode('overwrite').csv(mllibPath + "movielens.csv")

In [20]:
# Display the first two rows to confirm the column names and values
moviedf.show(2)

+-------+--------+------+
|user_id|movie_id|rating|
+-------+--------+------+
|      0|       2|     3|
|      0|       3|     1|
+-------+--------+------+
only showing top 2 rows



In [21]:
# Summary statistics (count, mean, stddev, min, max) for every column
moviedf.describe().show()

+-------+------------------+------------------+------------------+
|summary|           user_id|          movie_id|            rating|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean|14.383744170552964| 49.40572951365756|1.7741505662891406|
| stddev| 8.591040424293267|28.937034065089016|1.1872761661248032|
|    min|                 0|                 0|                 1|
|    max|                29|                99|                 5|
+-------+------------------+------------------+------------------+



In [22]:
# Hold out roughly 20% of the ratings for evaluation. A fixed seed keeps the
# split (and every metric derived from it) reproducible across kernel restarts.
train, test = moviedf.randomSplit([0.8, 0.2], seed=42)

In [25]:
from pyspark.ml.recommendation import ALS
# Print the ALS class documentation (constructor parameters and their defaults)
help(ALS)

Help on class ALS in module pyspark.ml.recommendation:

class ALS(pyspark.ml.wrapper.JavaEstimator, _ALSParams, pyspark.ml.util.JavaMLWritable, pyspark.ml.util.JavaMLReadable)
 |  ALS(*, rank: int = 10, maxIter: int = 10, regParam: float = 0.1, numUserBlocks: int = 10, numItemBlocks: int = 10, implicitPrefs: bool = False, alpha: float = 1.0, userCol: str = 'user', itemCol: str = 'item', seed: Optional[int] = None, ratingCol: str = 'rating', nonnegative: bool = False, checkpointInterval: int = 10, intermediateStorageLevel: str = 'MEMORY_AND_DISK', finalStorageLevel: str = 'MEMORY_AND_DISK', coldStartStrategy: str = 'nan', blockSize: int = 4096)
 |  
 |  Alternating Least Squares (ALS) matrix factorization.
 |  
 |  ALS attempts to estimate the ratings matrix `R` as the product of
 |  two lower-rank matrices, `X` and `Y`, i.e. `X * Yt = R`. Typically
 |  these approximations are called 'factor' matrices. The general
 |  approach is iterative. During each iteration, one of the factor
 |  

In [29]:
# Configure the ALS matrix-factorisation recommender on the explicit ratings.
# - seed: fixes the random initialisation so reruns produce the same model.
# - coldStartStrategy='drop': with the default ('nan'), a test row whose user or
#   movie was absent from the training split gets a NaN prediction, which makes
#   the RMSE NaN; dropping those rows keeps the evaluation metric valid.
als = ALS(maxIter=15, regParam=0.01, userCol='user_id',
          itemCol='movie_id', ratingCol='rating',
          seed=42, coldStartStrategy='drop')

In [30]:
# Fit the ALS model on the training split
model = als.fit(train)
# Score the held-out rows; transform() appends a "prediction" column
prediction = model.transform(test)
# Compare a couple of actual ratings with their predicted values
prediction.select("rating","prediction").show(2)

+------+----------+
|rating|prediction|
+------+----------+
|     3|-0.8297616|
|     1| 0.8943969|
+------+----------+
only showing top 2 rows



In [32]:
from pyspark.ml.evaluation import RegressionEvaluator

# Measure root-mean-squared error between actual and predicted ratings
evaluator = RegressionEvaluator(
    labelCol='rating',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(prediction)
print('RMSE:', rmse)

RMSE: 1.6637695440319358


In [34]:
# Held-out (user, movie) pairs for user 12 only
this_user = (
    test
    .where(test.user_id == 12)
    .select('user_id', 'movie_id')
)
this_user.show()

+-------+--------+
|user_id|movie_id|
+-------+--------+
|     12|       2|
|     12|       7|
|     12|       8|
|     12|      14|
|     12|      22|
|     12|      23|
|     12|      24|
|     12|      25|
|     12|      30|
|     12|      31|
|     12|      41|
|     12|      53|
|     12|      60|
|     12|      63|
|     12|      72|
|     12|      77|
|     12|      91|
|     12|      95|
+-------+--------+



In [35]:
# Predict this user's rating for each of their held-out movies
recommendation_this_user = model.transform(this_user)
recommendation_this_user.show()

+-------+--------+-----------+
|user_id|movie_id| prediction|
+-------+--------+-----------+
|     12|      31|  1.6109191|
|     12|      53|  1.9744065|
|     12|      91|   3.446911|
|     12|      22|  0.7696948|
|     12|      41|  1.9108294|
|     12|      72|  4.3716397|
|     12|       8| -2.3297799|
|     12|      23|  4.7609572|
|     12|       7|  2.0911324|
|     12|      63|  1.5779959|
|     12|      77|-0.29248747|
|     12|      25|  3.4542992|
|     12|      24| 0.73902446|
|     12|      95| 0.16467327|
|     12|      60|  1.5980449|
|     12|      14|  2.5773976|
|     12|       2| -1.9526407|
|     12|      30|  3.4185607|
+-------+--------+-----------+



In [36]:
# Rank the user's movies from highest to lowest predicted rating
recommendation_this_user.orderBy(
    recommendation_this_user["prediction"].desc()
).show()

+-------+--------+-----------+
|user_id|movie_id| prediction|
+-------+--------+-----------+
|     12|      23|  4.7609572|
|     12|      72|  4.3716397|
|     12|      25|  3.4542992|
|     12|      91|   3.446911|
|     12|      30|  3.4185607|
|     12|      14|  2.5773976|
|     12|       7|  2.0911324|
|     12|      53|  1.9744065|
|     12|      41|  1.9108294|
|     12|      31|  1.6109191|
|     12|      60|  1.5980449|
|     12|      63|  1.5779959|
|     12|      22|  0.7696948|
|     12|      24| 0.73902446|
|     12|      95| 0.16467327|
|     12|      77|-0.29248747|
|     12|       2| -1.9526407|
|     12|       8| -2.3297799|
+-------+--------+-----------+

