In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import *
from surprise import Dataset
from surprise.model_selection import cross_validate

<h1> Getting Dataset </h1>

In [3]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() reuses a session
# that already exists and only builds a new one when there is none.
spark = (
    SparkSession.builder
    .appName('Recommendation_system')
    .getOrCreate()
)

In [6]:
# Read only columns 0, 1 and 5 of the review file and give them short names.
# header=0 tells pandas that the first row is the file's own header, so it is
# replaced by `names` instead of being parsed as a data row.
ratings_data = pd.read_csv(
    "Amazon Book Reviews.csv",
    header=0,
    usecols=[0, 1, 5],
    names=["user_id", "book_id", "rating"],
)

In [7]:
# Preview the first five rows to sanity-check the column selection and names.
ratings_data.head(n=5)

Unnamed: 0,user_id,book_id,rating
0,A15Q7ABIU9O9YZ,60554800,3
1,AUIJDXNYVTEA8,60554800,4
2,A20N5GOON55TE9,60554800,5
3,A1CT8ENDZSYTX3,60554800,3
4,A2SI6BNK5SWSMD,60554800,3


In [12]:
# Encode the string user/book IDs as integer category codes; these codes are
# the values fed to ALS as `newUserId` / `newBookId` further down.
user_categories = ratings_data["user_id"].astype("category")
book_categories = ratings_data["book_id"].astype("category")

# Keep code -> original ID lookups. The original ID columns are dropped below,
# so without these the recommendations could not be mapped back to real users
# and books. Position i of each Series holds the ID whose category code is i.
user_id_lookup = pd.Series(user_categories.cat.categories, name="user_id")
book_id_lookup = pd.Series(book_categories.cat.categories, name="book_id")

# Build the encoded frame in one chain instead of mutating with inplace=True;
# resulting columns and order are unchanged: rating, newUserId, newBookId.
ratings_data = (
    ratings_data
    .assign(
        newUserId=user_categories.cat.codes,
        newBookId=book_categories.cat.codes,
    )
    .drop(columns=["user_id", "book_id"])
)
ratings_data.head()

Unnamed: 0,rating,newUserId,newBookId
0,3,243,19
1,4,5467,19
2,5,1549,19
3,3,545,19
4,3,2719,19


<h1> Splitting test and train dataset </h1>

In [21]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import SQLContext
from pyspark import SparkContext

# `sc` and `sqlContext` are kept bound so any later cell that references them
# still works. NOTE: SQLContext is deprecated since Spark 3.0; it is no longer
# used to build the DataFrame here.
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

# Convert the pandas ratings frame to a Spark DataFrame through the notebook's
# existing SparkSession (`spark`), the supported entry point for DataFrames.
spark_ratings_df = spark.createDataFrame(ratings_data)

In [26]:
# 80/20 random split of the ratings into training and test sets.
# A fixed seed keeps the split (and therefore the reported row counts and RMSE)
# stable across kernel restarts; without it every run samples a new split.
training, test = spark_ratings_df.randomSplit([0.8, 0.2], seed=42)

In [28]:
# Print the number of ratings in each split: training first, then test.
for split_df in (training, test):
    print(split_df.count())

6023
1413


<h1> ALS </h1>

In [31]:
# Configure an ALS collaborative-filtering estimator on the encoded ID columns.
#   seed=42                  - fixes the random initialisation of the factors so
#                              re-running the notebook yields the same model.
#   coldStartStrategy="drop" - rows whose user or book was not seen during
#                              fitting are dropped from predictions instead of
#                              producing NaN, which would make the RMSE NaN.
#   nonnegative=True         - constrain the learned factors to be non-negative.
als = ALS(
    userCol="newUserId",
    itemCol="newBookId",
    ratingCol="rating",
    rank=25,
    maxIter=5,
    regParam=0.09,
    nonnegative=True,
    coldStartStrategy="drop",
    seed=42,
)

# Fit the factorisation on the training split only.
model = als.fit(training)

In [33]:
# Score the held-out test ratings with the fitted model.
predictions = model.transform(test)

# RMSE between the true `rating` column and the model's `prediction` column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

print(rmse)
predictions.show()

1.6502403716193452
+------+---------+---------+----------+
|rating|newUserId|newBookId|prediction|
+------+---------+---------+----------+
|     5|     3757|      148|  3.884962|
|     5|     5036|      148|  5.206557|
|     5|     3016|      148|  3.884962|
|     4|     3853|      148| 3.0492332|
|     5|      984|       31| 3.8800292|
|     5|     4144|       31| 1.8428248|
|     5|     2314|       31| 3.8800292|
|     5|      695|      137| 3.4254065|
|     4|     1984|      137|  4.144337|
|     3|     2650|       65| 3.6743555|
|     3|     3557|       65| 3.2472925|
|     5|     5485|       65| 3.8945029|
|     4|     4927|       65| 3.8945029|
|     4|     5257|       65| 1.9159155|
|     5|     2770|       65| 3.1559439|
|     5|      761|       65| 2.3367019|
|     5|     4736|       65| 3.8945029|
|     4|      877|       65| 4.3931613|
|     5|      524|       65| 3.8945029|
|     4|     2358|       65| 4.4275594|
+------+---------+---------+----------+
only showing top 20 rows

In [34]:
# Top-20 book recommendations for every user.
# Store the DataFrame first: .show() only prints and returns None, so chaining
# it onto the assignment would leave `user_recs` set to None.
user_recs = model.recommendForAllUsers(20)
user_recs.show(10)

+---------+--------------------+
|newUserId|     recommendations|
+---------+--------------------+
|     1580|[[39, 5.787728], ...|
|     4900|[[124, 6.273851],...|
|      471|[[30, 4.95764], [...|
|     1591|[[32, 6.0767436],...|
|     4101|[[140, 7.2464395]...|
|     1342|[[53, 6.463038], ...|
|     2122|[[24, 9.975376], ...|
|      833|[[32, 8.046651], ...|
|     1645|[[26, 4.8069386],...|
|     3175|[[39, 5.0466824],...|
+---------+--------------------+
only showing top 10 rows

