In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [None]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('BooksRating-SparkSQL')
    .getOrCreate()
)

In [None]:
# Load the semicolon-delimited ratings file; the first line holds the column
# names and Spark infers each column's type from the data.
BooksRating = (
    spark.read
    .options(sep=';', header=True, inferSchema=True)
    .csv('Book-Ratings.csv')
)

In [None]:
# Check the column names and the types that schema inference produced.
BooksRating.printSchema()

root
 |-- userid: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- rate: integer (nullable = true)



In [None]:
# Preview the first five rating rows.
BooksRating.show(n=5)

+------+----------+----+
|userid|      isbn|rate|
+------+----------+----+
|276725|034545104X|   0|
|276726|0155061224|   5|
|276727|0446520802|   0|
|276729|052165615X|   3|
|276729|0521795028|   6|
+------+----------+----+
only showing top 5 rows



Average rating per book

In [None]:
# One row per isbn holding the mean of all `rate` values recorded for that book.
books_rating_avg = (
    BooksRating
    .groupBy('isbn')
    .agg(avg('rate').alias('avg_rate'))
)

(Target columns: username; book title; the user's rating; the book's average rating)

In [None]:
# Preview the first five per-book averages.
books_rating_avg.show(n=5)

+----------+------------------+
|      isbn|          avg_rate|
+----------+------------------+
|2080674722|3.6666666666666665|
|3499134004|               0.0|
|3548603203|3.4166666666666665|
|880781112X| 4.333333333333333|
|0738205737|1.6666666666666667|
+----------+------------------+
only showing top 5 rows



In [None]:
# Load the user table; its key column is renamed to `userid` so it matches the
# column name used in the ratings table.
users = (
    spark.read
    .options(sep=';', header=True, inferSchema=True)
    .csv('Users.csv')
    .withColumnRenamed('UserID', 'userid')
)

# Load the book catalogue with the same reader settings.
books = (
    spark.read
    .options(sep=';', header=True, inferSchema=True)
    .csv('Books.csv')
)

In [None]:
# Preview the first five users.
users.show(n=5)

+------+-----------+--------------------+----+
|userid|   USERNAME|            Location| Age|
+------+-----------+--------------------+----+
|     1|bzsufoRTLN2|  nyc, new york, usa|NULL|
|     2|fq7kfHg4VEI|stockton, califor...|  18|
|     3|W0Hbkd3xR8v|moscow, yukon ter...|NULL|
|     4|W51GahAx5Ap|porto, v.n.gaia, ...|  17|
|     5|VKN3PQ18GgN|farnborough, hant...|NULL|
+------+-----------+--------------------+----+
only showing top 5 rows



In [None]:
# Preview only the first five books, like the other previews in this notebook;
# a bare show() prints 20 rows of this wide, eight-column table.
books.show(5)

+----------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|      ISBN|           BookTitle|          BookAuthor|YearOfPublication|           Publisher|           ImageURLS|           ImageURLM|           ImageURLL|
+----------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|             2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|             2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|             1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari Kolata|    

In [94]:
# Attach the username, the book title and the per-book average to every rating,
# then preview the four columns of interest.
(
    BooksRating
    .join(users, on=['userid'])
    .join(books, on=['isbn'])
    .join(books_rating_avg, on=['isbn'])
    .select('USERNAME', 'BookTitle', 'rate', 'avg_rate')
    .show(5)
)

+-----------+--------------------+----+--------+
|   USERNAME|           BookTitle|rate|avg_rate|
+-----------+--------------------+----+--------+
|6chdqlR3DC7|The Way Things Wo...|   8|     8.0|
|px70uymJ7k6|     Mog's Christmas|   0|     0.0|
|mjteD2ip2Lj|     Mog's Christmas|   0|     0.0|
|cHwJip4Kj4k|                Liar|   9|     9.0|
|6VUiynjA3tV|The Prime of Miss...|   0|     0.0|
+-----------+--------------------+----+--------+
only showing top 5 rows



## Single SQL query
(Target columns: username; book title; the user's rating; the book's average rating)

In [90]:
# Register each DataFrame as a temporary view so that it can be referenced by
# name from spark.sql queries.
for view_name, frame in {
    'BooksRating': BooksRating,
    'users': users,
    'books': books,
    'books_rating_avg': books_rating_avg,
}.items():
    frame.createOrReplaceTempView(view_name)

In [92]:
# SQL equivalent of the DataFrame join: each rating is matched with its user,
# its book and the book's precomputed average, limited to five rows.
enriched_ratings_query = """
SELECT u.USERNAME, b.BookTitle, br.rate, bra.avg_rate
FROM BooksRating br
JOIN users u ON br.userid = u.userid
JOIN books b ON br.isbn = b.isbn
JOIN books_rating_avg bra ON br.isbn = bra.isbn
LIMIT 5
"""
spark.sql(enriched_ratings_query).show()

+-----------+--------------------+----+--------+
|   USERNAME|           BookTitle|rate|avg_rate|
+-----------+--------------------+----+--------+
|6chdqlR3DC7|The Way Things Wo...|   8|     8.0|
|px70uymJ7k6|     Mog's Christmas|   0|     0.0|
|mjteD2ip2Lj|     Mog's Christmas|   0|     0.0|
|cHwJip4Kj4k|                Liar|   9|     9.0|
|6VUiynjA3tV|The Prime of Miss...|   0|     0.0|
+-----------+--------------------+----+--------+

