In [1]:
# Create (or reuse) the Spark context and SQL entry point for this notebook.
import os

import pyspark

conf = pyspark.SparkConf()

# The Spark UI proxy path is built from the JupyterHub user name. Outside
# JupyterHub the variable is absent, so the setting is skipped instead of
# failing the whole cell with a KeyError.
jupyterhub_user = os.environ.get('JUPYTERHUB_USER')
if jupyterhub_user:
    conf.set('spark.ui.proxyBase', '/user/' + jupyterhub_user + '/proxy/4041')

# Keep DataFrame reprs lazy: a bare DataFrame expression shows its schema only.
conf.set('spark.sql.repl.eagerEval.enabled', False)

# getOrCreate() keeps this cell re-runnable: constructing a second
# SparkContext in the same kernel raises ValueError.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of
# SparkSession.builder.getOrCreate(); kept here so `spark` stays the same type
# for any later cell that relies on SQLContext-only methods.
spark = pyspark.SQLContext.getOrCreate(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/20 21:31:18 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Location of the de-duplicated Goodreads reviews dump.
REVIEWS_PATH = "shared/finals/swarali/goodreads_reviews_dedup.json"

# Explicit schema, column-for-column the same as the one Spark infers for this
# file. Supplying it up front avoids the extra pass over the JSON that schema
# inference needs before any query can run.
REVIEWS_SCHEMA = (
    "book_id STRING, date_added STRING, date_updated STRING, "
    "n_comments BIGINT, n_votes BIGINT, rating BIGINT, "
    "read_at STRING, review_id STRING, review_text STRING, "
    "started_at STRING, user_id STRING"
)

goodreads_reviews = spark.read.format("json").schema(REVIEWS_SCHEMA).load(REVIEWS_PATH)

                                                                                

In [3]:
# Print the column names, types and nullability of the reviews DataFrame.
goodreads_reviews.printSchema()

root
 |-- book_id: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- n_comments: long (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- rating: long (nullable = true)
 |-- read_at: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- review_text: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- user_id: string (nullable = true)



In [4]:
# Number of review rows kept for the exploratory analysis in the cells below.
REVIEW_SAMPLE_SIZE = 10000

# NOTE: limit() without a preceding ordering keeps whichever rows Spark reads
# first, so this is a convenience subset rather than a reproducible sample.
goodreads_reviews = goodreads_reviews.limit(REVIEW_SAMPLE_SIZE)

In [5]:
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql.functions import udf
from textblob import TextBlob


# 6. Sentiment Analysis
def _review_polarity(text):
    """Return the TextBlob sentiment polarity of a review as a float.

    Args:
        text: The review text, or None when the review has no text.

    Returns:
        The polarity score computed by TextBlob, or None when ``text`` is None
        (TextBlob itself rejects non-string input).
    """
    if text is None:
        return None
    return float(TextBlob(text).sentiment.polarity)


# Declared as DoubleType so the column is numeric; the previous StringType
# declaration stored the score as text and relied on an implicit cast whenever
# the column was averaged.
sentiment_udf = udf(_review_polarity, DoubleType())

In [6]:
# Score every review, then average the score within each star rating.
goodreads_reviews = goodreads_reviews.withColumn(
    "sentiment",
    sentiment_udf("review_text"),
)

sentiment_analysis = (
    goodreads_reviews
    .groupBy("rating")
    .agg({"sentiment": "avg"})
    .sort("rating")
)
sentiment_analysis.show()

                                                                                

+------+--------------------+
|rating|      avg(sentiment)|
+------+--------------------+
|     0|  0.1689616605839417|
|     1|-0.04232273559161615|
|     2| 0.04576395723714733|
|     3|  0.1279824838441761|
|     4| 0.19277641212956653|
|     5|  0.2537273330754035|
+------+--------------------+



In [7]:
from pyspark.sql.functions import length, udf
from pyspark.sql.types import IntegerType

# Kept so any later cell that references review_length_udf still works; it now
# returns null for a null review instead of raising TypeError from len(None).
review_length_udf = udf(
    lambda text: len(text) if text is not None else None,
    IntegerType(),
)

# The built-in length() computes the character count inside the JVM, avoiding
# a Python UDF round trip per row, and yields null for null input.
goodreads_reviews = goodreads_reviews.withColumn("review_length", length("review_text"))

# Average review length (in characters) for each star rating.
sentiment_by_length = goodreads_reviews.groupBy("rating").agg({"review_length": "avg"}).orderBy("rating")
sentiment_by_length.show()



+------+------------------+
|rating|avg(review_length)|
+------+------------------+
|     0|   463.11227154047|
|     1| 577.0843373493976|
|     2| 585.0147058823529|
|     3| 518.5528775209051|
|     4| 742.3339800443459|
|     5| 760.4680013127667|
+------+------------------+



                                                                                

In [8]:
# Average number of votes received by reviews at each star rating.
sentiment_by_votes = (
    goodreads_reviews
    .groupBy("rating")
    .agg({"n_votes": "avg"})
    .sort("rating")
)

# Average number of comments received by reviews at each star rating.
sentiment_by_comments = (
    goodreads_reviews
    .groupBy("rating")
    .agg({"n_comments": "avg"})
    .sort("rating")
)

# Votes table first, then comments table.
sentiment_by_votes.show()
sentiment_by_comments.show()

                                                                                

+------+-------------------+
|rating|       avg(n_votes)|
+------+-------------------+
|     0|  0.720626631853786|
|     1| 0.5421686746987951|
|     2| 0.5632352941176471|
|     3|0.41859321200196753|
|     4| 1.3586474501108647|
|     5|   2.21168362323597|
+------+-------------------+





+------+-------------------+
|rating|    avg(n_comments)|
+------+-------------------+
|     0| 0.4177545691906005|
|     1| 0.3453815261044177|
|     2|0.23529411764705882|
|     3|0.11018199704869651|
|     4|0.20953436807095344|
|     5|  0.373153921890384|
+------+-------------------+



                                                                                

In [9]:
# Load the Goodreads book metadata; the schema is inferred from the JSON file.
goodreads_books = spark.read.json(
    "shared/finals/swarali/goodreads_books.json"
)

                                                                                

In [12]:
# Bare expression: eager evaluation is disabled in the Spark config, so this
# displays only the DataFrame's column/type summary and does not fetch rows.
goodreads_books

DataFrame[asin: string, authors: array<struct<author_id:string,role:string>>, average_rating: string, book_id: string, country_code: string, description: string, edition_information: string, format: string, image_url: string, is_ebook: string, isbn: string, isbn13: string, kindle_asin: string, language_code: string, link: string, num_pages: string, popular_shelves: array<struct<count:string,name:string>>, publication_day: string, publication_month: string, publication_year: string, publisher: string, ratings_count: string, series: array<string>, similar_books: array<string>, text_reviews_count: string, title: string, title_without_series: string, url: string, work_id: string]