In [3]:
from pyspark.sql import SparkSession
from pyspark.sql import functions

import warnings
warnings.filterwarnings("ignore")

In [4]:

# Create the Spark session used by every cell below, or reuse the one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("BooksReviewSession")
    .getOrCreate()
)

In [14]:
# Ratings file: semicolon-delimited, first row is the header, and the column
# types are inferred by Spark while reading.
dataset_rating = spark.read.csv(
    '../../../data/book_reviews/book_rating.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

                                                                                

In [35]:
# Book catalogue: same layout as the ratings file (semicolon-delimited, header
# row present, column types inferred by Spark).
dataset_books = spark.read.csv(
    '../../../data/book_reviews/books.csv',
    sep=';',
    header=True,
    inferSchema=True,
)

In [17]:
# Print the column names and inferred types of the ratings DataFrame.
# NOTE(review): the recorded output lists UserID / MovieID / Rating, i.e. the
# names assigned by the rename cell further down, so this cell appears to have
# been executed after that rename. On a fresh top-to-bottom run it would print
# the raw CSV headers (User-ID, ISBN, Book-Rating) instead.
dataset_rating.printSchema()

root
 |-- UserID: integer (nullable = true)
 |-- MovieID: string (nullable = true)
 |-- Rating: integer (nullable = true)



In [15]:
# Preview the first 20 ratings with full (untruncated) cell contents.
dataset_rating.show(n=20, truncate=False)

+-------+----------+-----------+
|User-ID|ISBN      |Book-Rating|
+-------+----------+-----------+
|276725 |034545104X|0          |
|276726 |0155061224|5          |
|276727 |0446520802|0          |
|276729 |052165615X|3          |
|276729 |0521795028|6          |
|276733 |2080674722|0          |
|276736 |3257224281|8          |
|276737 |0600570967|6          |
|276744 |038550120X|7          |
|276745 |342310538 |10         |
|276746 |0425115801|0          |
|276746 |0449006522|0          |
|276746 |0553561618|0          |
|276746 |055356451X|0          |
|276746 |0786013990|0          |
|276746 |0786014512|0          |
|276747 |0060517794|9          |
|276747 |0451192001|0          |
|276747 |0609801279|0          |
|276747 |0671537458|9          |
+-------+----------+-----------+
only showing top 20 rows



In [16]:
# Replace the raw CSV headers with identifier-friendly names.
#
# The source names are spelled exactly as they appear in the file
# ('User-ID', not 'USER-ID'). withColumnRenamed is a silent no-op when the
# source column is not found, so a wrongly-cased name only works while Spark
# resolves columns case-insensitively (the default); with
# spark.sql.caseSensitive=true the rename would be skipped without any error.
#
# NOTE(review): the ISBN column is exposed as 'MovieID' although the data
# describes books. The name is kept because later cells reference it.
dataset_rating = (
    dataset_rating
    .withColumnRenamed('User-ID', 'UserID')
    .withColumnRenamed('ISBN', 'MovieID')
    .withColumnRenamed('Book-Rating', 'Rating')
)

dataset_rating.show()

+------+----------+------+
|UserID|   MovieID|Rating|
+------+----------+------+
|276725|034545104X|     0|
|276726|0155061224|     5|
|276727|0446520802|     0|
|276729|052165615X|     3|
|276729|0521795028|     6|
|276733|2080674722|     0|
|276736|3257224281|     8|
|276737|0600570967|     6|
|276744|038550120X|     7|
|276745| 342310538|    10|
|276746|0425115801|     0|
|276746|0449006522|     0|
|276746|0553561618|     0|
|276746|055356451X|     0|
|276746|0786013990|     0|
|276746|0786014512|     0|
|276747|0060517794|     9|
|276747|0451192001|     0|
|276747|0609801279|     0|
|276747|0671537458|     9|
+------+----------+------+
only showing top 20 rows



In [21]:
# Number of ratings submitted by each user, most active users first.
# The grouping key uses the exact column name 'UserID' (as set by the rename
# cell) so the query does not rely on case-insensitive column resolution.
(
    dataset_rating
    .groupby('UserID')
    .count()
    .sort('count', ascending=False)
    .show()
)



+------+-----+
|UserId|count|
+------+-----+
| 11676|13602|
|198711| 7550|
|153662| 6109|
| 98391| 5891|
| 35859| 5850|
|212898| 4785|
|278418| 4533|
| 76352| 3367|
|110973| 3100|
|235105| 3067|
|230522| 2991|
| 16795| 2948|
|234623| 2674|
| 36836| 2529|
| 52584| 2512|
|245963| 2507|
|204864| 2504|
| 55492| 2459|
|185233| 2448|
|171118| 2421|
+------+-----+
only showing top 20 rows



                                                                                

In [30]:
# Per-user rating profile: how many ratings each user gave, plus the minimum,
# mean and maximum rating value. Users with the most ratings come first.
(
    dataset_rating
    .groupby('UserID')
    .agg(
        functions.count('Rating').alias('RatingCount'),
        functions.min('Rating').alias('RatingMin'),
        functions.avg('Rating').alias('RatingMean'),
        functions.max('Rating').alias('RatingMax'),
    )
    .sort('RatingCount', ascending=False)
    .show()
)



+------+-----------+---------+--------------------+---------+
|UserID|RatingCount|RatingMin|          RatingMean|RatingMax|
+------+-----------+---------+--------------------+---------+
| 11676|      13602|        0|   4.564916923981768|       10|
|198711|       7550|        0| 0.01629139072847682|       10|
|153662|       6109|        0|   2.786871828449828|       10|
| 98391|       5891|        0|   8.728229502631132|       10|
| 35859|       5850|        0|  1.0758974358974358|       10|
|212898|       4785|        0|0.003134796238244514|        5|
|278418|       4533|        0| 0.15927641738363116|       10|
| 76352|       3367|        0| 0.14523314523314523|       10|
|110973|       3100|        0|   0.787741935483871|       10|
|235105|       3067|        0|   2.671992174763613|       10|
|230522|       2991|        0|  0.9117352056168505|       10|
| 16795|       2948|        0|  2.3348032564450474|       10|
|234623|       2674|        0| 0.13275991024682124|       10|
| 36836|

                                                                                

In [34]:
# Ratings above 8 given by user 11676, the user with the highest rating count
# in the per-user summary.
dataset_rating.filter(
    (dataset_rating['UserID'] == 11676) & (dataset_rating['Rating'] > 8)
).show()

+------+----------+------+
|UserID|   MovieID|Rating|
+------+----------+------+
| 11676|0000000000|     9|
| 11676|0001055666|     9|
| 11676|0002118580|     9|
| 11676|000225929X|    10|
| 11676|0006481213|    10|
| 11676|0006490344|    10|
| 11676|0006496946|     9|
| 11676|0006498493|    10|
| 11676|0006510906|     9|
| 11676|0006512046|    10|
| 11676|0006512208|     9|
| 11676|0006513204|     9|
| 11676|000655220X|     9|
| 11676|0007107900|     9|
| 11676|0020518501|     9|
| 11676|0028625757|    10|
| 11676|0060005424|    10|
| 11676|0060007788|     9|
| 11676|0060008369|     9|
| 11676|0060083263|    10|
+------+----------+------+
only showing top 20 rows



In [32]:
# Per-book rating profile (MovieID holds the ISBN): number of ratings plus the
# minimum, mean and maximum rating value. Most-rated books come first.
(
    dataset_rating
    .groupby('MovieID')
    .agg(
        functions.count('Rating').alias('RatingCount'),
        functions.min('Rating').alias('RatingMin'),
        functions.avg('Rating').alias('RatingMean'),
        functions.max('Rating').alias('RatingMax'),
    )
    .sort('RatingCount', ascending=False)
    .show()
)


                                                                                

+----------+-----------+---------+------------------+---------+
|   MovieID|RatingCount|RatingMin|        RatingMean|RatingMax|
+----------+-----------+---------+------------------+---------+
|0971880107|       2502|        0|1.0195843325339728|       10|
|0316666343|       1295|        0| 4.468725868725869|       10|
|0385504209|        883|        0| 4.652321630804077|       10|
|0060928336|        732|        0| 3.448087431693989|       10|
|0312195516|        723|        0| 4.334716459197787|       10|
|044023722X|        647|        0| 3.187017001545595|       10|
|0679781587|        639|        0| 4.381846635367762|       10|
|0142001740|        615|        0| 4.219512195121951|       10|
|067976402X|        614|        0| 3.255700325732899|       10|
|0671027360|        586|        0| 3.718430034129693|       10|
|0446672211|        585|        0| 4.105982905982906|       10|
|059035342X|        571|        0| 4.900175131348512|       10|
|0316601950|        568|        0|3.5933

In [37]:
# Preview the first 20 catalogue rows with full (untruncated) cell contents.
dataset_books.show(n=20, truncate=False)

+----------+--------------------------------------------------------------------------------------------------+--------------------+-------------------+---------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|ISBN      |Book-Title                                                                                        |Book-Author         |Year-Of-Publication|Publisher                  |Image-URL-S                                                 |Image-URL-M                                                 |Image-URL-L                                                 |
+----------+--------------------------------------------------------------------------------------------------+--------------------+-------------------+---------------------------+------------------------------------------------------------+-------------------------------

In [69]:
# Replace the raw CSV headers with shorter names and order the catalogue by
# publication year. 'ISBN' becomes 'MovieID' to match the key used in
# dataset_rating.
# NOTE(review): the sort is stored in dataset_books itself, so every later
# query starts from the Year-ordered frame. Confirm that is still required.
dataset_books = (
    dataset_books
    .withColumnRenamed('ISBN', 'MovieID')
    .withColumnRenamed('Book-Title', 'Title')
    .withColumnRenamed('Book-Author', 'Author')
    .withColumnRenamed('Year-Of-Publication', 'Year')
    .sort('Year')
)

dataset_books.show()

+----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|   MovieID|               Title|              Author|Year|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|0140288694|       The Shark Net|        Robert Drewe|   0|   Penguin Books Ltd|http://images.ama...|http://images.ama...|http://images.ama...|
|0752825259|     War of the Rats|       David Robbins|   0|               Orion|http://images.ama...|http://images.ama...|http://images.ama...|
|B00008WFXL|   The Da Vinci Code|           Dan Brown|   0|           Doubleday|http://images.ama...|http://images.ama...|http://images.ama...|
|9723605465|Nos Matamos O Cao...|Luis Bernard Honwana|   0|Edicoes Afrontamento|http://images.ama...|http://images.ama...|http://images.

                                                                                

In [39]:
# Number of catalogue entries per author, most prolific authors first.
(
    dataset_books
    .groupby('Author')
    .count()
    .sort('count', ascending=False)
    .show(truncate=False)
)



+---------------------+-----+
|Author               |count|
+---------------------+-----+
|Agatha Christie      |632  |
|William Shakespeare  |567  |
|Stephen King         |524  |
|Ann M. Martin        |423  |
|Carolyn Keene        |373  |
|Francine Pascal      |373  |
|Isaac Asimov         |330  |
|Nora Roberts         |315  |
|Barbara Cartland     |307  |
|Charles Dickens      |302  |
|Not Applicable (Na ) |286  |
|R. L. Stine          |282  |
|Mark Twain           |231  |
|Jane Austen          |223  |
|Terry Pratchett      |220  |
|Mary Higgins Clark   |218  |
|Piers Anthony        |217  |
|Marion Zimmer Bradley|216  |
|Janet Dailey         |214  |
|Franklin W. Dixon    |204  |
+---------------------+-----+
only showing top 20 rows



                                                                                

In [41]:
# Number of books per publication year, latest years first, to inspect the
# upper end of the Year range.
(
    dataset_books
    .groupby('Year')
    .count()
    .sort('Year', ascending=False)
    .show(truncate=False)
)

+----+-----+
|Year|count|
+----+-----+
|2050|2    |
|2038|1    |
|2037|1    |
|2030|7    |
|2026|1    |
|2024|1    |
|2021|1    |
|2020|3    |
|2012|1    |
|2011|2    |
|2010|2    |
|2008|1    |
|2006|3    |
|2005|46   |
|2004|5839 |
|2003|14359|
|2002|17628|
|2001|17360|
|2000|17235|
|1999|17432|
+----+-----+
only showing top 20 rows



In [50]:
# Number of books per publication year, earliest years first, to inspect the
# lower end of the Year range.
(
    dataset_books
    .groupby('Year')
    .count()
    .sort('Year', ascending=True)
    .show(truncate=False)
)



+----+-----+
|Year|count|
+----+-----+
|0   |4619 |
|1376|1    |
|1378|1    |
|1806|1    |
|1897|1    |
|1900|3    |
|1901|7    |
|1902|2    |
|1904|1    |
|1906|1    |
|1908|1    |
|1909|2    |
|1910|1    |
|1911|19   |
|1914|1    |
|1917|1    |
|1919|1    |
|1920|33   |
|1921|2    |
|1922|2    |
+----+-----+
only showing top 20 rows



                                                                                

In [55]:
# List the catalogue entries whose Year is below 1930, newest first, with
# full (untruncated) cell contents.
(
    dataset_books
    .filter(dataset_books['Year'] < 1930)
    .sort('Year', ascending=False)
    .show(truncate=False)
)

+----------+-------------------------------------------------------------------------------------------------+--------------------+----+-----------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|MovieID   |Title                                                                                            |Author              |Year|Publisher              |Image-URL-S                                                 |Image-URL-M                                                 |Image-URL-L                                                 |
+----------+-------------------------------------------------------------------------------------------------+--------------------+----+-----------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------

In [56]:
# Keep only the books whose Year is strictly below 2004.
# NOTE(review): Year 0 rows pass this filter (they still appear in the Year
# counts afterwards), and 2004 itself is dropped even though the Year counts
# list 5839 titles for it. Confirm that this cut-off is intended.
dataset_books = dataset_books.filter(dataset_books['Year'] < 2004)
dataset_books.show()

+----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|   MovieID|               Title|              Author|Year|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+----------+--------------------+--------------------+----+--------------------+--------------------+--------------------+--------------------+
|0195153448| Classical Mythology|  Mark P. O. Morford|2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|0002005018|        Clara Callan|Richard Bruce Wright|2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|0060973129|Decision in Normandy|        Carlo D'Este|1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|0374157065|Flu: The Story of...|    Gina Bari Kolata|1999|Farrar Straus Giroux|http://images.ama...|http://images.ama...|http://images.

In [59]:
# Re-check the ten earliest publication years after the upper cut-off.
(
    dataset_books
    .groupby('Year')
    .count()
    .sort('Year', ascending=True)
    .show(10, truncate=False)
)

+----+-----+
|Year|count|
+----+-----+
|0   |4619 |
|1376|1    |
|1378|1    |
|1806|1    |
|1897|1    |
|1900|3    |
|1901|7    |
|1902|2    |
|1904|1    |
|1906|1    |
+----+-----+
only showing top 10 rows



In [64]:
# Drop the two isolated early years (1376 and 1378), each of which has a
# single entry in the Year counts.
dataset_books = dataset_books.filter(
    ~dataset_books['Year'].isin([1376, 1378])
)

dataset_books.show(truncate=False)

+----------+--------------------------------------------------------------------------------------------------+--------------------+----+---------------------------+------------------------------------------------------------+------------------------------------------------------------+------------------------------------------------------------+
|MovieID   |Title                                                                                             |Author              |Year|Publisher                  |Image-URL-S                                                 |Image-URL-M                                                 |Image-URL-L                                                 |
+----------+--------------------------------------------------------------------------------------------------+--------------------+----+---------------------------+------------------------------------------------------------+------------------------------------------------------------+---------------

In [65]:
# Number of catalogue entries per publisher, largest publishers first.
(
    dataset_books
    .groupby('Publisher')
    .count()
    .sort('count', ascending=False)
    .show(truncate=False)
)



+------------------------+-----+
|Publisher               |count|
+------------------------+-----+
|Harlequin               |7206 |
|Silhouette              |4053 |
|Pocket                  |3865 |
|Ballantine Books        |3688 |
|Bantam Books            |3644 |
|Scholastic              |3139 |
|Simon & Schuster        |2920 |
|Penguin Books           |2795 |
|Penguin USA             |2715 |
|Warner Books            |2656 |
|Berkley Publishing Group|2596 |
|Harpercollins           |2525 |
|Fawcett Books           |2252 |
|Random House Inc        |2045 |
|Signet Book             |1982 |
|St Martins Pr           |1953 |
|St. Martin's Press      |1728 |
|HarperCollins Publishers|1701 |
|Zebra Books             |1641 |
|Tor Books               |1640 |
+------------------------+-----+
only showing top 20 rows



                                                                                

In [73]:
# Per-publisher summary: number of titles and the earliest / latest
# publication year, largest publishers first.
#
# The year bounds are computed with min/max. functions.first/functions.last
# return whichever row happens to come first/last inside each group, which is
# non-deterministic after the shuffle performed by groupby, so they only give
# the year range when the input happens to arrive sorted by Year. min/max do
# not depend on row order. The output column names 'first' and 'last' are
# kept unchanged.
(
    dataset_books
    .groupby('Publisher')
    .agg(
        functions.count('Publisher').alias('count'),
        functions.min('Year').alias('first'),
        functions.max('Year').alias('last'),
    )
    .sort('count', ascending=False)
    .show(truncate=False)
)



+------------------------+-----+-----+----+
|Publisher               |count|first|last|
+------------------------+-----+-----+----+
|Harlequin               |7206 |1902 |2003|
|Silhouette              |4053 |1980 |2003|
|Pocket                  |3865 |0    |2003|
|Ballantine Books        |3688 |0    |2003|
|Bantam Books            |3644 |0    |2003|
|Scholastic              |3139 |0    |2003|
|Simon & Schuster        |2920 |0    |2003|
|Penguin Books           |2795 |0    |2003|
|Penguin USA             |2715 |0    |2003|
|Warner Books            |2656 |0    |2003|
|Berkley Publishing Group|2596 |0    |2003|
|Harpercollins           |2525 |0    |2003|
|Fawcett Books           |2252 |0    |2003|
|Random House Inc        |2045 |1956 |2003|
|Signet Book             |1982 |0    |2003|
|St Martins Pr           |1953 |0    |2002|
|St. Martin's Press      |1728 |0    |2003|
|HarperCollins Publishers|1701 |0    |2003|
|Zebra Books             |1641 |1980 |2003|
|Tor Books               |1640 |

                                                                                