In [2]:
import pyspark
from pyspark.sql import SparkSession

# MONGO CONFIGURATION
mongo_uri = "mongodb://admin:mongopw@mongo:27017/demo.feedback?authSource=admin"

# Spark init
spark = SparkSession \
    .builder \
    .master("local") \
    .appName('jupyter-pyspark') \
      .config("spark.mongodb.input.uri", mongo_uri) \
      .config("spark.mongodb.output.uri", mongo_uri) \
      .config("spark.jars.packages","org.mongodb.spark:mongo-spark-connector_2.12:3.0.1")\
    .getOrCreate()
sc = spark.sparkContext
sc.setLogLevel("ERROR")

print(mongo_uri)

mongodb://admin:mongopw@mongo:27017/demo.feedback?authSource=admin


In [14]:
# 1. Use Spark to load the /datasets/json-samples/US-Senators.json 
# into the MongoDB database labf under the collection senators.

source_file = "file:///home/jovyan/datasets/json-samples/US-Senators.json"
df = spark.read.json(source_file)
# df.printSchema()
df.write.format("mongo").mode("overwrite").option("database", "labf").option("collection", "senators").save()
print(source_file)

file:///home/jovyan/datasets/json-samples/US-Senators.json


In [15]:
# 4. Use Spark to load in the /datasets/netflix-canceled-2021/*.json 
# into MongoDB database labf under the collection nfcan.
source_file = "file:///home/jovyan/datasets/netflix-canceled-2021/*.json"
# nfcan = spark.read.json(source_file)
nfcan = spark.read.option("multiline", True).json(source_file)
#nfcan.printSchema()
#nfcan.show()
nfcan.write.format("mongo").mode("overwrite").option("database", "labf").option("collection", "nfcan").save()
print(source_file)

[Stage 18:>                                                         (0 + 1) / 1]

file:///home/jovyan/datasets/netflix-canceled-2021/*.json


                                                                                

In [19]:
#7.  Using Spark or Spark SQL, create a DataFrame or view from the Netflix Cancellations 
# MongoDB data consisting of show name, season number, episode, number, episode name, airdate
# and average rating (for the episode).

from pyspark.sql.functions import col, explode

tmp = nfcan.select(col("name").alias("showname"),
                   explode("_embedded.episodes").alias("episode")
                  )
eps = tmp.select("showname", "episode.name", "episode.season", "episode.number", "episode.airdate", "episode.rating.average")
eps.show()

+--------------+---------+------+------+----------+-------+
|      showname|     name|season|number|   airdate|average|
+--------------+---------+------+------+----------+-------+
|Peaky Blinders|Episode 1|     1|     1|2013-09-12|    8.3|
|Peaky Blinders|Episode 2|     1|     2|2013-09-19|    8.4|
|Peaky Blinders|Episode 3|     1|     3|2013-09-26|    8.4|
|Peaky Blinders|Episode 4|     1|     4|2013-10-03|    8.4|
|Peaky Blinders|Episode 5|     1|     5|2013-10-10|    8.6|
|Peaky Blinders|Episode 6|     1|     6|2013-10-17|    8.8|
|Peaky Blinders|Episode 1|     2|     1|2014-10-02|    7.8|
|Peaky Blinders|Episode 2|     2|     2|2014-10-09|    8.1|
|Peaky Blinders|Episode 3|     2|     3|2014-10-16|    8.2|
|Peaky Blinders|Episode 4|     2|     4|2014-10-23|    8.2|
|Peaky Blinders|Episode 5|     2|     5|2014-10-30|    8.2|
|Peaky Blinders|Episode 6|     2|     6|2014-11-06|    8.7|
|Peaky Blinders|Episode 1|     3|     1|2016-05-05|    8.3|
|Peaky Blinders|Episode 2|     3|     2|

In [21]:
eps.createOrReplaceTempView("eps")

In [24]:
# 8. Using the query you wrote in Question 7 (if you want), write a Spark or Spark 
# SQL query to get the lowest rated episodes of each season for the cancelled shows. 
# Display show name, season number, episode number, episode name, and 
# rating for that episode. NOTE: Some shows have more than one episode 
# with the lowest rating.
eps.printSchema()
query = '''
WITH Source AS 
(
    SELECT showname, name, season, number, airdate, average,
        MIN(average) OVER (PARTITION BY showname, season) AS low_rating
    FROM eps
)
SELECT * FROM Source WHERE average = low_rating 
'''
spark.sql(query).show()

root
 |-- showname: string (nullable = true)
 |-- name: string (nullable = true)
 |-- season: long (nullable = true)
 |-- number: long (nullable = true)
 |-- airdate: string (nullable = true)
 |-- average: double (nullable = true)



                                                                                

+-----------------+--------------------+------+------+----------+-------+----------+
|         showname|                name|season|number|   airdate|average|low_rating|
+-----------------+--------------------+------+------+----------+-------+----------+
|   Peaky Blinders|           The Shock|     5|     5|2019-09-15|    8.3|       8.3|
| The Last Kingdom|           Episode 1|     4|     1|2020-04-26|    8.6|       8.6|
| The Last Kingdom|           Episode 6|     4|     6|2020-04-26|    8.6|       8.6|
|         The Crew|No One Likes You....|     1|    10|2021-02-15|    5.8|       5.8|
|   Peaky Blinders|           Episode 1|     1|     1|2013-09-12|    8.3|       8.3|
|     Mr. Iglesias|      Generation Why|     2|     4|2020-06-17|    4.0|       4.0|
|Kim's Convenience|        Gay Discount|     1|     1|2016-10-11|    8.0|       8.0|
|Kim's Convenience|     Frank & Nayoung|     1|     4|2016-10-25|    8.0|       8.0|
|Kim's Convenience|   Happy Ummaversary|     4|     4|2020-01-28|

In [31]:
shows = nfcan.select("name").distinct().sort("name").toPandas()["name"].values

                                                                                

In [32]:
# 9 CHALLENGE YOURSELF! Display name of show, a picture of the show, and 
# show summary. Make it interactive so you can select the show and see the details.

from IPython.display import display, HTML, Image
from ipywidgets import interact

display(HTML("<H1>Netflix shows cancelled in 2021</H1>"))

@interact(show =  shows)
def comboChanged(show):
    theShow = nfcan.select("name", "summary", "image.medium", "status", "rating.average")\
        .where(nfcan.name == show).toPandas().iloc[0]
    display(HTML(f"<H3>{theShow['name']}</H3>"))
    display(Image(url=theShow["medium"]))

interactive(children=(Dropdown(description='show', options=('#blackAF', 'Bonding', 'Country Comfort', 'Cowboy …