In [46]:
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum, concat_ws, collect_list, count, avg, upper, round, desc, asc
import os
from dotenv import load_dotenv, dotenv_values

In [2]:
# Connection settings (loaded from the local .env file) and source table names.

config = dotenv_values(".env")

# Each lookup uses [] so a setting missing from .env fails immediately with KeyError.
database = config["DATABASE"]
user = config["USER"]
password = config["PASS"]
port = config["PORT"]
server = config["SERVER"]
jdbc_url = f"jdbc:sqlserver://{server}:{port};databaseName={database};encrypt=true;trustServerCertificate=true"
jdbc_driver_path = config["JDBC_DRIVER_PATH"]

# Schema-qualified names of the tables read over JDBC further down.
album = "dbo.Album"
artist = "dbo.Artist"
customer = "dbo.Customer"
employee = "dbo.Employee"
genre = "dbo.Genre"
invoice = "dbo.Invoice"
invoice_line = "dbo.InvoiceLine"
media_type = "dbo.MediaType"
playlist = "dbo.Playlist"
playlist_track = "dbo.PlaylistTrack"
track = "dbo.Track"

In [3]:
# Build (or reuse) a local Spark session with the JDBC driver jar on the driver classpath.

spark = (
    SparkSession.builder
    .appName("PySpark using Chinook DB")
    .master("local")
    .config("spark.driver.extraClassPath", jdbc_driver_path)
    .getOrCreate()
)

In [35]:
# CREATE DF FOR EACH TABLE USING JDBC

# JDBC driver class shared by every table read below.
JDBC_DRIVER_CLASS = "com.microsoft.sqlserver.jdbc.SQLServerDriver"


def read_table(table_name):
    """Return a DataFrame for `table_name`, read through the JDBC connection.

    The connection details come from the module-level `spark`, `jdbc_url`,
    `user` and `password` defined in the cells above.

    Args:
        table_name: value passed as the JDBC `dbtable` option
            (e.g. "dbo.Album").

    Returns:
        The DataFrame produced by `spark.read ... .load()`.
    """
    return (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", table_name)
        .option("user", user)
        .option("password", password)
        .option("driver", JDBC_DRIVER_CLASS)
        .load()
    )


# One DataFrame per source table; the names are used by the query cells below.
album_df = read_table(album)
artist_df = read_table(artist)
customer_df = read_table(customer)
employee_df = read_table(employee)
genre_df = read_table(genre)
invoice_df = read_table(invoice)
invoice_line_df = read_table(invoice_line)
media_type_df = read_table(media_type)
playlist_df = read_table(playlist)
playlist_track_df = read_table(playlist_track)
track_df = read_table(track)


EASY

In [8]:
# 1. Retrieve the total sales (invoiced amount) for each customer.

# Keep the DataFrame in q1_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# `round` and `sum` here are the pyspark.sql.functions versions imported at the top.
q1_df = (
    customer_df.alias("c")
    .join(invoice_df.alias("i"), col("c.CustomerId") == col("i.CustomerId"), "inner")
    .groupBy(col("c.CustomerId").alias("Customer"))
    .agg(round(sum(col("i.Total")), 1).alias("Total_Sales"))
)

q1_df.show()

+--------+-----------+
|Customer|Total_Sales|
+--------+-----------+
|      31|       37.6|
|      53|       37.6|
|      34|       39.6|
|      28|       43.6|
|      27|       37.6|
|      26|       47.6|
|      44|       41.6|
|      12|       37.6|
|      22|       39.6|
|      47|       37.6|
|       1|       39.6|
|      52|       37.6|
|      13|       37.6|
|      16|       37.6|
|       6|       49.6|
|       3|       39.6|
|      40|       38.6|
|      20|       39.6|
|      57|       46.6|
|      54|       37.6|
+--------+-----------+
only showing top 20 rows



In [61]:
# 2. List all albums by a specific artist (e.g., “Queen”).

# Keep the DataFrame in q2_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# The query builds one row per artist with that artist's album titles joined by ", ";
# it does not restrict to a single artist. To get one artist, filter q2_df on "Name".
q2_df = (
    artist_df.alias("ar")
    .join(album_df.alias("al"), col("ar.ArtistId") == col("al.ArtistId"), "inner")
    .groupBy(col("ar.ArtistId"), col("ar.Name"))
    .agg(concat_ws(", ", collect_list(col("al.Title"))).alias("Albums"))
)

q2_df.show()

+--------+--------------------+--------------------+
|ArtistId|                Name|              Albums|
+--------+--------------------+--------------------+
|       1|               AC/DC|For Those About T...|
|       2|              Accept|Balls to the Wall...|
|       3|           Aerosmith|            Big Ones|
|       4|   Alanis Morissette|  Jagged Little Pill|
|       5|     Alice In Chains|            Facelift|
|       6|Antônio Carlos Jobim|Warner 25 Anos, C...|
|       7|        Apocalyptica|Plays Metallica B...|
|       8|          Audioslave|Audioslave, Out O...|
|       9|            BackBeat| BackBeat Soundtrack|
|      10|        Billy Cobham|The Best Of Billy...|
|      11| Black Label Society|Alcohol Fueled Br...|
|      12|       Black Sabbath|Black Sabbath, Bl...|
|      13|          Body Count|          Body Count|
|      14|     Bruce Dickinson|    Chemical Wedding|
|      15|           Buddy Guy|The Best Of Buddy...|
|      16|      Caetano Veloso|Prenda Minha, S

In [65]:
# 3. Get a list of tracks from a specific genre (e.g., “Rock”).

# Keep the DataFrame in q3_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# The query builds one row per genre with its track names joined by "; ";
# it does not restrict to a single genre. To get one genre, filter q3_df on "Genre".
q3_df = (
    track_df.alias("t")
    .join(genre_df.alias("g"), col("t.GenreId") == col("g.GenreId"), "inner")
    .groupBy(col("g.name").alias("Genre"))
    .agg(concat_ws("; ", collect_list(col("t.name"))).alias("Tracks"))
)

q3_df.show()

+-----------------+--------------------+
|            Genre|              Tracks|
+-----------------+--------------------+
|            World|Pura Elegancia; C...|
|      Hip Hop/Rap|ZeroVinteUm; Quei...|
|         TV Shows|Occupation / Prec...|
|       Bossa Nova|Samba Da Bênção; ...|
|            Latin|Jorge Da Capadóci...|
| Sci Fi & Fantasy|Crossroads, Pt. 1...|
|            Metal|Enter Sandman; Ma...|
|             Rock|For Those About T...|
|             Jazz|Desafinado; Garot...|
|         R&B/Soul|Please Please Ple...|
|            Drama|Don't Look Back; ...|
|       Soundtrack|Vai-Vai 2001; X-9...|
|Electronica/Dance|Just Another Stor...|
|        Classical|Symphony No. 3 in...|
|            Blues|First Time I Met ...|
|      Alternative|War Pigs; Say Hel...|
|    Rock And Roll|Money; Long Tall ...|
|  Science Fiction|Battlestar Galact...|
|              Pop|Dig-Dig, Lambe-La...|
|      Heavy Metal|Wildest Dreams; R...|
+-----------------+--------------------+
only showing top

In [13]:
# 4. Find the total number of invoices for each customer.

# Keep the DataFrame in q4_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# The inner join means customers without any invoice do not appear in the result.
q4_df = (
    customer_df.alias("c")
    .join(invoice_df.alias("i"), col("c.customerid") == col("i.customerid"), "inner")
    .groupBy(col("c.customerid"), col("c.firstname"))
    .agg(count(col("i.invoiceid")))
)

q4_df.show()

+----------+---------+------------------+
|customerid|firstname|count(i.invoiceid)|
+----------+---------+------------------+
|        31|   Martha|                 7|
|        53|     Phil|                 7|
|        34|     João|                 7|
|        28|    Julia|                 7|
|        26|  Richard|                 7|
|        27|  Patrick|                 7|
|        44|    Terhi|                 7|
|        12|  Roberto|                 7|
|        22|  Heather|                 7|
|        47|    Lucas|                 7|
|         1|     Luís|                 7|
|        52|     Emma|                 7|
|        13| Fernanda|                 7|
|         6|   Helena|                 7|
|        16|    Frank|                 7|
|         3| François|                 7|
|        20|      Dan|                 7|
|        40|Dominique|                 7|
|        57|     Luis|                 7|
|        54|    Steve|                 7|
+----------+---------+------------

In [12]:
# 5. Display the average track length (milliseconds) for each album.

# Keep the DataFrame in q5_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# `round` is the pyspark.sql.functions version imported at the top; the average
# is rounded to one decimal place.
q5_df = (
    album_df.alias("al")
    .join(track_df.alias("t"), col("al.albumid") == col("t.albumid"), "inner")
    .groupBy(col("al.albumid").alias("Album no."), col("al.title").alias("Album Name"))
    .agg(round(avg(col("t.milliseconds")), 1).alias("Length"))
)

q5_df.show()

+---------+--------------------+---------+
|Album no.|          Album Name|   Length|
+---------+--------------------+---------+
|      148|         Black Album| 313268.7|
|      243|The Best Of Van H...| 255881.2|
|       31|          Bongo Fury| 273992.1|
|       85|As Canções de Eu ...| 206159.7|
|      137|The Song Remains ...| 588794.2|
|      251|The Office, Season 3|1532683.8|
|       65|        Stormbringer| 244119.7|
|       53|        Vozes do MPB| 204191.3|
|      255|Instant Karma: Th...| 223255.9|
|      133|     Led Zeppelin II| 277652.1|
|      296|A Copland Celebra...| 198064.0|
|       78|        Deixa Entrar| 203032.6|
|      322|               Frank| 275982.5|
|      321|       Back to Black| 212004.2|
|      108|   Rock In Rio [CD1]| 338658.8|
|      155|           St. Anger| 409732.5|
|       34|Chill: Brazil (Di...| 248321.1|
|      193|Blood Sugar Sex M...| 261004.3|
|      211|         The Singles| 214704.4|
|      101|             Killers| 232369.1|
+---------+

In [8]:
# 6. Retrieve all customers from the “USA” and their invoices.

# Keep the DataFrame in q6_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# NOTE(review): the filter is on the invoice's billing country, not on a
# customer-level country column — confirm this is the intended meaning of "from the USA".
q6_df = (
    customer_df.alias("c")
    .join(invoice_df.alias("i"), col("c.customerid") == col("i.customerid"), "inner")
    .select(col("c.customerid"), col("c.firstname"), col("i.invoiceid"), col("i.billingcountry"))
    # upper() makes the comparison case-insensitive.
    .where(upper(col("i.billingcountry")) == "USA")
)

q6_df.show()

+----------+---------+---------+--------------+
|customerid|firstname|invoiceid|billingcountry|
+----------+---------+---------+--------------+
|        28|    Julia|       71|           USA|
|        28|    Julia|       82|           USA|
|        28|    Julia|      137|           USA|
|        28|    Julia|      266|           USA|
|        28|    Julia|      289|           USA|
|        28|    Julia|      311|           USA|
|        28|    Julia|      363|           USA|
|        27|  Patrick|       39|           USA|
|        26|  Richard|       70|           USA|
|        26|  Richard|       93|           USA|
|        26|  Richard|      115|           USA|
|        26|  Richard|      167|           USA|
|        27|  Patrick|      168|           USA|
|        27|  Patrick|      191|           USA|
|        27|  Patrick|      213|           USA|
|        27|  Patrick|      265|           USA|
|        26|  Richard|      288|           USA|
|        26|  Richard|      299|        

In [14]:
# 7. Show the total number of tracks in each genre.

# Keep the DataFrame in q7_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# The inner join means genres without any track do not appear in the result.
q7_df = (
    genre_df.alias("g")
    .join(track_df.alias("t"), col("g.genreid") == col("t.genreid"), "inner")
    .groupBy(col("g.name").alias("Genre"))
    .agg(count(col("t.trackid")).alias("no_of_tracks"))
)

q7_df.show()

+-----------------+------------+
|            Genre|no_of_tracks|
+-----------------+------------+
|            World|          28|
|      Hip Hop/Rap|          35|
|         TV Shows|          93|
|       Bossa Nova|          15|
|            Latin|         579|
| Sci Fi & Fantasy|          26|
|            Metal|         374|
|             Rock|        1297|
|             Jazz|         130|
|         R&B/Soul|          61|
|            Drama|          64|
|       Soundtrack|          43|
|Electronica/Dance|          30|
|        Classical|          74|
|            Blues|          81|
|      Alternative|          40|
|    Rock And Roll|          12|
|  Science Fiction|          13|
|              Pop|          48|
|      Heavy Metal|          28|
+-----------------+------------+
only showing top 20 rows



In [17]:
# 8. List all tracks for a specific album (e.g., "Abbey Road").

# Keep the DataFrame in q8_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# The query builds one row per album title with its track names joined by "; ";
# it does not restrict to a single album. To get one album, filter q8_df on "Album".
# Grouping is by title only, so distinct albums that share a title are combined.
q8_df = (
    album_df.alias("al")
    .join(track_df.alias("t"), col("al.albumid") == col("t.albumid"), "inner")
    .groupBy(col("al.title").alias("Album"))
    .agg(concat_ws("; ", collect_list(col("t.name"))).alias("Tracks"))
)

q8_df.show()


+--------------------+--------------------+
|               Album|              Tracks|
+--------------------+--------------------+
|...And Justice Fo...|Blackened; ...And...|
|20th Century Mast...|Rock You Like a H...|
|A Copland Celebra...|Fanfare for the C...|
|A Matter of Life ...|Different World; ...|
|     A Real Dead One|The Number Of The...|
|     A Real Live One|Be Quick Or Be De...|
|  A Soprano Inspired|           Ave Maria|
|A TempestadeTempe...|Natália; L'Avvent...|
|             A-Sides|Nothing To Say; F...|
|       Ace Of Spades|Ace Of Spades; Lo...|
|        Achtung Baby|Zoo Station; Even...|
|            Acústico|Comida; Go Back; ...|
|        Acústico MTV|Vulcão Dub - Fui ...|
| Acústico MTV [Live]|Girassol; A Sombr...|
|Adams, John: The ...|Two Fanfares for ...|
|Adorate Deum: Gre...|Intoitus: Adorate...|
|      Afrociberdelia|Mateus Enter; O C...|
|   Album Of The Year|Collision; Strips...|
|Alcohol Fueled Br...|Intro/ Low Down; ...|
|Alcohol Fueled Br...|Heart Of G

In [44]:
# 9. Find customers who have not placed any invoices.

# Keep the DataFrame in q9_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# A left anti join keeps only the customer rows that have no matching invoice.
q9_df = (
    customer_df.alias("c")
    .join(invoice_df.alias("i"), col("c.customerid") == col("i.customerid"), "leftanti")
    .select(col("c.customerid"))
)

q9_df.show()

+----------+
|customerid|
+----------+
|        60|
+----------+



In [47]:
# 10. Retrieve the most popular genres by total number of tracks.

# Keep the DataFrame in q10_df and display it separately: show() only prints
# and returns None, so chaining it onto the assignment would store None.
# Genres are ranked by track count, largest first.
q10_df = (
    genre_df.alias("g")
    .join(track_df.alias("t"), col("g.genreid") == col("t.genreid"), "inner")
    .groupBy(col("g.name").alias("Genre"))
    .agg(count(col("t.trackid")).alias("Total_tracks"))
    .orderBy(desc("Total_tracks"))
)

q10_df.show()

+------------------+------------+
|             Genre|Total_tracks|
+------------------+------------+
|              Rock|        1297|
|             Latin|         579|
|             Metal|         374|
|Alternative & Punk|         332|
|              Jazz|         130|
|          TV Shows|          93|
|             Blues|          81|
|         Classical|          74|
|             Drama|          64|
|          R&B/Soul|          61|
|            Reggae|          58|
|               Pop|          48|
|        Soundtrack|          43|
|       Alternative|          40|
|       Hip Hop/Rap|          35|
| Electronica/Dance|          30|
|             World|          28|
|       Heavy Metal|          28|
|  Sci Fi & Fantasy|          26|
|    Easy Listening|          24|
+------------------+------------+
only showing top 20 rows



In [5]:
# Stops the Spark session created near the top of the notebook. Presumably left
# commented out so the session stays available while cells are re-run — uncomment
# once all queries are finished.
#spark.stop()