In [2]:
from IPython.display import Image
from IPython.core.display import HTML

# Connecting to the database

In [3]:
from db_tools.setup import setup
from pyspark.sql.types import StructType,StructField, StringType, IntegerType

In [4]:
spark = setup()

23/10/24 23:23:29 WARN Utils: Your hostname, luan-Dell-G15-5520 resolves to a loopback address: 127.0.1.1; using 192.168.1.12 instead (on interface wlp0s20f3)
23/10/24 23:23:29 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/10/24 23:23:30 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


SparkSession available as "spark"


In [5]:
def show_query(query,SparkSession = spark,n = 20):
    return SparkSession.sql(query).show()

# Tip 1: Alias is your friends (AS is Good practice)

In [6]:
query = """SELECT
	c.customer_id,
	first_name,
	amount,
	payment_date
FROM
	customer AS c
INNER JOIN payment AS p 
    ON p.customer_id = c.customer_id
ORDER BY
    payment_date DESC;"""

In [7]:
show_query(query)

+-----------+----------+------+--------------------+
|customer_id|first_name|amount|        payment_date|
+-----------+----------+------+--------------------+
|         28|   Cynthia|  2.99|2007-05-14 13:44:...|
|        111|    Carmen|  2.99|2007-05-14 13:44:...|
|        497|   Gilbert|  4.99|2007-05-14 13:44:...|
|        496|     Tyler|  2.99|2007-05-14 13:44:...|
|        596|   Enrique|  0.99|2007-05-14 13:44:...|
|        516|     Elmer|  0.00|2007-05-14 13:44:...|
|        300|      John|  4.99|2007-05-14 13:44:...|
|         53|   Heather|  7.98|2007-05-14 13:44:...|
|        412|     Allen|  0.99|2007-05-14 13:44:...|
|        296|    Ramona|  2.99|2007-05-14 13:44:...|
|        587|    Sergio|  0.99|2007-05-14 13:44:...|
|        115|     Wendy|  2.99|2007-05-14 13:44:...|
|         44|     Marie|  4.99|2007-05-14 13:44:...|
|        597|   Freddie|  4.99|2007-05-14 13:44:...|
|        192|    Laurie|  4.99|2007-05-14 13:44:...|
|        155|      Gail|  7.98|2007-05-14 13:4

# Alias in SELF join are mandatory

```
SELECT
    e.first_name employee,
    m .first_name manager
FROM
    employee e
INNER JOIN employee m 
    ON m.employee_id = e.manager_id
ORDER BY manager;
```

# JOINS - Overview

<center><img width=750 src="https://gorilla.bi/wp-content/uploads/2022/10/Join-Types-Cheat-Sheet.png"/></center>

# Classical JOINs

<center><img width=750 src="https://archive.is/vagHZ/8a2b050a9297734156662434347f3148285e421c.png"/></center>

# Other types of join

<center><img width=750 src="https://archive.is/vagHZ/88409728af608e2ade8d094abd9b15ac8d40e026.png"/></center>

# INNER JOIN examples

## without an ALIAS

In [8]:
query = """
SELECT
	customer.customer_id,
	first_name,
	last_name,
	amount,
	payment_date
FROM
	customer
INNER JOIN payment 
    ON payment.customer_id = customer.customer_id
ORDER BY payment_date;
"""

In [9]:
show_query(query)

+-----------+----------+---------+------+--------------------+
|customer_id|first_name|last_name|amount|        payment_date|
+-----------+----------+---------+------+--------------------+
|        416|   Jeffery|   Pinson|  2.99|2007-02-14 21:21:...|
|        516|     Elmer|      Noe|  4.99|2007-02-14 21:23:...|
|        239|    Minnie|   Romero|  4.99|2007-02-14 21:29:...|
|        592|  Terrance|    Roush|  6.99|2007-02-14 21:41:...|
|         49|     Joyce|  Edwards|  0.99|2007-02-14 21:44:...|
|        264| Gwendolyn|      May|  3.99|2007-02-14 21:44:...|
|         46| Catherine| Campbell|  4.99|2007-02-14 21:45:...|
|        481|    Herman|   Devore|  2.99|2007-02-14 22:03:...|
|        139|     Amber|    Dixon|  2.99|2007-02-14 22:11:...|
|        595|  Terrence|Gunderson|  2.99|2007-02-14 22:16:...|
|        191|  Jeanette|   Greene|  2.99|2007-02-14 22:23:...|
|         95|     Paula|   Bryant|  2.99|2007-02-14 22:41:...|
|        197|       Sue|   Peters|  2.99|2007-02-14 22:

## with alias

In [10]:
query = """
SELECT
	c.customer_id,
	c.first_name customer_first_name,
	c.last_name customer_last_name,
	s.first_name staff_first_name,
	s.last_name staff_last_name,
	amount,
	payment_date
FROM
	customer c
INNER JOIN payment p 
    ON p.customer_id = c.customer_id
INNER JOIN staff s 
    ON p.staff_id = s.staff_id
ORDER BY payment_date;
"""

In [11]:
show_query(query)

+-----------+-------------------+------------------+----------------+---------------+------+--------------------+
|customer_id|customer_first_name|customer_last_name|staff_first_name|staff_last_name|amount|        payment_date|
+-----------+-------------------+------------------+----------------+---------------+------+--------------------+
|        416|            Jeffery|            Pinson|             Jon|       Stephens|  2.99|2007-02-14 21:21:...|
|        516|              Elmer|               Noe|             Jon|       Stephens|  4.99|2007-02-14 21:23:...|
|        239|             Minnie|            Romero|            Mike|        Hillyer|  4.99|2007-02-14 21:29:...|
|        592|           Terrance|             Roush|             Jon|       Stephens|  6.99|2007-02-14 21:41:...|
|         49|              Joyce|           Edwards|            Mike|        Hillyer|  0.99|2007-02-14 21:44:...|
|        264|          Gwendolyn|               May|             Jon|       Stephens|  3

## USING clause

In [12]:
query = """SELECT
	customer_id,
	first_name,
	last_name,
	amount,
	payment_date
FROM
	customer
INNER JOIN payment USING(customer_id)
ORDER BY payment_date;	"""
show_query(query)

+-----------+----------+---------+------+--------------------+
|customer_id|first_name|last_name|amount|        payment_date|
+-----------+----------+---------+------+--------------------+
|        416|   Jeffery|   Pinson|  2.99|2007-02-14 21:21:...|
|        516|     Elmer|      Noe|  4.99|2007-02-14 21:23:...|
|        239|    Minnie|   Romero|  4.99|2007-02-14 21:29:...|
|        592|  Terrance|    Roush|  6.99|2007-02-14 21:41:...|
|         49|     Joyce|  Edwards|  0.99|2007-02-14 21:44:...|
|        264| Gwendolyn|      May|  3.99|2007-02-14 21:44:...|
|         46| Catherine| Campbell|  4.99|2007-02-14 21:45:...|
|        481|    Herman|   Devore|  2.99|2007-02-14 22:03:...|
|        139|     Amber|    Dixon|  2.99|2007-02-14 22:11:...|
|        595|  Terrence|Gunderson|  2.99|2007-02-14 22:16:...|
|        191|  Jeanette|   Greene|  2.99|2007-02-14 22:23:...|
|         95|     Paula|   Bryant|  2.99|2007-02-14 22:41:...|
|        197|       Sue|   Peters|  2.99|2007-02-14 22:

# LEFT JOIN

## Basic usage

In [13]:
query = """
SELECT
	f.film_id,
	title,
	inventory_id
FROM
	film AS f
LEFT JOIN inventory AS i
    ON i.film_id = f.film_id
ORDER BY title;
"""

In [14]:
show_query(query)

+-------+----------------+------------+
|film_id|           title|inventory_id|
+-------+----------------+------------+
|      1|Academy Dinosaur|           1|
|      1|Academy Dinosaur|           8|
|      1|Academy Dinosaur|           7|
|      1|Academy Dinosaur|           6|
|      1|Academy Dinosaur|           5|
|      1|Academy Dinosaur|           4|
|      1|Academy Dinosaur|           3|
|      1|Academy Dinosaur|           2|
|      2|  Ace Goldfinger|          11|
|      2|  Ace Goldfinger|          10|
|      2|  Ace Goldfinger|           9|
|      3|Adaptation Holes|          13|
|      3|Adaptation Holes|          12|
|      3|Adaptation Holes|          15|
|      3|Adaptation Holes|          14|
|      4|Affair Prejudice|          22|
|      4|Affair Prejudice|          21|
|      4|Affair Prejudice|          20|
|      4|Affair Prejudice|          19|
|      4|Affair Prejudice|          18|
+-------+----------------+------------+
only showing top 20 rows



## Other uses

In [15]:
query = """
SELECT
	film.film_id,
	film.title,
	inventory_id
FROM
	film
LEFT JOIN inventory 
   ON inventory.film_id = film.film_id
WHERE inventory.film_id IS NULL
ORDER BY title;
"""

In [16]:
show_query(query)

+-------+--------------------+------------+
|film_id|               title|inventory_id|
+-------+--------------------+------------+
|     14|      Alice Fantasia|        null|
|     33|         Apollo Teen|        null|
|     36|      Argonauts Town|        null|
|     38|       Ark Ridgemont|        null|
|     41|Arsenic Independence|        null|
|     87|   Boondock Ballroom|        null|
|    108|       Butch Panther|        null|
|    128|       Catch Amistad|        null|
|    144| Chinatown Gladiator|        null|
|    148|      Chocolate Duck|        null|
|    171|Commandments Express|        null|
|    192|    Crossing Divorce|        null|
|    195|     Crowds Telemark|        null|
|    198|    Crystal Breaking|        null|
|    217|          Dazed Punk|        null|
|    221|Deliverance Mulho...|        null|
|    318|   Firehouse Vietnam|        null|
|    325|       Floats Garden|        null|
|    332|Frankenstein Stra...|        null|
|    359|  Gladiator Westward|  

## Don't forget: use ALIAS!

In [17]:
query = """SELECT
	f.film_id,
	title,
	inventory_id
FROM
	film AS f
LEFT JOIN inventory AS i
   ON i.film_id = f.film_id
WHERE i.film_id IS NULL
ORDER BY title;"""

In [18]:
show_query(query)

+-------+--------------------+------------+
|film_id|               title|inventory_id|
+-------+--------------------+------------+
|     14|      Alice Fantasia|        null|
|     33|         Apollo Teen|        null|
|     36|      Argonauts Town|        null|
|     38|       Ark Ridgemont|        null|
|     41|Arsenic Independence|        null|
|     87|   Boondock Ballroom|        null|
|    108|       Butch Panther|        null|
|    128|       Catch Amistad|        null|
|    144| Chinatown Gladiator|        null|
|    148|      Chocolate Duck|        null|
|    171|Commandments Express|        null|
|    192|    Crossing Divorce|        null|
|    195|     Crowds Telemark|        null|
|    198|    Crystal Breaking|        null|
|    217|          Dazed Punk|        null|
|    221|Deliverance Mulho...|        null|
|    318|   Firehouse Vietnam|        null|
|    325|       Floats Garden|        null|
|    332|Frankenstein Stra...|        null|
|    359|  Gladiator Westward|  

# RIGHT JOIN

## Creating mock data

In [19]:
film_schema = StructType([ \
    StructField("film_id",IntegerType(),True), \
    StructField("title",StringType(),True)
  ])
films = [(1,'Joker'),
            (2,'Avengers'),
            (3,'Parasite')]
columns = ['film_id','title']
rdd = spark.sparkContext.parallelize(films)
spark.createDataFrame(rdd, schema = film_schema).createOrReplaceTempView('films')

In [20]:
ratings_schema = StructType([ \
    StructField("review_id",IntegerType(),True),
    StructField("film_id",IntegerType(),True),
    StructField("review",StringType(),True),
  ])
ratings = [(1,1,'Excellent'),
         (2,1,'Awesome'),
         (3,2,'Cool'),
         (4,None,'Beautiful')]
columns = ['review_id','film_id','review']
rdd = spark.sparkContext.parallelize(ratings)
spark.createDataFrame(rdd, schema = ratings_schema).createOrReplaceTempView('film_reviews')

## Common use

In [21]:
query = """
SELECT 
   review, 
   title
FROM 
   films
RIGHT JOIN film_reviews 
   ON film_reviews.film_id = films.film_id;
"""

In [22]:
show_query(query)



+---------+--------+
|   review|   title|
+---------+--------+
|Excellent|   Joker|
|  Awesome|   Joker|
|     Cool|Avengers|
|Beautiful|    null|
+---------+--------+



                                                                                

## USING clause

In [23]:
query =  """
            SELECT review, title
            FROM films
            RIGHT JOIN film_reviews USING (film_id);
"""

In [24]:
show_query(query)

                                                                                

+---------+--------+
|   review|   title|
+---------+--------+
|Excellent|   Joker|
|  Awesome|   Joker|
|     Cool|Avengers|
|Beautiful|    null|
+---------+--------+



## RIGHT JOIN with WHERE

In [25]:
query = """
SELECT review, title
FROM films
RIGHT JOIN film_reviews using (film_id)
WHERE title IS NULL;
"""
show_query(query)

+---------+-----+
|   review|title|
+---------+-----+
|Beautiful| null|
+---------+-----+



## Transforming a RIGHT JOIN into a LEFT JOIN

In [26]:
query =  """
            SELECT review, title
            FROM film_reviews
            LEFT JOIN films USING (film_id);
"""
show_query(query)

+---------+--------+
|   review|   title|
+---------+--------+
|Excellent|   Joker|
|  Awesome|   Joker|
|     Cool|Avengers|
|Beautiful|    null|
+---------+--------+



# SELF JOIN

### Example

<center><img width=750 src="https://www.postgresqltutorial.com/wp-content/uploads/2018/03/PostgreSQL-Self-Join-Reporting-Structure.png"/></center>

### Building the example

In [27]:
company_schema = StructType([ \
    StructField("employee_id",IntegerType(),True),
    StructField("first_name",StringType(),True),
    StructField("last_name",StringType(),True),
    StructField("manager_id",IntegerType(),True),
  ])
employees = [(1, 'Windy', 'Hays', None),
             (2, 'Ava', 'Christensen', 1),
             (3, 'Hassan', 'Conner', 1),
             (4, 'Anna', 'Reeves', 2),
             (5, 'Sau', 'Norman', 2),
             (6, 'Kelsie', 'Hays', 3),
             (7, 'Tory', 'Goff', 3),
             (8, 'Salley', 'Lester', 3)]

rdd = spark.sparkContext.parallelize(employees)
spark.createDataFrame(rdd, schema = company_schema).createOrReplaceTempView('employee')

### query example

In [28]:
query = """
SELECT
    e.first_name || ' ' || e.last_name employee,
    m .first_name || ' ' || m.last_name manager
FROM
    employee AS e
INNER JOIN employee AS m ON m.employee_id = e.manager_id
ORDER BY manager;
"""

In [29]:
show_query(query)

+---------------+---------------+
|       employee|        manager|
+---------------+---------------+
|    Anna Reeves|Ava Christensen|
|     Sau Norman|Ava Christensen|
|    Kelsie Hays|  Hassan Conner|
|      Tory Goff|  Hassan Conner|
|  Salley Lester|  Hassan Conner|
|Ava Christensen|     Windy Hays|
|  Hassan Conner|     Windy Hays|
+---------------+---------------+



### Showing who's the boss

In [30]:
query = """
SELECT
    e.first_name || ' ' || e.last_name employee,
    m .first_name || ' ' || m .last_name manager
FROM
    employee e
LEFT JOIN employee m ON m .employee_id = e.manager_id
ORDER BY manager;
"""

In [31]:
show_query(query)

+---------------+---------------+
|       employee|        manager|
+---------------+---------------+
|     Windy Hays|           null|
|    Anna Reeves|Ava Christensen|
|     Sau Norman|Ava Christensen|
|    Kelsie Hays|  Hassan Conner|
|      Tory Goff|  Hassan Conner|
|  Salley Lester|  Hassan Conner|
|Ava Christensen|     Windy Hays|
|  Hassan Conner|     Windy Hays|
+---------------+---------------+



### Showing films with the same length

In [32]:
query = """SELECT
    f1.title,
    f2.title,
    f1.length
FROM
    film f1
INNER JOIN film f2 
    ON f1.film_id <> f2.film_id AND 
       f1.length = f2.length;"""

In [33]:
show_query(query)

+--------------------+--------------------+------+
|               title|               title|length|
+--------------------+--------------------+------+
|    Army Flintstones|    Telegraph Voyage|   148|
|    Army Flintstones|        Outlaw Hanky|   148|
|    Army Flintstones|Flintstones Happi...|   148|
|    Army Flintstones|Fireball Philadel...|   148|
|    Army Flintstones|      Easy Gladiator|   148|
|    Army Flintstones|     Blanket Beverly|   148|
|     Blanket Beverly|    Telegraph Voyage|   148|
|     Blanket Beverly|        Outlaw Hanky|   148|
|     Blanket Beverly|Flintstones Happi...|   148|
|     Blanket Beverly|Fireball Philadel...|   148|
|     Blanket Beverly|      Easy Gladiator|   148|
|     Blanket Beverly|    Army Flintstones|   148|
|      Easy Gladiator|    Telegraph Voyage|   148|
|      Easy Gladiator|        Outlaw Hanky|   148|
|      Easy Gladiator|Flintstones Happi...|   148|
|      Easy Gladiator|Fireball Philadel...|   148|
|      Easy Gladiator|     Blan

# FULL OUTER JOIN

In [34]:
departments_schema = StructType([ \
    StructField("department_id",IntegerType(),True),
    StructField("department_name",StringType(),True)])
employee_schema = StructType([ \
    StructField("employee_id",IntegerType(),True),
    StructField("employee_name",StringType(),True),
    StructField("department_id",IntegerType(),True)])

In [35]:
departments = [(1, 'Sales'),
               (2, 'Marketing'),
               (3, 'HR'),
               (4, 'IT'),
               (5,'Production')]

rdd = spark.sparkContext.parallelize(departments)
spark.createDataFrame(rdd, schema = departments_schema).createOrReplaceTempView('departments')

In [36]:
employees = [(1,'Bette Nicholson', 1),
        	 (2,'Christian Gable', 1),
        	 (3,'Joe Swank', 2),
        	 (4,'Fred Costner', 3),
        	 (5,'Sandra Kilmer', 4),
        	 (6,'Julia Mcqueen', None)]
rdd = spark.sparkContext.parallelize(employees)
spark.createDataFrame(rdd, schema = employee_schema).createOrReplaceTempView('employees')

In [37]:
show_query('SELECT * FROM departments')

+-------------+---------------+
|department_id|department_name|
+-------------+---------------+
|            1|          Sales|
|            2|      Marketing|
|            3|             HR|
|            4|             IT|
|            5|     Production|
+-------------+---------------+



In [38]:
show_query('SELECT * FROM employees')

+-----------+---------------+-------------+
|employee_id|  employee_name|department_id|
+-----------+---------------+-------------+
|          1|Bette Nicholson|            1|
|          2|Christian Gable|            1|
|          3|      Joe Swank|            2|
|          4|   Fred Costner|            3|
|          5|  Sandra Kilmer|            4|
|          6|  Julia Mcqueen|         null|
+-----------+---------------+-------------+



### Checking the company structure

In [39]:
query = """SELECT
	employee_name,
	department_name
FROM
	employees AS e
FULL OUTER JOIN departments AS d 
        ON d.department_id = e.department_id;
"""

In [40]:
show_query(query)

+---------------+---------------+
|  employee_name|department_name|
+---------------+---------------+
|  Julia Mcqueen|           null|
|Christian Gable|          Sales|
|Bette Nicholson|          Sales|
|      Joe Swank|      Marketing|
|   Fred Costner|             HR|
|  Sandra Kilmer|             IT|
|           null|     Production|
+---------------+---------------+



### Checking department without employees

In [47]:
query = """SELECT
	employee_name,
	department_name
FROM
	employees AS e
FULL OUTER JOIN departments AS d 
        ON d.department_id = e.department_id
WHERE employee_name IS NULL
"""

In [48]:
show_query(query)

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|         null|     Production|
+-------------+---------------+



### Checking employees without department

In [49]:
query = """SELECT
	employee_name,
	department_name
FROM
	employees AS e
FULL OUTER JOIN departments AS d 
        ON d.department_id = e.department_id
WHERE department_name IS NULL
"""

In [50]:
show_query(query)

+-------------+---------------+
|employee_name|department_name|
+-------------+---------------+
|Julia Mcqueen|           null|
+-------------+---------------+



# CROSS JOIN

### How many relationships between employees it can have? Show the combinations in a table

In [98]:
query = """SELECT SUM(1) AS total
FROM
	employees AS e1
CROSS JOIN employees AS e2
WHERE e1.employee_name <> e2.employee_name
"""

In [99]:
show_query(query)



+-----+
|total|
+-----+
|   30|
+-----+



                                                                                

In [94]:
query = """SELECT e1.employee_name AS employee_A,e2.employee_name AS employee_B
FROM
	employees AS e1
CROSS JOIN employees AS e2
WHERE e1.employee_name <> e2.employee_name
"""

In [95]:
show_query(query)



+---------------+---------------+
|     employee_A|     employee_B|
+---------------+---------------+
|Bette Nicholson|Christian Gable|
|Bette Nicholson|      Joe Swank|
|Bette Nicholson|   Fred Costner|
|Bette Nicholson|  Sandra Kilmer|
|Bette Nicholson|  Julia Mcqueen|
|Christian Gable|Bette Nicholson|
|Christian Gable|      Joe Swank|
|Christian Gable|   Fred Costner|
|Christian Gable|  Sandra Kilmer|
|Christian Gable|  Julia Mcqueen|
|      Joe Swank|Bette Nicholson|
|      Joe Swank|Christian Gable|
|      Joe Swank|   Fred Costner|
|      Joe Swank|  Sandra Kilmer|
|      Joe Swank|  Julia Mcqueen|
|   Fred Costner|Bette Nicholson|
|   Fred Costner|Christian Gable|
|   Fred Costner|      Joe Swank|
|   Fred Costner|  Sandra Kilmer|
|   Fred Costner|  Julia Mcqueen|
+---------------+---------------+
only showing top 20 rows



                                                                                

### How many interactions between department it can have? Show a table with all combinations

In [100]:
query = """SELECT SUM(1)
FROM
	departments AS d1
CROSS JOIN departments AS d2 
WHERE d1.department_name <> d2.department_name
"""

In [101]:
show_query(query)



+------+
|sum(1)|
+------+
|    20|
+------+



                                                                                

In [96]:
query = """SELECT d1.department_name,d2.department_name
FROM
	departments AS d1
CROSS JOIN departments AS d2 
WHERE d1.department_name <> d2.department_name
"""

In [97]:
show_query(query)



+---------------+---------------+
|department_name|department_name|
+---------------+---------------+
|          Sales|      Marketing|
|          Sales|             HR|
|          Sales|             IT|
|          Sales|     Production|
|      Marketing|          Sales|
|      Marketing|             HR|
|      Marketing|             IT|
|      Marketing|     Production|
|             HR|          Sales|
|             HR|      Marketing|
|             HR|             IT|
|             HR|     Production|
|             IT|          Sales|
|             IT|      Marketing|
|             IT|             HR|
|             IT|     Production|
|     Production|          Sales|
|     Production|      Marketing|
|     Production|             HR|
|     Production|             IT|
+---------------+---------------+



                                                                                

## Natural Join

In [122]:
band_schema1 = StructType([ \
    StructField("role",StringType(),True),
    StructField("name_rhcp",StringType(),True)])
band_schema2 = StructType([ \
    StructField("role",StringType(),True),
    StructField("name_muse",StringType(),True)])

In [126]:
band_A = [('vocal', 'Anthony Kiedis'),
               ('guitar', 'John Frusciante'),
               ('bass', 'Flea'),
               ('drums', 'Chad Smith')]
band_B = [('vocal', 'Matt Bellamy'),
               ('guitar', 'Matt Bellamy'),
               ('bass', 'Chris Wolstenholme'),
               ('drums', 'Dominic Howard')]
rdd = spark.sparkContext.parallelize(band_A)
spark.createDataFrame(rdd, schema = band_schema1).createOrReplaceTempView('band1')
rdd = spark.sparkContext.parallelize(band_B)
spark.createDataFrame(rdd, schema = band_schema2).createOrReplaceTempView('band2')

In [127]:
query = """SELECT * 
FROM band2
NATURAL JOIN band1"""

In [128]:
show_query(query)

+------+------------------+---------------+
|  role|         name_muse|      name_rhcp|
+------+------------------+---------------+
|  bass|Chris Wolstenholme|           Flea|
| drums|    Dominic Howard|     Chad Smith|
|guitar|      Matt Bellamy|John Frusciante|
| vocal|      Matt Bellamy| Anthony Kiedis|
+------+------------------+---------------+



# Exercise: redo all the above with Pyspark API