In [1]:
from pyspark.sql import SparkSession
import pyspark.sql.types as T 
import pyspark.sql.functions as F


spark = SparkSession \
    .builder \
    .appName("Exercise Solutions") \
    .getOrCreate()

# SQL Social-Network Query Exercises

Students at your hometown high school have decided to organize their social network using databases. So far, they have collected information about sixteen students in four grades, 9-12. 

Here's the schema:

Highschooler ( ID, name, grade )
English: There is a high school student with unique *ID* and a given *first name* in a certain *grade*.

Friend ( ID1, ID2 )
English: The student with *ID1* is friends with the student with *ID2*. Friendship is mutual, so if (123, 456) is in the Friend table, so is (456, 123).

Likes ( ID1, ID2 )
English: The student with *ID1* likes the student with *ID2*. Liking someone is not necessarily mutual, so if (123, 456) is in the Likes table, there is no guarantee that (456, 123) is also present.

### Highschooler

In [2]:
highschooler_df = spark.read.csv('./data/social/highschooler.csv', header=True)
highschooler_df.show(truncate=False)

+----+---------+-----+
|id  |name     |grade|
+----+---------+-----+
|1510|Jordan   |9    |
|1689|Gabriel  |9    |
|1381|Tiffany  |9    |
|1709|Cassandra|9    |
|1101|Haley    |10   |
|1782|Andrew   |10   |
|1468|Kris     |10   |
|1641|Brittany |10   |
|1247|Alexis   |11   |
|1316|Austin   |11   |
|1911|Gabriel  |11   |
|1501|Jessica  |11   |
|1304|Jordan   |12   |
|1025|John     |12   |
|1934|Kyle     |12   |
|1661|Logan    |12   |
+----+---------+-----+



### Friend

In [3]:
friend_df = spark.read.csv('./data/social/friend.csv', header=True)
friend_df.show(truncate=False)

+----+----+
|id1 |id2 |
+----+----+
|1510|1381|
|1510|1689|
|1689|1709|
|1381|1247|
|1709|1247|
|1689|1782|
|1782|1468|
|1782|1316|
|1782|1304|
|1468|1101|
|1468|1641|
|1101|1641|
|1247|1911|
|1247|1501|
|1911|1501|
|1501|1934|
|1316|1934|
|1934|1304|
|1304|1661|
|1661|1025|
+----+----+
only showing top 20 rows



### Likes

In [4]:
likes_df = spark.read.csv('./data/social/likes.csv', header=True)
likes_df.show(truncate=False)

+----+----+
|id1 |id2 |
+----+----+
|1689|1709|
|1709|1689|
|1782|1709|
|1911|1247|
|1247|1468|
|1641|1468|
|1316|1304|
|1501|1934|
|1934|1501|
|1025|1101|
+----+----+



## Q1
Find the names of all students who are friends with someone named Gabriel.

In [5]:
h2 = highschooler_df.alias('h2')

highschooler_df \
    .alias('h1') \
    .join(friend_df, highschooler_df.id == friend_df.id1) \
    .join(h2, friend_df.id2 == h2.id) \
    .where(F.col('h2.name') == 'Gabriel') \
    .select('h1.name') \
    .show(truncate=False)

+---------+
|name     |
+---------+
|Jordan   |
|Alexis   |
|Cassandra|
|Andrew   |
|Jessica  |
+---------+



## Q2
For every student who likes someone 2 or more grades younger than themselves, return that student's name and grade, and the name and grade of the student they like.

In [6]:
h2 = highschooler_df.alias('h2')

highschooler_df \
    .alias('h1') \
    .join(likes_df, highschooler_df.id == likes_df.id1) \
    .join(h2, likes_df.id2 == h2.id) \
    .where(F.col('h1.grade') - F.col('h2.grade') >= 2) \
    .select('h1.name', 'h1.grade', 'h2.name', 'h2.grade') \
    .show(truncate=False)

+----+-----+-----+-----+
|name|grade|name |grade|
+----+-----+-----+-----+
|John|12   |Haley|10   |
+----+-----+-----+-----+



## Q3
For every pair of students who both like each other, return the name and grade of both students. Include each pair only once, with the two names in alphabetical order.

In [7]:
h2 = highschooler_df.alias('h2')
l2 = likes_df.alias('l2')

highschooler_df \
    .join(likes_df, highschooler_df.id == likes_df.id1) \
    .alias('h1') \
    .join(l2, F.col('h1.id2') == l2.id1) \
    .join(h2, F.col('l2.id1') == h2.id) \
    .where((F.col('h1.id') == F.col('l2.id2')) & (F.col('h1.name') < F.col('h2.name'))) \
    .select('h1.name', 'h2.name') \
    .show(truncate=False)

+---------+-------+
|name     |name   |
+---------+-------+
|Cassandra|Gabriel|
|Jessica  |Kyle   |
+---------+-------+



## Q4
Find all students who do not appear in the Likes table (as a student who likes or is liked) and return their names and grades. Sort by grade, then by name within each grade.

In [8]:
l2 = likes_df.alias('l2')

highschooler_df \
    .join(likes_df.alias('l1'), highschooler_df.id == likes_df.id1, 'full') \
    .join(l2, highschooler_df.id == l2.id2, 'full') \
    .where((F.col('l1.id1').isNull()) & (F.col('l2.id2').isNull())) \
    .select('name', 'grade') \
    .show(truncate=False)

+-------+-----+
|name   |grade|
+-------+-----+
|Tiffany|9    |
|Jordan |9    |
|Logan  |12   |
+-------+-----+



## Q5
For every situation where student A likes student B, but we have no information about whom B likes (that is, B does not appear as an ID1 in the Likes table), return A and B's names and grades.

In [9]:
h2 = highschooler_df.alias('h2')

highschooler_df \
    .alias('h1') \
    .join(likes_df, highschooler_df.id == likes_df.id1) \
    .join(h2, likes_df.id2 == h2.id) \
    .where(~F.col('h2.id').isin([row['id1'] for row in likes_df.collect()])) \
    .select('h1.name', 'h1.grade', 'h2.name', 'h2.grade') \
    .show(truncate=False)

+--------+-----+------+-----+
|name    |grade|name  |grade|
+--------+-----+------+-----+
|Brittany|10   |Kris  |10   |
|Alexis  |11   |Kris  |10   |
|Austin  |11   |Jordan|12   |
|John    |12   |Haley |10   |
+--------+-----+------+-----+



## Q6
Find names and grades of students who only have friends in the same grade. Return the result sorted by grade, then by name within each grade.

In [10]:
h2 = highschooler_df.alias('h2')
id_s = highschooler_df \
            .alias('h1') \
            .join(friend_df, highschooler_df.id == friend_df.id1) \
            .join(h2, friend_df.id2 == h2.id) \
            .where(F.col('h1.grade') != F.col('h2.grade')) \
            .collect()
            
highschooler_df \
    .where(~F.col('id').isin([row['id'] for row in id_s])) \
    .orderBy(F.col('grade').desc(), 'name') \
    .show(truncate=False)

+----+--------+-----+
|id  |name    |grade|
+----+--------+-----+
|1510|Jordan  |9    |
|1025|John    |12   |
|1661|Logan   |12   |
|1911|Gabriel |11   |
|1641|Brittany|10   |
|1101|Haley   |10   |
|1468|Kris    |10   |
+----+--------+-----+



## Q7
For each student A who likes a student B where the two are not friends, find if they have a friend C in common (who can introduce them!). For all such trios, return the name and grade of A, B, and C.

## Q8
Find the difference between the number of students in the school and the number of different first names.

In [11]:
highschooler_df \
    .count() - highschooler_df \
                    .select('name') \
                    .distinct() \
                    .count()

2

## Q9
Find the name and grade of all students who are liked by more than one other student.

In [12]:
highschooler_df \
    .join(likes_df, highschooler_df.id == likes_df.id2) \
    .groupBy('id', 'name', 'grade') \
    .agg(F.count('*').alias('count_schooler')) \
    .where(F.col('count_schooler') > 1) \
    .select('name', 'grade') \
    .show(truncate=False)

+---------+-----+
|name     |grade|
+---------+-----+
|Kris     |10   |
|Cassandra|9    |
+---------+-----+



# SQL Social-Network Query Exercises Extras

## Q1
For every situation where student A likes student B, but student B likes a different student C, return the names and grades of A, B, and C.

In [13]:
highschooler_df \
    .alias('h1') \
    .join(likes_df.alias('l1'), F.col('h1.id') == F.col('l1.id1')) \
    .join(highschooler_df.alias('h2'), F.col('h2.id') == F.col('l1.id2')) \
    .join(likes_df.alias('l2'), F.col('h2.id') == F.col('l2.id1')) \
    .join(highschooler_df.alias('h3'), F.col('h3.id') == F.col('l2.id2')) \
    .where(F.col('h1.id') != F.col('l2.id2')) \
    .select('h1.name', 'h1.grade', 'h2.name', 'h2.grade', 'h3.name', 'h3.grade') \
    .show()

+-------+-----+---------+-----+-------+-----+
|   name|grade|     name|grade|   name|grade|
+-------+-----+---------+-----+-------+-----+
| Andrew|   10|Cassandra|    9|Gabriel|    9|
|Gabriel|   11|   Alexis|   11|   Kris|   10|
+-------+-----+---------+-----+-------+-----+



## Q2
Find those students for whom all of their friends are in different grades from themselves. Return the students' names and grades.

## Q3
What is the average number of friends per student? (Your result should be just one number.)

In [14]:
friend_df \
    .groupBy('id1') \
    .agg(F.count('*')) \
    .groupBy() \
    .avg() \
    .show()

+-------------+
|avg(count(1))|
+-------------+
|          2.5|
+-------------+



## Q4
Find the number of students who are either friends with Cassandra or are friends of friends of Cassandra. Do not count Cassandra, even though technically she is a friend of a friend.

## Q5
Find the name and grade of the student(s) with the greatest number of friends.

In [15]:
max_friends = friend_df \
                .groupBy('id1') \
                .agg(F.count('*')) \
                .groupBy().max('count(1)') \
                .first() \
                ['max(count(1))']

highschooler_df \
    .join(friend_df, highschooler_df.id == friend_df.id1) \
    .groupBy('name', 'grade') \
    .count() \
    .where(F.col('count') == max_friends) \
    .show(truncate=False)

+------+-----+-----+
|name  |grade|count|
+------+-----+-----+
|Andrew|10   |4    |
|Alexis|11   |4    |
+------+-----+-----+

