## PySpark 설치

In [1]:
!pip install pyspark==3.3.1 py4j==0.10.9.5 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark==3.3.1
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m281.4/281.4 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.7/199.7 KB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=26da422daebe2a300684f7fbcb6760a9634728e3d09e10db7c70248ecb068537
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark SQL #1") \
    .getOrCreate()

## 조인 실습 테이블 2개 로딩

In [3]:
vital = [
     { 'UserID': 100, 'VitalID': 1, 'Date': '2020-01-01', 'Weight': 75 },
     { 'UserID': 100, 'VitalID': 2, 'Date': '2020-01-02', 'Weight': 78 },
     { 'UserID': 101, 'VitalID': 3, 'Date': '2020-01-01', 'Weight': 90 },
     { 'UserID': 101, 'VitalID': 4, 'Date': '2020-01-02', 'Weight': 95 },
]

alert = [
    { 'AlertID': 1, 'VitalID': 4, 'AlertType': 'WeightIncrease', 'Date': '2020-01-01', 'UserID': 101},
    { 'AlertID': 2, 'VitalID': None, 'AlertType': 'MissingVital', 'Date': '2020-01-04', 'UserID': 100},
    { 'AlertID': 3, 'VitalID': None, 'AlertType': 'MissingVital', 'Date': '2020-01-05', 'UserID': 101}
]

In [4]:
rdd_vital = spark.sparkContext.parallelize(vital)
rdd_alert = spark.sparkContext.parallelize(alert)

In [5]:
df_vital = rdd_vital.toDF()
df_alert = rdd_alert.toDF()

In [6]:
df_vital.printSchema()

root
 |-- Date: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- VitalID: long (nullable = true)
 |-- Weight: long (nullable = true)



In [7]:
df_alert.printSchema()

root
 |-- AlertID: long (nullable = true)
 |-- AlertType: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- UserID: long (nullable = true)
 |-- VitalID: long (nullable = true)



## JOIN by DataFrame

In [8]:
# INNER JOIN
join_expr = df_vital.VitalID == df_alert.VitalID
df_vital.join(df_alert, join_expr, "inner").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [9]:
# LEFT JOIN
join_expr = df_vital.VitalID == df_alert.VitalID
df_vital.join(df_alert, join_expr, "left").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|   null|          null|      null|  null|   null|
|2020-01-02|   100|      2|    78|   null|          null|      null|  null|   null|
|2020-01-01|   101|      3|    90|   null|          null|      null|  null|   null|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [10]:
# RIGHT JOIN
join_expr = df_vital.VitalID == df_alert.VitalID
df_vital.join(df_alert, join_expr, "right").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|      null|  null|   null|  null|      2|  MissingVital|2020-01-04|   100|   null|
|      null|  null|   null|  null|      3|  MissingVital|2020-01-05|   101|   null|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [11]:
# FULL OUTER JOIN
join_expr = df_vital.VitalID == df_alert.VitalID
df_vital.join(df_alert, join_expr, "full").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|      null|  null|   null|  null|      2|  MissingVital|2020-01-04|   100|   null|
|      null|  null|   null|  null|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-01|   100|      1|    75|   null|          null|      null|  null|   null|
|2020-01-02|   100|      2|    78|   null|          null|      null|  null|   null|
|2020-01-01|   101|      3|    90|   null|          null|      null|  null|   null|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [12]:
# CROSS JOIN
df_vital.join(df_alert, None, "cross").show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   100|      2|    78|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   100|      1|    75|      2|  MissingVital|2020-01-04|   100|   null|
|2020-01-01|   100|      1|    75|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-02|   100|      2|    78|      2|  MissingVital|2020-01-04|   100|   null|
|2020-01-02|   100|      2|    78|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-01|   101|      3|    90|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   101|      3|    90|      2|  MissingVital|2020-01-04|   100| 

In [13]:
# SELF JOIN
join_expr = df_vital.VitalID == df_vital.VitalID
df_vital.join(df_vital, join_expr, "left").show()

+----------+------+-------+------+----------+------+-------+------+
|      Date|UserID|VitalID|Weight|      Date|UserID|VitalID|Weight|
+----------+------+-------+------+----------+------+-------+------+
|2020-01-01|   100|      1|    75|2020-01-01|   100|      1|    75|
|2020-01-02|   100|      2|    78|2020-01-02|   100|      2|    78|
|2020-01-01|   101|      3|    90|2020-01-01|   101|      3|    90|
|2020-01-02|   101|      4|    95|2020-01-02|   101|      4|    95|
+----------+------+-------+------+----------+------+-------+------+



## JOIN by SQL

In [14]:
df_vital.createOrReplaceTempView("Vital")
df_alert.createOrReplaceTempView("Alert")

In [15]:
# INNER JOIN
df_inner_join = spark.sql("""SELECT * FROM Vital v
JOIN Alert a ON v.vitalID = a.vitalID;""")
df_inner_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [16]:
# LEFT JOIN
df_left_join = spark.sql("""SELECT * FROM Vital v
LEFT JOIN Alert a ON v.vitalID = a.vitalID;""")
df_left_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|   null|          null|      null|  null|   null|
|2020-01-02|   100|      2|    78|   null|          null|      null|  null|   null|
|2020-01-01|   101|      3|    90|   null|          null|      null|  null|   null|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [17]:
# RIGHT JOIN
df_right_join = spark.sql("""SELECT * FROM Vital v
RIGHT JOIN Alert a ON v.vitalID = a.vitalID;""")
df_right_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|      null|  null|   null|  null|      2|  MissingVital|2020-01-04|   100|   null|
|      null|  null|   null|  null|      3|  MissingVital|2020-01-05|   101|   null|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [18]:
# OUTER JOIN
df_outer_join = spark.sql("""SELECT * FROM Vital v
FULL JOIN Alert a ON v.vitalID = a.vitalID;""")
df_outer_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|      null|  null|   null|  null|      2|  MissingVital|2020-01-04|   100|   null|
|      null|  null|   null|  null|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-01|   100|      1|    75|   null|          null|      null|  null|   null|
|2020-01-02|   100|      2|    78|   null|          null|      null|  null|   null|
|2020-01-01|   101|      3|    90|   null|          null|      null|  null|   null|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
+----------+------+-------+------+-------+--------------+----------+------+-------+



In [19]:
# CROSS JOIN
df_cross_join = spark.sql("""SELECT * FROM Vital v
CROSS JOIN Alert a""")
df_cross_join.show()

+----------+------+-------+------+-------+--------------+----------+------+-------+
|      Date|UserID|VitalID|Weight|AlertID|     AlertType|      Date|UserID|VitalID|
+----------+------+-------+------+-------+--------------+----------+------+-------+
|2020-01-01|   100|      1|    75|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   100|      2|    78|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   100|      1|    75|      2|  MissingVital|2020-01-04|   100|   null|
|2020-01-01|   100|      1|    75|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-02|   100|      2|    78|      2|  MissingVital|2020-01-04|   100|   null|
|2020-01-02|   100|      2|    78|      3|  MissingVital|2020-01-05|   101|   null|
|2020-01-01|   101|      3|    90|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-02|   101|      4|    95|      1|WeightIncrease|2020-01-01|   101|      4|
|2020-01-01|   101|      3|    90|      2|  MissingVital|2020-01-04|   100| 

In [20]:
# self JOIN
df_self_join = spark.sql("""SELECT * FROM Vital v1
JOIN Vital v2""")
df_self_join.show()

+----------+------+-------+------+----------+------+-------+------+
|      Date|UserID|VitalID|Weight|      Date|UserID|VitalID|Weight|
+----------+------+-------+------+----------+------+-------+------+
|2020-01-01|   100|      1|    75|2020-01-01|   100|      1|    75|
|2020-01-01|   100|      1|    75|2020-01-02|   100|      2|    78|
|2020-01-02|   100|      2|    78|2020-01-01|   100|      1|    75|
|2020-01-02|   100|      2|    78|2020-01-02|   100|      2|    78|
|2020-01-01|   100|      1|    75|2020-01-01|   101|      3|    90|
|2020-01-01|   100|      1|    75|2020-01-02|   101|      4|    95|
|2020-01-02|   100|      2|    78|2020-01-01|   101|      3|    90|
|2020-01-02|   100|      2|    78|2020-01-02|   101|      4|    95|
|2020-01-01|   101|      3|    90|2020-01-01|   100|      1|    75|
|2020-01-01|   101|      3|    90|2020-01-02|   100|      2|    78|
|2020-01-02|   101|      4|    95|2020-01-01|   100|      1|    75|
|2020-01-02|   101|      4|    95|2020-01-02|   