<a href="https://colab.research.google.com/github/Maks2811/test_nagaev/blob/main/Pyspark_SQL_JOIN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 317.0/317.0 MB 3.2 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... done
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488490 sha256=44065efa04923ef36f1e7a7c7b5b3c6f1d8325fd3e7ffadc63a8f19c91c67b25
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession

# Obtain the SparkSession used by every join example below.
spark = (
    SparkSession.builder
    .appName("JoinExamples")
    .getOrCreate()
)

# Sample rows for the `people` DataFrame: (name, age, department_id).
people_columns = ["name", "age", "department_id"]
people_data = [
    ("John", 30, 1),
    ("Doe", 25, 2),
    ("Jane", 35, 1),
    ("Mark", 40, 2),
    ("Smith", 23, 3),
]
people_df = spark.createDataFrame(people_data, people_columns)

# Sample rows for the `departments` DataFrame: (id, department_name).
# Department 4 ("Marketing") is referenced by no person, which is what makes
# the outer-join examples produce unmatched rows.
departments_columns = ["id", "department_name"]
departments_data = [
    (1, "HR"),
    (2, "Finance"),
    (3, "Engineering"),
    (4, "Marketing"),
]
departments_df = spark.createDataFrame(departments_data, departments_columns)

# Display both input tables.
people_df.show()
departments_df.show()

+-----+---+-------------+
| name|age|department_id|
+-----+---+-------------+
| John| 30|            1|
|  Doe| 25|            2|
| Jane| 35|            1|
| Mark| 40|            2|
|Smith| 23|            3|
+-----+---+-------------+

+---+---------------+
| id|department_name|
+---+---------------+
|  1|             HR|
|  2|        Finance|
|  3|    Engineering|
|  4|      Marketing|
+---+---------------+



In [4]:
# Inner join: keep only the rows where a person's department_id matches a
# department id.
inner_join_df = people_df.join(
    departments_df,
    on=people_df["department_id"] == departments_df["id"],
    how="inner",
)
inner_join_df.show()

+-----+---+-------------+---+---------------+
| name|age|department_id| id|department_name|
+-----+---+-------------+---+---------------+
| John| 30|            1|  1|             HR|
| Jane| 35|            1|  1|             HR|
|  Doe| 25|            2|  2|        Finance|
| Mark| 40|            2|  2|        Finance|
|Smith| 23|            3|  3|    Engineering|
+-----+---+-------------+---+---------------+



In [5]:
# Left outer join: keep every row of people_df, attaching the matching
# department where one exists. With this sample data every person has a
# matching department, so no NULLs appear on the department side.
left_outer_join_df = people_df.join(
    departments_df,
    on=people_df["department_id"] == departments_df["id"],
    how="left_outer",
)
left_outer_join_df.show()

+-----+---+-------------+---+---------------+
| name|age|department_id| id|department_name|
+-----+---+-------------+---+---------------+
| John| 30|            1|  1|             HR|
|  Doe| 25|            2|  2|        Finance|
| Jane| 35|            1|  1|             HR|
|Smith| 23|            3|  3|    Engineering|
| Mark| 40|            2|  2|        Finance|
+-----+---+-------------+---+---------------+



In [6]:
# Full outer join: keep all rows from both sides. Departments without people
# (here id 4, "Marketing") appear with NULLs in the people columns.
full_outer_join_df = people_df.join(
    departments_df,
    on=people_df["department_id"] == departments_df["id"],
    how="full_outer",
)
full_outer_join_df.show()

+-----+----+-------------+---+---------------+
| name| age|department_id| id|department_name|
+-----+----+-------------+---+---------------+
| John|  30|            1|  1|             HR|
| Jane|  35|            1|  1|             HR|
|  Doe|  25|            2|  2|        Finance|
| Mark|  40|            2|  2|        Finance|
|Smith|  23|            3|  3|    Engineering|
| NULL|NULL|         NULL|  4|      Marketing|
+-----+----+-------------+---+---------------+



In [12]:
# Cross join (Cartesian product): pair every person with every department,
# with no join condition. 5 people x 4 departments gives 20 rows.
cross_join_df = people_df.crossJoin(other=departments_df)
cross_join_df.show()

+-----+---+-------------+---+---------------+
| name|age|department_id| id|department_name|
+-----+---+-------------+---+---------------+
| John| 30|            1|  1|             HR|
| John| 30|            1|  2|        Finance|
|  Doe| 25|            2|  1|             HR|
|  Doe| 25|            2|  2|        Finance|
| John| 30|            1|  3|    Engineering|
| John| 30|            1|  4|      Marketing|
|  Doe| 25|            2|  3|    Engineering|
|  Doe| 25|            2|  4|      Marketing|
| Jane| 35|            1|  1|             HR|
| Jane| 35|            1|  2|        Finance|
| Mark| 40|            2|  1|             HR|
| Mark| 40|            2|  2|        Finance|
|Smith| 23|            3|  1|             HR|
|Smith| 23|            3|  2|        Finance|
| Jane| 35|            1|  3|    Engineering|
| Jane| 35|            1|  4|      Marketing|
| Mark| 40|            2|  3|    Engineering|
| Mark| 40|            2|  4|      Marketing|
|Smith| 23|            3|  3|    E