In [1]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


Create sample data:

```python
# Obtain the Spark session used by all join examples.
spark = (
    SparkSession.builder
    .appName("joins")
    .getOrCreate()
)

# Left-side rows as (id, name); ids 3 and 4 have no counterpart in data2.
data1 = [
    (1, "Manasa"),
    (2, "Mevin"),
    (3, "Pavithra"),
    (4, "Menakha"),
]

# Right-side rows as (id, dept); id 5 has no counterpart in data1.
data2 = [
    (1, "HR"),
    (2, "IT"),
    (5, "Finance"),
]

df1 = spark.createDataFrame(data1, schema=["id", "name"])
df2 = spark.createDataFrame(data2, schema=["id", "dept"])
```

---

18.1 INNER JOIN

# what it does

Returns only matching rows from both tables.

# pattern

Common keys only (intersection).

# important to remember

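Default join type in PySpark.
Non-matching rows are removed.
Joining on a column name ("id") keeps a single id column; joining on an expression (df1.id == df2.id) keeps the id column from both tables.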

# syntax

```python
df1.join(df2, df1.id == df2.id, "inner")
```

# example

```python
df1.join(df2, "id", "inner").show()
```

# output

```
+---+-------+----+
|id |name   |dept|
+---+-------+----+
|1  |Manasa |HR  |
|2  |Mevin  |IT  |
+---+-------+----+
```

---

18.2 CROSS JOIN

# what it does

Every row of df1 combines with every row of df2.

# pattern

Cartesian product.

# important

Very expensive operation.
Rows = m × n

# syntax

```python
df1.crossJoin(df2)
```

# example

```python
df1.crossJoin(df2).show()
```

# output (4 rows × 3 rows = 12 rows)

```
id name      id dept
1  Manasa    1  HR
1  Manasa    2  IT
1  Manasa    5  Finance
...
```

---

18.3 OUTER JOIN (FULL OUTER)

# what it does

Returns all rows from both tables.
Columns from the side with no match are filled with NULL.

# pattern

Union of the keys from both tables (left + right).

# syntax

```python
df1.join(df2, "id", "outer")
```

# example

```python
df1.join(df2, "id", "outer").show()
```

# output

```
+---+--------+--------+
|id |name    |dept    |
+---+--------+--------+
|1  |Manasa  |HR      |
|2  |Mevin   |IT      |
|3  |Pavithra|NULL    |
|4  |Menakha |NULL    |
|5  |NULL    |Finance |
+---+--------+--------+
```

---

18.4 LEFT JOIN

# what it does

All rows from the left table.
Matching rows from the right table; NULL where the right table has no match.

# pattern

The left table takes priority: every left row is kept.

# syntax

```python
df1.join(df2, "id", "left")
```

# example

```python
df1.join(df2, "id", "left").show()
```

# output

```
+---+--------+----+
|id |name    |dept|
+---+--------+----+
|1  |Manasa  |HR  |
|2  |Mevin   |IT  |
|3  |Pavithra|NULL|
|4  |Menakha |NULL|
+---+--------+----+
```

---

18.5 RIGHT JOIN

# what it does

All rows from the right table.
Matching rows from the left table; NULL where the left table has no match.

# syntax

```python
df1.join(df2, "id", "right")
```

# example

```python
df1.join(df2, "id", "right").show()
```

# output

```
+---+------+--------+
|id |name  |dept    |
+---+------+--------+
|1  |Manasa|HR      |
|2  |Mevin |IT      |
|5  |NULL  |Finance |
+---+------+--------+
```

---

18.6 LEFT SEMI JOIN

# what it does

Returns only the rows from the left table
for which a match exists in the right table.

# pattern

Like EXISTS in SQL.

# important

Returns only left columns.

# syntax

```python
df1.join(df2, "id", "left_semi")
```

# example

```python
df1.join(df2, "id", "left_semi").show()
```

# output

```
+---+-------+
|id |name   |
+---+-------+
|1  |Manasa |
|2  |Mevin  |
+---+-------+
```

---

18.7 LEFT ANTI JOIN

# what it does

Returns the rows from the left table
for which NO match exists in the right table.

# pattern

Like NOT EXISTS.

# important

Only left columns returned.

# syntax

```python
df1.join(df2, "id", "left_anti")
```

# example

```python
df1.join(df2, "id", "left_anti").show()
```

# output

```
+---+--------+
|id |name    |
+---+--------+
|3  |Pavithra|
|4  |Menakha |
+---+--------+
```

---

Quick Memory Trick

inner → only match
left → all left
right → all right
outer → everything
semi → match exists (left only)
anti → no match (left only)
cross → everything with everything




SyntaxError: invalid character '×' (U+00D7) (1112406838.py, line 78)

In [2]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col


# Obtain the Spark session used by all join cells in this notebook.
spark = (
    SparkSession.builder
    .appName("joins")
    .getOrCreate()
)

# Left-side rows as (id, name); ids 3 and 4 have no counterpart in data2.
data1 = [
    (1, "Manasa"),
    (2, "Mevin"),
    (3, "Pavithra"),
    (4, "Menakha"),
]

# Right-side rows as (id, dept); id 5 has no counterpart in data1.
data2 = [
    (1, "HR"),
    (2, "IT"),
    (5, "Finance"),
]

df1 = spark.createDataFrame(data1, schema=["id", "name"])
df2 = spark.createDataFrame(data2, schema=["id", "dept"])


In [7]:
# Inner join on a column expression: the result carries the id column of both frames.
df1.join(other=df2, on=(df1.id == df2.id), how="inner").show()
# Inner join on the column name: the result carries a single id column.
df1.join(other=df2, on="id", how="inner").show()

+---+------+---+----+
| id|  name| id|dept|
+---+------+---+----+
|  1|Manasa|  1|  HR|
|  2| Mevin|  2|  IT|
+---+------+---+----+

+---+------+----+
| id|  name|dept|
+---+------+----+
|  1|Manasa|  HR|
|  2| Mevin|  IT|
+---+------+----+



In [8]:
# Cross join: pair every df1 row with every df2 row (4 x 3 = 12 rows here).
df1.crossJoin(other=df2).show()

+---+--------+---+-------+
| id|    name| id|   dept|
+---+--------+---+-------+
|  1|  Manasa|  1|     HR|
|  2|   Mevin|  1|     HR|
|  1|  Manasa|  2|     IT|
|  1|  Manasa|  5|Finance|
|  2|   Mevin|  2|     IT|
|  2|   Mevin|  5|Finance|
|  3|Pavithra|  1|     HR|
|  4| Menakha|  1|     HR|
|  3|Pavithra|  2|     IT|
|  3|Pavithra|  5|Finance|
|  4| Menakha|  2|     IT|
|  4| Menakha|  5|Finance|
+---+--------+---+-------+



In [9]:
# Full outer join on id: keep rows from both frames; the side without a match shows NULL.
df1.join(other=df2, on="id", how="outer").show()

+---+--------+-------+
| id|    name|   dept|
+---+--------+-------+
|  1|  Manasa|     HR|
|  2|   Mevin|     IT|
|  3|Pavithra|   NULL|
|  4| Menakha|   NULL|
|  5|    NULL|Finance|
+---+--------+-------+



In [12]:
# Left join on id: keep every df1 row; dept is NULL where df2 has no matching id.
df1.join(other=df2, on="id", how="left").show()

+---+--------+----+
| id|    name|dept|
+---+--------+----+
|  1|  Manasa|  HR|
|  2|   Mevin|  IT|
|  3|Pavithra|NULL|
|  4| Menakha|NULL|
+---+--------+----+



In [11]:
# Right join on id: keep every df2 row; name is NULL where df1 has no matching id.
# Note: the displayed rows are not sorted by id; chain orderBy("id") before show()
# if a stable order is needed.
df1.join(other=df2, on="id", how="right").show()

+---+------+-------+
| id|  name|   dept|
+---+------+-------+
|  1|Manasa|     HR|
|  5|  NULL|Finance|
|  2| Mevin|     IT|
+---+------+-------+



In [14]:
# Left semi join on id (comparable to EXISTS in SQL):
# keep the df1 rows whose id also appears in df2, and return df1's columns only.
df1.join(other=df2, on="id", how="left_semi").show()

+---+------+
| id|  name|
+---+------+
|  1|Manasa|
|  2| Mevin|
+---+------+



In [15]:
# Left anti join on id (comparable to NOT EXISTS in SQL):
# keep the df1 rows whose id does not appear in df2, and return df1's columns only.
df1.join(other=df2, on="id", how="left_anti").show()

+---+--------+
| id|    name|
+---+--------+
|  3|Pavithra|
|  4| Menakha|
+---+--------+

