<a href="https://colab.research.google.com/github/Kiran45181/Pyspark/blob/main/Advanced%20Transformations%20and%20Actions%20With%20Pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Advanced Transformations and Actions With PySpark


In [1]:
from pyspark.sql import SparkSession
# Get (or create) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("Basics")
    .getOrCreate()
)

# Sample employee records; each tuple follows the order given in `columns`.
columns = ["Name", "Department", "Salary"]
data = [
    ("John", "Sales", 3000),
    ("Jane", "Finance", 4000),
    ("Mike", "Sales", 3500),
    ("Alice", "Finance", 3800),
    ("Bob", "IT", 4500),
]

# Build the employee DataFrame from the records above and print it.
df = spark.createDataFrame(data, schema=columns)
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [10]:
# Number of rows (employees) per department.
(
    df.groupBy("Department")
    .count()
    .show()
)

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|    2|
|   Finance|    2|
|        IT|    1|
+----------+-----+



In [14]:
# Average salary per department; the result column is named avg(Salary).
(
    df.groupBy("Department")
    .avg("Salary")
    .show()
)

+----------+-----------+
|Department|avg(Salary)|
+----------+-----------+
|     Sales|     3250.0|
|   Finance|     3900.0|
|        IT|     4500.0|
+----------+-----------+



In [8]:
from pyspark.sql import functions as F

# Salary statistics per department: average, highest and lowest salary,
# each given an explicit column name through alias().
# The whole chain sits inside parentheses, which lets it span several lines;
# ending a line with a backslash (\) is the other way to continue a statement
# onto the next line for readability.
(
    df.groupBy("Department")
    .agg(
        F.avg("Salary").alias("avg_salary"),
        F.max("Salary").alias("max_salary"),
        F.min("Salary").alias("min_salary"),
    )
    .show()
)

+----------+----------+----------+----------+
|Department|avg_salary|max_salary|min_salary|
+----------+----------+----------+----------+
|     Sales|    3250.0|      3500|      3000|
|   Finance|    3900.0|      4000|      3800|
|        IT|    4500.0|      4500|      4500|
+----------+----------+----------+----------+



In [9]:
# Raw inputs for the department lookup table (the DataFrame itself is built
# from these in a later cell).
# Column names, in the same order as the fields of each tuple below.
dept_columns = ["Department", "Location"]

# One (department, building) pair per department.
dept_data = [("Sales", "Building A"), ("Finance", "Building B"), ("IT", "Building C")]

In [12]:
# Print the employee DataFrame (columns: Name, Department, Salary).
df.show()

+-----+----------+------+
| Name|Department|Salary|
+-----+----------+------+
| John|     Sales|  3000|
| Jane|   Finance|  4000|
| Mike|     Sales|  3500|
|Alice|   Finance|  3800|
|  Bob|        IT|  4500|
+-----+----------+------+



In [11]:
# Build the department lookup DataFrame from dept_data / dept_columns.
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

# Inner join on Department: attach each employee's Location, keeping only
# rows whose Department appears in both DataFrames.
joined_df = df.join(
    dept_df,
    on="Department",
    how="inner",
)
joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|   Finance| Jane|  4000|Building B|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
|     Sales| John|  3000|Building A|
|     Sales| Mike|  3500|Building A|
+----------+-----+------+----------+



In [13]:
# Print the department lookup DataFrame (columns: Department, Location).
dept_df.show()

+----------+----------+
|Department|  Location|
+----------+----------+
|     Sales|Building A|
|   Finance|Building B|
|        IT|Building C|
+----------+----------+



In [15]:
# Rebuild the department lookup DataFrame from dept_data / dept_columns.
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

# Right join on Department: dept_df is the right-hand side, so every
# department row is kept and the employee columns from df are attached.
joined_df = df.join(
    dept_df,
    on="Department",
    how="right",
)
joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|     Sales| Mike|  3500|Building A|
|     Sales| John|  3000|Building A|
|   Finance|Alice|  3800|Building B|
|   Finance| Jane|  4000|Building B|
|        IT|  Bob|  4500|Building C|
+----------+-----+------+----------+



In [16]:
# Rebuild the department lookup DataFrame from dept_data / dept_columns.
dept_df = spark.createDataFrame(dept_data, schema=dept_columns)

# Left join on Department: df is the left-hand side, so every employee row
# is kept and the Location column from dept_df is attached.
joined_df = df.join(
    dept_df,
    on="Department",
    how="left",
)
joined_df.show()

+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|     Sales| John|  3000|Building A|
|   Finance| Jane|  4000|Building B|
|     Sales| Mike|  3500|Building A|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
+----------+-----+------+----------+



In [17]:
# Right join on Department: keep all departments from dept_df and add the
# employee details from df wherever a match exists.
right_join_df = df.join(
    dept_df,
    on="Department",
    how="right",
)
right_join_df.show()


+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|     Sales| Mike|  3500|Building A|
|     Sales| John|  3000|Building A|
|   Finance|Alice|  3800|Building B|
|   Finance| Jane|  4000|Building B|
|        IT|  Bob|  4500|Building C|
+----------+-----+------+----------+



In [18]:
# Full (outer) join on Department: combine employees and departments and
# keep rows from either side even when the other side has no match.
full_join_df = df.join(
    dept_df,
    on="Department",
    how="outer",
)
full_join_df.show()


+----------+-----+------+----------+
|Department| Name|Salary|  Location|
+----------+-----+------+----------+
|   Finance| Jane|  4000|Building B|
|   Finance|Alice|  3800|Building B|
|        IT|  Bob|  4500|Building C|
|     Sales| John|  3000|Building A|
|     Sales| Mike|  3500|Building A|
+----------+-----+------+----------+



In [19]:
# Employee DataFrame: one (EmpID, Name, Department, Salary) tuple per person.
# HR and Support do not appear in the department list below.
emp_cols = ["EmpID", "Name", "Department", "Salary"]
emp_data = [
    (1, "John", "Sales", 3000),
    (2, "Jane", "Finance", 4000),
    (3, "Mike", "Sales", 3500),
    (4, "Alice", "HR", 3800),
    (5, "Bob", "IT", 4500),
    (6, "Sam", "Support", 3200),
]
emp_df = spark.createDataFrame(emp_data, schema=emp_cols)

# Department DataFrame: one (Department, Location) tuple per department.
# Admin has no employee in the list above.
# NOTE: this rebinds dept_data and dept_df, replacing the three-department
# versions defined earlier in the notebook.
dept_cols = ["Department", "Location"]
dept_data = [
    ("Sales", "Building A"),
    ("Finance", "Building B"),
    ("IT", "Building C"),
    ("Admin", "Building D"),
]
dept_df = spark.createDataFrame(dept_data, schema=dept_cols)

# Display both: employees first, then departments.
for frame in (emp_df, dept_df):
    frame.show()

+-----+-----+----------+------+
|EmpID| Name|Department|Salary|
+-----+-----+----------+------+
|    1| John|     Sales|  3000|
|    2| Jane|   Finance|  4000|
|    3| Mike|     Sales|  3500|
|    4|Alice|        HR|  3800|
|    5|  Bob|        IT|  4500|
|    6|  Sam|   Support|  3200|
+-----+-----+----------+------+

+----------+----------+
|Department|  Location|
+----------+----------+
|     Sales|Building A|
|   Finance|Building B|
|        IT|Building C|
|     Admin|Building D|
+----------+----------+



In [20]:
# INNER JOIN: keep only the rows whose Department is present in both
# emp_df and dept_df.
inner_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="inner",
)
inner_join_df.show()


+----------+-----+----+------+----------+
|Department|EmpID|Name|Salary|  Location|
+----------+-----+----+------+----------+
|   Finance|    2|Jane|  4000|Building B|
|        IT|    5| Bob|  4500|Building C|
|     Sales|    1|John|  3000|Building A|
|     Sales|    3|Mike|  3500|Building A|
+----------+-----+----+------+----------+



In [21]:
# LEFT JOIN: Return all employees, match department if available
# Employees without a matching row in dept_df get NULL in Location.
left_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="left",
)
left_join_df.show()


+----------+-----+-----+------+----------+
|Department|EmpID| Name|Salary|  Location|
+----------+-----+-----+------+----------+
|     Sales|    1| John|  3000|Building A|
|     Sales|    3| Mike|  3500|Building A|
|   Finance|    2| Jane|  4000|Building B|
|        HR|    4|Alice|  3800|      NULL|
|        IT|    5|  Bob|  4500|Building C|
|   Support|    6|  Sam|  3200|      NULL|
+----------+-----+-----+------+----------+



In [22]:
# RIGHT JOIN: keep every department from dept_df and attach employees where
# a match exists; departments without employees get NULL employee columns.
right_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="right",
)
right_join_df.show()


+----------+-----+----+------+----------+
|Department|EmpID|Name|Salary|  Location|
+----------+-----+----+------+----------+
|     Sales|    3|Mike|  3500|Building A|
|     Sales|    1|John|  3000|Building A|
|   Finance|    2|Jane|  4000|Building B|
|     Admin| NULL|NULL|  NULL|Building D|
|        IT|    5| Bob|  4500|Building C|
+----------+-----+----+------+----------+



In [23]:
# FULL JOIN: keep all rows from both emp_df and dept_df, pairing them up
# where the Department matches and filling the missing side with NULL.
full_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="outer",
)
full_join_df.show()


+----------+-----+-----+------+----------+
|Department|EmpID| Name|Salary|  Location|
+----------+-----+-----+------+----------+
|     Admin| NULL| NULL|  NULL|Building D|
|   Finance|    2| Jane|  4000|Building B|
|        HR|    4|Alice|  3800|      NULL|
|        IT|    5|  Bob|  4500|Building C|
|     Sales|    1| John|  3000|Building A|
|     Sales|    3| Mike|  3500|Building A|
|   Support|    6|  Sam|  3200|      NULL|
+----------+-----+-----+------+----------+



In [24]:
# LEFT SEMI JOIN: keep the employees whose Department exists in dept_df.
# Only emp_df columns are returned; nothing from dept_df is added.
semi_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="left_semi",
)
semi_join_df.show()


+----------+-----+----+------+
|Department|EmpID|Name|Salary|
+----------+-----+----+------+
|   Finance|    2|Jane|  4000|
|        IT|    5| Bob|  4500|
|     Sales|    1|John|  3000|
|     Sales|    3|Mike|  3500|
+----------+-----+----+------+



In [25]:
# LEFT ANTI JOIN: keep the employees whose Department does NOT exist in
# dept_df (the complement of the left semi join).
anti_join_df = emp_df.join(
    dept_df,
    on="Department",
    how="left_anti",
)
anti_join_df.show()


+----------+-----+-----+------+
|Department|EmpID| Name|Salary|
+----------+-----+-----+------+
|        HR|    4|Alice|  3800|
|   Support|    6|  Sam|  3200|
+----------+-----+-----+------+

