In [0]:
# Column names for the employee records, in the same order as the tuple fields.
columns = ["emp_id", "name", "dept", "city", "salary"]

# Sample employees: one tuple per row.
data = [
    (1, "Alice", "IT", "Delhi", 70000),
    (2, "Bob", "HR", "Mumbai", 50000),
    (3, "Charlie", "IT", "Delhi", 80000),
    (4, "David", "Finance", "Chennai", 90000),
    (5, "Eva", "HR", "Delhi", 55000),
    (6, "Frank", "IT", "Mumbai", 75000),
    (7, "Grace", "Finance", "Delhi", 95000),
    (8, "Henry", "IT", "Chennai", 72000),
    (9, "Ivy", "HR", "Mumbai", 48000),
    (10, "Jack", "Finance", "Mumbai", 88000),
]

# Only names are supplied as the schema, so the column types come from the values.
df = spark.createDataFrame(data, schema=columns)


In [0]:
# Print the employee DataFrame as a text table to confirm it was built as expected.
df.show()

+------+-------+-------+-------+------+
|emp_id|   name|   dept|   city|salary|
+------+-------+-------+-------+------+
|     1|  Alice|     IT|  Delhi| 70000|
|     2|    Bob|     HR| Mumbai| 50000|
|     3|Charlie|     IT|  Delhi| 80000|
|     4|  David|Finance|Chennai| 90000|
|     5|    Eva|     HR|  Delhi| 55000|
|     6|  Frank|     IT| Mumbai| 75000|
|     7|  Grace|Finance|  Delhi| 95000|
|     8|  Henry|     IT|Chennai| 72000|
|     9|    Ivy|     HR| Mumbai| 48000|
|    10|   Jack|Finance| Mumbai| 88000|
+------+-------+-------+-------+------+



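Narrow transformations

Each output partition is computed from a single input partition, so these operations (select, filter, withColumn) do not need a shuffle.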

In [0]:
# Project the employee frame down to the identifier, department and salary columns.
df_select = df.select(["emp_id", "dept", "salary"])
df_select.show()

+------+-------+------+
|emp_id|   dept|salary|
+------+-------+------+
|     1|     IT| 70000|
|     2|     HR| 50000|
|     3|     IT| 80000|
|     4|Finance| 90000|
|     5|     HR| 55000|
|     6|     IT| 75000|
|     7|Finance| 95000|
|     8|     IT| 72000|
|     9|     HR| 48000|
|    10|Finance| 88000|
+------+-------+------+



In [0]:
# Keep only the rows whose department is IT.
df_filter = df.filter(df["dept"] == "IT")
df_filter.show()


+------+-------+----+-------+------+
|emp_id|   name|dept|   city|salary|
+------+-------+----+-------+------+
|     1|  Alice|  IT|  Delhi| 70000|
|     3|Charlie|  IT|  Delhi| 80000|
|     6|  Frank|  IT| Mumbai| 75000|
|     8|  Henry|  IT|Chennai| 72000|
+------+-------+----+-------+------+



In [0]:
from pyspark.sql.functions import col

# Append a "bonus" column worth 10% of each employee's own salary.
df_bonus = df.withColumn(
    "bonus",
    col("salary") * 0.10,
)
df_bonus.show()

+------+-------+-------+-------+------+------+
|emp_id|   name|   dept|   city|salary| bonus|
+------+-------+-------+-------+------+------+
|     1|  Alice|     IT|  Delhi| 70000|7000.0|
|     2|    Bob|     HR| Mumbai| 50000|5000.0|
|     3|Charlie|     IT|  Delhi| 80000|8000.0|
|     4|  David|Finance|Chennai| 90000|9000.0|
|     5|    Eva|     HR|  Delhi| 55000|5500.0|
|     6|  Frank|     IT| Mumbai| 75000|7500.0|
|     7|  Grace|Finance|  Delhi| 95000|9500.0|
|     8|  Henry|     IT|Chennai| 72000|7200.0|
|     9|    Ivy|     HR| Mumbai| 48000|4800.0|
|    10|   Jack|Finance| Mumbai| 88000|8800.0|
+------+-------+-------+-------+------+------+



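Wide transformations

These operations (groupBy, orderBy) combine rows from many input partitions, so Spark has to shuffle data across partitions. Note that the filter and sortWithinPartitions cells at the end of this section do not shuffle.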

In [0]:
from pyspark.sql.functions import avg

# Average salary per department; the aggregate is exposed as "avg_salary".
df_group = df.groupBy("dept").agg(avg(df["salary"]).alias("avg_salary"))
df_group.show()

+-------+----------+
|   dept|avg_salary|
+-------+----------+
|     IT|   74250.0|
|     HR|   51000.0|
|Finance|   91000.0|
+-------+----------+



In [0]:
# Order all employees from the highest salary to the lowest.
df_sorted = df.orderBy("salary", ascending=False)
df_sorted.show()

+------+-------+-------+-------+------+
|emp_id|   name|   dept|   city|salary|
+------+-------+-------+-------+------+
|     7|  Grace|Finance|  Delhi| 95000|
|     4|  David|Finance|Chennai| 90000|
|    10|   Jack|Finance| Mumbai| 88000|
|     3|Charlie|     IT|  Delhi| 80000|
|     6|  Frank|     IT| Mumbai| 75000|
|     8|  Henry|     IT|Chennai| 72000|
|     1|  Alice|     IT|  Delhi| 70000|
|     5|    Eva|     HR|  Delhi| 55000|
|     2|    Bob|     HR| Mumbai| 50000|
|     9|    Ivy|     HR| Mumbai| 48000|
+------+-------+-------+-------+------+



In [0]:
# Keep only the rows whose department is HR.
a = df.filter(df["dept"] == "HR")
a.show()

+------+----+----+------+------+
|emp_id|name|dept|  city|salary|
+------+----+----+------+------+
|     2| Bob|  HR|Mumbai| 50000|
|     5| Eva|  HR| Delhi| 55000|
|     9| Ivy|  HR|Mumbai| 48000|
+------+----+----+------+------+



In [0]:
# Sort by salary inside each partition, then keep only the HR rows and print them.
(
    df.sortWithinPartitions("salary")
    .filter(df["dept"] == "HR")
    .show()
)


+------+----+----+------+------+
|emp_id|name|dept|  city|salary|
+------+----+----+------+------+
|     9| Ivy|  HR|Mumbai| 48000|
|     2| Bob|  HR|Mumbai| 50000|
|     5| Eva|  HR| Delhi| 55000|
+------+----+----+------+------+



Joins

In [0]:
# Column names for the department lookup table.
dept_columns = ["dept", "dept_name"]

# One (code, full name) pair per department.
dept_data = [
    ("IT", "Technology"),
    ("HR", "Human Resources"),
    ("Finance", "Finance & Accounts"),
]

df_dept = spark.createDataFrame(dept_data, schema=dept_columns)
df_dept.show()


+-------+------------------+
|   dept|         dept_name|
+-------+------------------+
|     IT|        Technology|
|     HR|   Human Resources|
|Finance|Finance & Accounts|
+-------+------------------+



In [0]:
# Inner-join employees to the department lookup on the shared "dept" column.
df_join = df.join(df_dept, on=["dept"], how="inner")
df_join.show()


+-------+------+-------+-------+------+------------------+
|   dept|emp_id|   name|   city|salary|         dept_name|
+-------+------+-------+-------+------+------------------+
|     IT|     1|  Alice|  Delhi| 70000|        Technology|
|     HR|     2|    Bob| Mumbai| 50000|   Human Resources|
|     IT|     3|Charlie|  Delhi| 80000|        Technology|
|Finance|     4|  David|Chennai| 90000|Finance & Accounts|
|     HR|     5|    Eva|  Delhi| 55000|   Human Resources|
|     IT|     6|  Frank| Mumbai| 75000|        Technology|
|Finance|     7|  Grace|  Delhi| 95000|Finance & Accounts|
|     IT|     8|  Henry|Chennai| 72000|        Technology|
|     HR|     9|    Ivy| Mumbai| 48000|   Human Resources|
|Finance|    10|   Jack| Mumbai| 88000|Finance & Accounts|
+-------+------+-------+-------+------+------------------+

