<a href="https://colab.research.google.com/github/Kiran45181/Pyspark/blob/main/Advanced_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [36]:
# Imports first, then session setup, then the sample data.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, col, pandas_udf, rank, udf
from pyspark.sql.window import Window

# Spark entry point for the cells below.
spark = (
    SparkSession.builder
    .appName("PySpark Use Case Activities")
    .getOrCreate()
)

# Sample rows: (id, name, department, salary).
employees_data = [
    (1, "Alice", "HR", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Cathy", "HR", 3500),
    (4, "David", "IT", 4500),
    (5, "Eve", "Finance", 5000),
    (6, "Frank", "Finance", 4800),
]

employees_df = spark.createDataFrame(
    employees_data,
    schema=["id", "name", "department", "salary"],
)
employees_df.show()

+---+-----+----------+------+
| id| name|department|salary|
+---+-----+----------+------+
|  1|Alice|        HR|  3000|
|  2|  Bob|        IT|  4000|
|  3|Cathy|        HR|  3500|
|  4|David|        IT|  4500|
|  5|  Eve|   Finance|  5000|
|  6|Frank|   Finance|  4800|
+---+-----+----------+------+



In [37]:
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

# One window per department, rows ordered from highest to lowest salary.
window_spec = (
    Window
    .partitionBy("department")
    .orderBy(col("salary").desc())
)

# Keep every existing column and append the in-department salary rank.
ranked_df = employees_df.select("*", rank().over(window_spec).alias("rank"))

ranked_df.show()


+---+-----+----------+------+----+
| id| name|department|salary|rank|
+---+-----+----------+------+----+
|  5|  Eve|   Finance|  5000|   1|
|  6|Frank|   Finance|  4800|   2|
|  3|Cathy|        HR|  3500|   1|
|  1|Alice|        HR|  3000|   2|
|  4|David|        IT|  4500|   1|
|  2|  Bob|        IT|  4000|   2|
+---+-----+----------+------+----+



In [38]:
from pyspark.sql.functions import avg
from pyspark.sql.window import Window

# Per-department window with no ordering: the average is taken over the
# whole department rather than a running subset.
window_spec = Window.partitionBy("department")

# Keep every existing column and append the department's average salary.
avg_salary_df = employees_df.select(
    "*",
    avg("salary").over(window_spec).alias("avg_salary_per_dept"),
)

avg_salary_df.show()


+---+-----+----------+------+-------------------+
| id| name|department|salary|avg_salary_per_dept|
+---+-----+----------+------+-------------------+
|  5|  Eve|   Finance|  5000|             4900.0|
|  6|Frank|   Finance|  4800|             4900.0|
|  1|Alice|        HR|  3000|             3250.0|
|  3|Cathy|        HR|  3500|             3250.0|
|  2|  Bob|        IT|  4000|             4250.0|
|  4|David|        IT|  4500|             4250.0|
+---+-----+----------+------+-------------------+



## Practice

In [39]:
from pyspark.sql import SparkSession

# Employee sample rows: (name, dept, salary).
emp_data = [
    ("James", "Sales", 3000),
    ("Michael", "Sales", 4600),
    ("Robert", "Sales", 4100),
    ("Maria", "Finance", 3000),
    ("James", "Sales", 3900),
]
column = ["name", "dept", "salary"]

# getOrCreate() returns the session that is already running, if there is one.
spark = SparkSession.builder.appName("SparkByExamples").getOrCreate()

emp_df = spark.createDataFrame(emp_data, schema=column)
emp_df.show()

+-------+-------+------+
|   name|   dept|salary|
+-------+-------+------+
|  James|  Sales|  3000|
|Michael|  Sales|  4600|
| Robert|  Sales|  4100|
|  Maria|Finance|  3000|
|  James|  Sales|  3900|
+-------+-------+------+



In [40]:
# Department lookup table: (dept, dept_id).
from pyspark.sql import SparkSession

# getOrCreate() returns the session that is already running, if there is one.
spark = SparkSession.builder.appName("SparkByExamples").getOrCreate()

dept_columns = ["dept", "dept_id"]
dept_data = [
    ("Finance", 10),
    ("Marketing", 20),
    ("Sales", 30),
    ("IT", 40),
]

dept_df = spark.createDataFrame(dept_data, schema=dept_columns)
dept_df.show()


+---------+-------+
|     dept|dept_id|
+---------+-------+
|  Finance|     10|
|Marketing|     20|
|    Sales|     30|
|       IT|     40|
+---------+-------+



In [41]:
# Inner join on the shared "dept" column: only departments present in both
# DataFrames appear in the result.
emp_df.join(dept_df, on="dept", how="inner").show()

+-------+-------+------+-------+
|   dept|   name|salary|dept_id|
+-------+-------+------+-------+
|Finance|  Maria|  3000|     10|
|  Sales|  James|  3000|     30|
|  Sales|Michael|  4600|     30|
|  Sales| Robert|  4100|     30|
|  Sales|  James|  3900|     30|
+-------+-------+------+-------+



In [42]:
# Names and salaries, highest salary first. The column is referenced as
# "name", exactly as emp_df declares it, so the lookup does not depend on
# Spark's case-insensitive column resolution and the result header keeps the
# declared casing.
emp_df.select("name", "salary").orderBy(col("salary").desc()).show()

+-------+------+
|   Name|salary|
+-------+------+
|Michael|  4600|
| Robert|  4100|
|  James|  3900|
|  James|  3000|
|  Maria|  3000|
+-------+------+



In [43]:
# Keep only employees whose salary is strictly greater than 4000.
emp_df.where(col("salary") > 4000).show()

+-------+-----+------+
|   name| dept|salary|
+-------+-----+------+
|Michael|Sales|  4600|
| Robert|Sales|  4100|
+-------+-----+------+



In [44]:
# Keep every existing column and append "bonus" = 10% of salary.
emp_df.select("*", (col("salary") * 0.1).alias("bonus")).show()

+-------+-------+------+-----+
|   name|   dept|salary|bonus|
+-------+-------+------+-----+
|  James|  Sales|  3000|300.0|
|Michael|  Sales|  4600|460.0|
| Robert|  Sales|  4100|410.0|
|  Maria|Finance|  3000|300.0|
|  James|  Sales|  3900|390.0|
+-------+-------+------+-----+



In [45]:
from pyspark.sql import SparkSession

# Sample rows for the window-function practice: (id, name, department, salary).
columns = ["id", "name", "department", "salary"]
data = [
    (1, "Alice", "HR", 3000),
    (2, "Bob", "IT", 4000),
    (3, "Cathy", "HR", 3500),
    (4, "David", "IT", 4500),
    (5, "Eve", "Finance", 5000),
    (6, "Frank", "Finance", 4800),
]

# getOrCreate() returns the session that is already running, if there is one.
spark = SparkSession.builder.appName("WindowPractice").getOrCreate()

df = spark.createDataFrame(data, schema=columns)
df.show()

+---+-----+----------+------+
| id| name|department|salary|
+---+-----+----------+------+
|  1|Alice|        HR|  3000|
|  2|  Bob|        IT|  4000|
|  3|Cathy|        HR|  3500|
|  4|David|        IT|  4500|
|  5|  Eve|   Finance|  5000|
|  6|Frank|   Finance|  4800|
+---+-----+----------+------+



In [46]:
from pyspark.sql.functions import col, rank
from pyspark.sql.window import Window

# One window per department, rows ordered from highest to lowest salary.
window_spec = (
    Window
    .partitionBy("department")
    .orderBy(col("salary").desc())
)

# Keep every existing column and append the in-department salary rank.
df.select("*", rank().over(window_spec).alias("rank")).show()

+---+-----+----------+------+----+
| id| name|department|salary|rank|
+---+-----+----------+------+----+
|  5|  Eve|   Finance|  5000|   1|
|  6|Frank|   Finance|  4800|   2|
|  3|Cathy|        HR|  3500|   1|
|  1|Alice|        HR|  3000|   2|
|  4|David|        IT|  4500|   1|
|  2|  Bob|        IT|  4000|   2|
+---+-----+----------+------+----+



## Complex Nested Schema Handling

In [47]:
from pyspark.sql.functions import explode
from pyspark.sql import SparkSession

# getOrCreate() returns the session that is already running, if there is one.
spark = SparkSession.builder.appName("AdvancedOps").getOrCreate()

# Each person carries an array of skills; Mike's array is empty.
columns = ["Name", "Skills"]
data = [
    ("John", ["Python", "Java"]),
    ("Jane", ["SQL", "R", "Scala"]),
    ("Mike", []),
]

df = spark.createDataFrame(data, schema=columns)
df.show(truncate=False)

+----+---------------+
|Name|Skills         |
+----+---------------+
|John|[Python, Java] |
|Jane|[SQL, R, Scala]|
|Mike|[]             |
+----+---------------+



In [48]:
# One output row per array element; "Skills" now holds a single skill.
# explode() emits nothing for an empty array, so a person with no skills
# does not appear in the result.
df_explode = df.select("Name", explode("Skills").alias("Skills"))
df_explode.show()

+----+------+
|Name|Skills|
+----+------+
|John|Python|
|John|  Java|
|Jane|   SQL|
|Jane|     R|
|Jane| Scala|
+----+------+



In [53]:
# Register `df` with Spark SQL under the view name "people".
# NOTE(review): `df` is rebound to other data further down the notebook, so
# this must run while it still refers to the Name/Skills DataFrame.
df.createOrReplaceTempView(name="people")


In [54]:
# SQL counterpart of the DataFrame explode: LATERAL VIEW joins each row of
# `people` with the rows generated from its Skills array.
lateral_view_sql = """
    SELECT Name, Skill
    FROM people
    LATERAL VIEW explode(Skills) AS Skill
"""
df_lateral = spark.sql(lateral_view_sql)

df_lateral.show()


+----+------+
|Name| Skill|
+----+------+
|John|Python|
|John|  Java|
|Jane|   SQL|
|Jane|     R|
|Jane| Scala|
+----+------+



## Pivot

In [55]:
# Monthly sales in long format: (Product, Month, Sales).
columns = ["Product", "Month", "Sales"]
data = [
    ("ProductA", "Jan", 100),
    ("ProductA", "Feb", 150),
    ("ProductA", "Mar", 120),
    ("ProductB", "Jan", 200),
    ("ProductB", "Feb", 230),
    ("ProductB", "Mar", 210),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

+--------+-----+-----+
| Product|Month|Sales|
+--------+-----+-----+
|ProductA|  Jan|  100|
|ProductA|  Feb|  150|
|ProductA|  Mar|  120|
|ProductB|  Jan|  200|
|ProductB|  Feb|  230|
|ProductB|  Mar|  210|
+--------+-----+-----+



In [56]:
# Wide layout: one row per product, one summed-sales column per month.
sales_by_product = df.groupBy("Product")
pivot_df = sales_by_product.pivot("Month").sum("Sales")
pivot_df.show()

+--------+---+---+---+
| Product|Feb|Jan|Mar|
+--------+---+---+---+
|ProductB|230|200|210|
|ProductA|150|100|120|
+--------+---+---+---+



In [58]:
# The months are listed once, in calendar order, and drive both the pivot and
# the unpivot so the two steps cannot drift apart.
pivot_months = ["Jan", "Feb", "Mar"]

# First use pivot to create the wide format. Passing the values explicitly
# spares Spark the extra pass it otherwise runs to discover the distinct
# months, and fixes the column order to Jan, Feb, Mar.
wide_df = df.groupBy("Product").pivot("Month", pivot_months).sum("Sales")

# Then unpivot back to the long format. stack(n, label1, col1, ...) turns each
# wide row into n (Month, Sales) rows; the argument list is generated from
# pivot_months instead of being hard-coded. Backticks quote the column names.
stack_args = ", ".join(f"'{month}', `{month}`" for month in pivot_months)
unpivot_df = wide_df.selectExpr(
    "Product",
    f"stack({len(pivot_months)}, {stack_args}) as (Month, Sales)",
)

unpivot_df.show()

+--------+-----+-----+
| Product|Month|Sales|
+--------+-----+-----+
|ProductB|  Jan|  200|
|ProductB|  Feb|  230|
|ProductB|  Mar|  210|
|ProductA|  Jan|  100|
|ProductA|  Feb|  150|
|ProductA|  Mar|  120|
+--------+-----+-----+

