<a href="https://colab.research.google.com/github/Kiran45181/Pyspark/blob/main/Advanced_DataFrame_operations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Advanced DataFrame Operations

## Advanced PySpark (Window Functions)

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the SparkSession for this notebook (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("AdvancedOps")
    .getOrCreate()
)

# One tuple per person: (id, name, salary, subjects, address).
# `subjects` is a list of strings; `address` is a dict with "city" and "zip" keys.
data = [
    (1, "Alice", 2000, ["math", "science"], {"city": "NYC", "zip": "10001"}),
    (2, "Bob", 1500, ["english"], {"city": "SF", "zip": "94105"}),
    (3, "Charlie", 2200, ["math", "history", "science"], {"city": "NYC", "zip": "10001"}),
    (4, "David", 1200, ["art"], {"city": "LA", "zip": "90001"}),
]

df = spark.createDataFrame(
    data,
    schema=["id", "name", "salary", "subjects", "address"],
)
# truncate=False prints the list and map columns without cutting them off.
df.show(truncate=False)

+---+-------+------+------------------------+---------------------------+
|id |name   |salary|subjects                |address                    |
+---+-------+------+------------------------+---------------------------+
|1  |Alice  |2000  |[math, science]         |{zip -> 10001, city -> NYC}|
|2  |Bob    |1500  |[english]               |{zip -> 94105, city -> SF} |
|3  |Charlie|2200  |[math, history, science]|{zip -> 10001, city -> NYC}|
|4  |David  |1200  |[art]                   |{zip -> 90001, city -> LA} |
+---+-------+------+------------------------+---------------------------+



In [15]:
# Window specifications come from pyspark.sql.window.
from pyspark.sql.window import Window

# Rank people by ascending salary within each city.
# "address.city" reads the "city" key of the `address` map column.
window_spec = Window.partitionBy("address.city").orderBy("salary")
(
    df
    .withColumn("rank", rank().over(window_spec))
    .show()
)


+---+-------+------+--------------------+--------------------+----+
| id|   name|salary|            subjects|             address|rank|
+---+-------+------+--------------------+--------------------+----+
|  4|  David|  1200|               [art]|{zip -> 90001, ci...|   1|
|  1|  Alice|  2000|     [math, science]|{zip -> 10001, ci...|   1|
|  3|Charlie|  2200|[math, history, s...|{zip -> 10001, ci...|   2|
|  2|    Bob|  1500|           [english]|{zip -> 94105, ci...|   1|
+---+-------+------+--------------------+--------------------+----+



In [16]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, rank, dense_rank, max, sum, avg

# Employee data: one tuple per employee, in the order given by `columns`.
columns = ["EmpID", "Name", "Department", "Salary"]
data = [
    (1, "John", "Sales", 3000),
    (2, "Jane", "Finance", 4000),
    (3, "Mike", "Sales", 3500),
    (4, "Alice", "Finance", 3800),
    (5, "Bob", "IT", 4500),
    (6, "Tom", "Sales", 3700),
    (7, "Jerry", "Finance", 4200),
    (8, "Sam", "IT", 4700),
    (9, "Steve", "Sales", 3100),
    (10, "Rachel", "IT", 4600),
]

# NOTE: this rebinds `df`; from here on it refers to the employee table.
df = spark.createDataFrame(data, schema=columns)

df.show()

+-----+------+----------+------+
|EmpID|  Name|Department|Salary|
+-----+------+----------+------+
|    1|  John|     Sales|  3000|
|    2|  Jane|   Finance|  4000|
|    3|  Mike|     Sales|  3500|
|    4| Alice|   Finance|  3800|
|    5|   Bob|        IT|  4500|
|    6|   Tom|     Sales|  3700|
|    7| Jerry|   Finance|  4200|
|    8|   Sam|        IT|  4700|
|    9| Steve|     Sales|  3100|
|   10|Rachel|        IT|  4600|
+-----+------+----------+------+



In [17]:
# Rank employees inside each department, highest salary first.
window_spec = (
    Window
    .partitionBy("Department")
    .orderBy(col("Salary").desc())
)

(
    df
    .withColumn("Rank", rank().over(window_spec))
    .show()
)

+-----+------+----------+------+----+
|EmpID|  Name|Department|Salary|Rank|
+-----+------+----------+------+----+
|    7| Jerry|   Finance|  4200|   1|
|    2|  Jane|   Finance|  4000|   2|
|    4| Alice|   Finance|  3800|   3|
|    8|   Sam|        IT|  4700|   1|
|   10|Rachel|        IT|  4600|   2|
|    5|   Bob|        IT|  4500|   3|
|    6|   Tom|     Sales|  3700|   1|
|    3|  Mike|     Sales|  3500|   2|
|    9| Steve|     Sales|  3100|   3|
|    1|  John|     Sales|  3000|   4|
+-----+------+----------+------+----+



In [18]:
# Maximum salary of the department, shown next to every employee.
#
# An ordered window without an explicit frame defaults to
# "unbounded preceding .. current row", so max() over it is a *running*
# maximum. That only equals the department maximum when the order happens to
# be descending. Spanning the frame over the whole partition makes the result
# independent of the sort direction; the ordering is kept only so the listing
# stays highest-paid-first within each department.
#
# `max` here is pyspark.sql.functions.max (imported above), not the builtin.
window_spec = (
    Window.partitionBy("Department")
    .orderBy(col("Salary").desc())
    .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
)

df.withColumn("Max Salary", max("Salary").over(window_spec)).show()

+-----+------+----------+------+----------+
|EmpID|  Name|Department|Salary|Max Salary|
+-----+------+----------+------+----------+
|    7| Jerry|   Finance|  4200|      4200|
|    2|  Jane|   Finance|  4000|      4200|
|    4| Alice|   Finance|  3800|      4200|
|    8|   Sam|        IT|  4700|      4700|
|   10|Rachel|        IT|  4600|      4700|
|    5|   Bob|        IT|  4500|      4700|
|    6|   Tom|     Sales|  3700|      3700|
|    3|  Mike|     Sales|  3500|      3700|
|    9| Steve|     Sales|  3100|      3700|
|    1|  John|     Sales|  3000|      3700|
+-----+------+----------+------+----------+



In [19]:
# Department name and its maximum salary (one row per department).
#
# A partition-only window is used: with no orderBy the frame is the entire
# partition, so max() is the true department maximum rather than a running
# maximum whose correctness depends on the sort direction. Ordering would be
# wasted work here anyway, because distinct() discards the per-employee rows.
# (df.groupBy("Department").agg(max("Salary")) is the non-window equivalent.)
window_spec = Window.partitionBy("Department")

(
    df.withColumn("Max Salary", max("Salary").over(window_spec))
    .select("Department", "Max Salary")
    .distinct()
    .show()
)

+----------+----------+
|Department|Max Salary|
+----------+----------+
|   Finance|      4200|
|        IT|      4700|
|     Sales|      3700|
+----------+----------+



In [21]:
# Running (cumulative) salary total per department, using an explicit frame:
# from the first row of the partition (Window.unboundedPreceding) up to the
# current row, with rows ordered by ascending Salary.
window_spec = (
    Window.partitionBy("Department")
    .orderBy("Salary")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)
df.withColumn("Cumulative Salary", sum("Salary").over(window_spec)).show()

+-----+------+----------+------+-----------------+
|EmpID|  Name|Department|Salary|Cumulative Salary|
+-----+------+----------+------+-----------------+
|    4| Alice|   Finance|  3800|             3800|
|    2|  Jane|   Finance|  4000|             7800|
|    7| Jerry|   Finance|  4200|            12000|
|    5|   Bob|        IT|  4500|             4500|
|   10|Rachel|        IT|  4600|             9100|
|    8|   Sam|        IT|  4700|            13800|
|    1|  John|     Sales|  3000|             3000|
|    9| Steve|     Sales|  3100|             6100|
|    3|  Mike|     Sales|  3500|             9600|
|    6|   Tom|     Sales|  3700|            13300|
+-----+------+----------+------+-----------------+



In [22]:
# Frame restricted to the current row alone (row offset 0 to row offset 0),
# so the windowed sum is simply each row's own Salary.
window_base = Window.partitionBy("Department").orderBy("Salary")
window_current = window_base.rowsBetween(Window.currentRow, Window.currentRow)
(
    df
    .withColumn("salary_currentOnly", sum("Salary").over(window_current))
    .show()
)


+-----+------+----------+------+------------------+
|EmpID|  Name|Department|Salary|salary_currentOnly|
+-----+------+----------+------+------------------+
|    4| Alice|   Finance|  3800|              3800|
|    2|  Jane|   Finance|  4000|              4000|
|    7| Jerry|   Finance|  4200|              4200|
|    5|   Bob|        IT|  4500|              4500|
|   10|Rachel|        IT|  4600|              4600|
|    8|   Sam|        IT|  4700|              4700|
|    1|  John|     Sales|  3000|              3000|
|    9| Steve|     Sales|  3100|              3100|
|    3|  Mike|     Sales|  3500|              3500|
|    6|   Tom|     Sales|  3700|              3700|
+-----+------+----------+------+------------------+



In [23]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# Obtain the SparkSession again (getOrCreate reuses a running one).
spark = (
    SparkSession.builder
    .appName("AdvancedOps")
    .getOrCreate()
)

# Rebuild the people table with list and map columns: `df` was rebound to the
# employee data earlier in the notebook, and the UDF cells need `subjects`.
# One tuple per person: (id, name, salary, subjects, address).
data = [
    (1, "Alice", 2000, ["math", "science"], {"city": "NYC", "zip": "10001"}),
    (2, "Bob", 1500, ["english"], {"city": "SF", "zip": "94105"}),
    (3, "Charlie", 2200, ["math", "history", "science"], {"city": "NYC", "zip": "10001"}),
    (4, "David", 1200, ["art"], {"city": "LA", "zip": "90001"}),
]

df = spark.createDataFrame(
    data,
    schema=["id", "name", "salary", "subjects", "address"],
)
# truncate=False prints the list and map columns without cutting them off.
df.show(truncate=False)

+---+-------+------+------------------------+---------------------------+
|id |name   |salary|subjects                |address                    |
+---+-------+------+------------------------+---------------------------+
|1  |Alice  |2000  |[math, science]         |{zip -> 10001, city -> NYC}|
|2  |Bob    |1500  |[english]               |{zip -> 94105, city -> SF} |
|3  |Charlie|2200  |[math, history, science]|{zip -> 10001, city -> NYC}|
|4  |David  |1200  |[art]                   |{zip -> 90001, city -> LA} |
+---+-------+------+------------------------+---------------------------+



In [28]:
# Row-by-row processing with a Python UDF in PySpark
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf

# Define UDF
@udf(IntegerType())
def subject_count(subjects):
    """Return how many entries one row's ``subjects`` array holds.

    Registered as a Python UDF with an IntegerType result, so it is applied
    to the column value of each row.

    Args:
        subjects: The row's ``subjects`` value as a Python list, or ``None``
            when the column is null for that row.

    Returns:
        The number of elements in ``subjects``, or ``None`` (a null cell)
        when ``subjects`` is ``None``.
    """
    # A null array arrives as None; len(None) would raise TypeError and fail
    # the whole job, so propagate the null instead.
    if subjects is None:
        return None
    return len(subjects)


df.withColumn("subject_count", subject_count(df.subjects)).show()

+---+-------+------+--------------------+--------------------+-------------+
| id|   name|salary|            subjects|             address|subject_count|
+---+-------+------+--------------------+--------------------+-------------+
|  1|  Alice|  2000|     [math, science]|{zip -> 10001, ci...|            2|
|  2|    Bob|  1500|           [english]|{zip -> 94105, ci...|            1|
|  3|Charlie|  2200|[math, history, s...|{zip -> 10001, ci...|            3|
|  4|  David|  1200|               [art]|{zip -> 90001, ci...|            1|
+---+-------+------+--------------------+--------------------+-------------+



In [26]:
#pandas UDF (Vectorized UDF)
import pandas as pd
from pyspark.sql.functions import pandas_udf
from pyspark.sql.types import DoubleType

@pandas_udf(DoubleType())
def multiply_by_two(s: pd.Series) -> pd.Series:
    """Double every value of the given pandas Series.

    Declared as a pandas UDF with a DoubleType result; it receives column
    values as a ``pd.Series`` and returns a Series of the same length.
    """
    doubled = s * 2
    return doubled


(
    df
    .withColumn("double_salary", multiply_by_two(df.salary))
    .show()
)

+---+-------+------+--------------------+--------------------+-------------+
| id|   name|salary|            subjects|             address|double_salary|
+---+-------+------+--------------------+--------------------+-------------+
|  1|  Alice|  2000|     [math, science]|{zip -> 10001, ci...|       4000.0|
|  2|    Bob|  1500|           [english]|{zip -> 94105, ci...|       3000.0|
|  3|Charlie|  2200|[math, history, s...|{zip -> 10001, ci...|       4400.0|
|  4|  David|  1200|               [art]|{zip -> 90001, ci...|       2400.0|
+---+-------+------+--------------------+--------------------+-------------+

