In [1]:
!pip install pyspark py4j

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=22079246624ef8266d23ff8a66f9c65cb54d9011362a7db8f3ce4087d63ba3f1
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Start a Spark session for this example, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("WhenExample")
    .getOrCreate()
)

# Sample people as (name, age) pairs; David has no age (None)
columns = ["Name", "Age"]
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", None),
]

df = spark.createDataFrame(data, schema=columns)

# Display the input rows before any transformation
df.show()

# Bucket each person by age. Branches are tried top to bottom and the first match
# wins, so the null check comes first and gives a missing age its own label.
age_group = (
    when(col("Age").isNull(), "Unknown")
    .when(col("Age") < 25, "Young")
    .when(col("Age").between(25, 30), "Adult")  # between() is inclusive on both ends
    .otherwise("Senior")
)
result_df = df.withColumn("Age_Group", age_group)

# Display the labelled rows
result_df.show()


+-------+----+
|   Name| Age|
+-------+----+
|  Alice|  25|
|    Bob|  30|
|Charlie|  22|
|  David|NULL|
+-------+----+

+-------+----+---------+
|   Name| Age|Age_Group|
+-------+----+---------+
|  Alice|  25|    Adult|
|    Bob|  30|    Adult|
|Charlie|  22|    Young|
|  David|NULL|  Unknown|
+-------+----+---------+



In [19]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Get the Spark session for this example
builder = SparkSession.builder.appName("WhenExample")
spark = builder.getOrCreate()

# Sample people as (name, age) pairs
data = [
    ("Alice", 25),
    ("Bob", 30),
    ("Charlie", 22),
    ("David", 55),
    ("Eva", 65),
]
columns = ["Name", "Age"]

df = spark.createDataFrame(data, schema=columns)

# # Use the when function to categorize age groups
# result_df = df.withColumn("Age_Group",
#                           when(col("Age") < 18, "Child")
#                           .when((col("Age") >= 18) & (col("Age") < 65), "Adult")
#                           .otherwise("Senior"))

# # Show the result
# result_df.show()

# my way: three explicit age bands plus a catch-all label.
# NOTE(review): no band covers ages 55-64, so those rows get "Cannot be classified"
# (David, 55, in this data) — confirm the gap is intended; the commented-out variant
# above closes the Adult band at 65 instead.
age = col("Age")
age_class = (
    when(age < 18, "Child")
    .when((age >= 18) & (age < 55), "Adult")
    .when(age >= 65, "Senior")
    .otherwise("Cannot be classified")
)
result_df = df.withColumn("Age_Class", age_class)
result_df.show()

+-------+---+--------------------+
|   Name|Age|           Age_Class|
+-------+---+--------------------+
|  Alice| 25|               Adult|
|    Bob| 30|               Adult|
|Charlie| 22|               Adult|
|  David| 55|Cannot be classified|
|    Eva| 65|              Senior|
+-------+---+--------------------+



In [31]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Start a Spark session for this example, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("WhenExample")
    .getOrCreate()
)

# Sample employees as (name, salary) pairs
columns = ["Name", "Salary"]
data = [
    ("Alice", 50000),
    ("Bob", 75000),
    ("Charlie", 90000),
    ("David", 120000),
    ("Eva", 180000),
]

df = spark.createDataFrame(data, schema=columns)

# # Use the when function to assign a status based on salary
# result_df = df.withColumn("Salary_Status",
#                           when(col("Salary") < 60000, "Low")
#                           .when((col("Salary") >= 60000) & (col("Salary") < 100000), "Medium")
#                           .otherwise("High"))

# Show the input first, since the Salary column is overwritten below
print("Raw Data")
df.show()

# Reusing the name "Salary" in withColumn replaces the numeric column with the band
# label instead of adding a new column.
# The first branch already claims everything below 60000, so the second branch only
# needs an upper bound; a salary of exactly 60000 therefore lands in "Medium".
salary_band = (
    when(col("Salary") < 60000, "Low")
    .when(col("Salary") < 100000, "Medium")
    .otherwise("High")
)
result_df = df.withColumn("Salary", salary_band)

result_df.show()


Raw Data
+-------+------+
|   Name|Salary|
+-------+------+
|  Alice| 50000|
|    Bob| 75000|
|Charlie| 90000|
|  David|120000|
|    Eva|180000|
+-------+------+

+-------+------+
|   Name|Salary|
+-------+------+
|  Alice|   Low|
|    Bob|Medium|
|Charlie|Medium|
|  David|  High|
|    Eva|  High|
+-------+------+



In [40]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg

# Get the Spark session for the aggregation examples
builder = SparkSession.builder.appName("WhenWithAggregates")
spark = builder.getOrCreate()

# Sample people as (name, age, group) triples
data = [
    ("Alice", 25, "Group1"),
    ("Bob", 30, "Group1"),
    ("Charlie", 22, "Group2"),
    ("David", 28, "Group2"),
    ("Eva", 35, "Group1"),
]
columns = ["Name", "Age", "Group"]

df = spark.createDataFrame(data, schema=columns)

# Display the input rows
print("Raw Data")
df.show()

# when() without otherwise() yields null for rows that do not match, and avg() skips
# nulls, so each aggregate averages only the ages on its side of the 30 threshold.
age = col("Age")
avg_age_under_30 = avg(when(age < 30, age)).alias("avg_age_under_30")
avg_age_30_and_over = avg(when(age >= 30, age)).alias("avg_age_30_and_over")

# A group with no rows on one side of the threshold gets NULL for that average
result_df = df.groupBy("Group").agg(avg_age_under_30, avg_age_30_and_over)

result_df.show()

Raw Data
+-------+---+------+
|   Name|Age| Group|
+-------+---+------+
|  Alice| 25|Group1|
|    Bob| 30|Group1|
|Charlie| 22|Group2|
|  David| 28|Group2|
|    Eva| 35|Group1|
+-------+---+------+

+------+----------------+-------------------+
| Group|avg_age_under_30|avg_age_30_and_over|
+------+----------------+-------------------+
|Group1|            25.0|               32.5|
|Group2|            25.0|               NULL|
+------+----------------+-------------------+



In [44]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, count

# Start a Spark session for this example, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("WhenWithAggregates")
    .getOrCreate()
)

# Sample people as (name, has-salary flag) pairs
columns = ["Name", "Has_Salary"]
data = [
    ("Alice", "Yes"),
    ("Bob", "No"),
    ("Charlie", "Yes"),
    ("David", "No"),
    ("Eva", "Yes"),
]

df = spark.createDataFrame(data, schema=columns)

# Display the input rows
print("RawData")
df.show()

# count() tallies non-null values only, and when() without otherwise() is null for
# rows that do not match, so each expression counts just the rows meeting its condition.
# The value inside when() is arbitrary: the 0 used for "No" is non-null and is counted
# exactly like the 1 used for "Yes".
has_salary = col("Has_Salary")
count_yes = count(when(has_salary == "Yes", 1)).alias("count_yes")
count_no = count(when(has_salary == "No", 0)).alias("count_no")

# One output row per distinct Has_Salary value
grouped = df.groupBy("Has_Salary")
result_df = grouped.agg(count_yes, count_no)
result_df.show()



RawData
+-------+----------+
|   Name|Has_Salary|
+-------+----------+
|  Alice|       Yes|
|    Bob|        No|
|Charlie|       Yes|
|  David|        No|
|    Eva|       Yes|
+-------+----------+

+----------+---------+--------+
|Has_Salary|count_yes|count_no|
+----------+---------+--------+
|        No|        0|       2|
|       Yes|        3|       0|
+----------+---------+--------+



In [48]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Get the Spark session for this example
builder = SparkSession.builder.appName("ComplexWhenExample")
spark = builder.getOrCreate()

# Sample employees as (name, performance rating, salary) triples
data = [
    ("Alice", "Excellent", 50000),
    ("Bob", "Good", 75000),
    ("Charlie", "Poor", 90000),
    ("David", "Excellent", 120000),
    ("Eva", "Good", 180000),
]
columns = ["Name", "Performance", "Salary"]

df = spark.createDataFrame(data, schema=columns)

# Display the input rows
print("Raw Data")
df.show()


# Scale each salary by a factor chosen from the performance rating.
# The factors are floats, so the new column holds floating-point values and some
# results show binary rounding noise (180000 * 1.1 is displayed as 198000.00000000003).
# Any rating other than the three listed gets the salary negated.
salary = col("Salary")
rating = col("Performance")
adjusted_salary = (
    when(rating == "Excellent", salary * 1.2)
    .when(rating == "Good", salary * 1.1)
    .when(rating == "Poor", salary * 0.9)
    .otherwise(salary * -1)
)
result_df = df.withColumn("Adjusted_Salary", adjusted_salary)

result_df.show()


Raw Data
+-------+-----------+------+
|   Name|Performance|Salary|
+-------+-----------+------+
|  Alice|  Excellent| 50000|
|    Bob|       Good| 75000|
|Charlie|       Poor| 90000|
|  David|  Excellent|120000|
|    Eva|       Good|180000|
+-------+-----------+------+

+-------+-----------+------+------------------+
|   Name|Performance|Salary|   Adjusted_Salary|
+-------+-----------+------+------------------+
|  Alice|  Excellent| 50000|           60000.0|
|    Bob|       Good| 75000|           82500.0|
|Charlie|       Poor| 90000|           81000.0|
|  David|  Excellent|120000|          144000.0|
|    Eva|       Good|180000|198000.00000000003|
+-------+-----------+------+------------------+



In [54]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# Start a Spark session for this example, or reuse one that is already running
spark = (
    SparkSession.builder
    .appName("ComplexWhenExample")
    .getOrCreate()
)

# Sample customers as (name, total spending, number of purchases) triples
columns = ["Customer", "Total_Spending", "Total_Purchases"]
data = [
    ("Alice", 500, 5),
    ("Bob", 750, 15),
    ("Charlie", 900, 8),
    ("David", 1200, 25),
    ("Eva", 1800, 35),
]

df = spark.createDataFrame(data, schema=columns)

# Display the input rows
print("Raw Data")
df.show()

# # Use when with multiple conditions to categorize customers based on purchase behavior
# result_df = df.withColumn("Customer_Category",
#                           when((col("Total_Purchases") > 20) & (col("Total_Spending") > 1000), "High Value")
#                           .when((col("Total_Purchases") > 10) & (col("Total_Spending") > 500), "Medium Value")
#                           .otherwise("Low Value"))

# # Show the result
# result_df.show()

# Categorise customers on purchase count and total spending together.
# Both thresholds of a tier must be strictly exceeded, and tiers are tested from the
# top, so a customer gets the highest tier they qualify for.
purchases = col("Total_Purchases")
spending = col("Total_Spending")

customer_category = (
    when((purchases > 20) & (spending > 1000), "High Value")
    .when((purchases > 10) & (spending > 500), "Medium Value")
    .otherwise("Low Value")
)
result_df = df.withColumn("Customer_Category", customer_category)

result_df.show()

Raw Data
+--------+--------------+---------------+
|Customer|Total_Spending|Total_Purchases|
+--------+--------------+---------------+
|   Alice|           500|              5|
|     Bob|           750|             15|
| Charlie|           900|              8|
|   David|          1200|             25|
|     Eva|          1800|             35|
+--------+--------------+---------------+

+--------+--------------+---------------+-----------------+
|Customer|Total_Spending|Total_Purchases|Customer_Category|
+--------+--------------+---------------+-----------------+
|   Alice|           500|              5|        Low Value|
|     Bob|           750|             15|     Medium Value|
| Charlie|           900|              8|        Low Value|
|   David|          1200|             25|       High Value|
|     Eva|          1800|             35|       High Value|
+--------+--------------+---------------+-----------------+

