<a href="https://colab.research.google.com/github/Jayakumar1305/ETL-Workflow-with-PYTHON/blob/main/Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

In [None]:
# Create (or reuse) the Spark session under its conventional name, `spark`.
spark = SparkSession.builder.appName("ColabPySpark").getOrCreate()
# `df` originally held the session; it is kept as an alias so that any code
# still reaching the session through that name continues to work.
df = spark

In [None]:
# Sample employee records, one tuple per row: (ID, Name, Age, Salary).
data = [
    (1, "Alice", 25, 5000),
    (2, "Bob", 30, 6000),
    (3, "Charlie", 35, 7000),
]
columns = ["ID", "Name", "Age", "Salary"]

# Fetch the active session explicitly instead of calling createDataFrame on
# `df`. Because `df` is rebound to the DataFrame on this very line, the old
# form `df = df.createDataFrame(...)` only worked the first time the cell ran;
# a second run failed since a DataFrame has no createDataFrame method.
df = SparkSession.builder.getOrCreate().createDataFrame(data, columns)


In [None]:
# Print the DataFrame's rows as a text table.
df.show()

+---+-------+---+------+
| ID|   Name|Age|Salary|
+---+-------+---+------+
|  1|  Alice| 25|  5000|
|  2|    Bob| 30|  6000|
|  3|Charlie| 35|  7000|
+---+-------+---+------+



In [None]:
# Print each column's name, data type and nullability.
df.printSchema()

root
 |-- ID: long (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: long (nullable = true)
 |-- Salary: long (nullable = true)



In [None]:
# Column names of the DataFrame, returned as a Python list.
df.columns

['ID', 'Name', 'Age']

In [None]:
# Summary statistics per column: count, mean, stddev, min and max.
df.describe().show()

+-------+---+-------+----+
|summary| ID|   Name| Age|
+-------+---+-------+----+
|  count|  3|      3|   3|
|   mean|2.0|   NULL|30.0|
| stddev|1.0|   NULL| 5.0|
|    min|  1|  Alice|  25|
|    max|  3|Charlie|  35|
+-------+---+-------+----+



In [None]:
# Derive a "Bonus" column worth 10% of each row's Salary.
bonus_amount = df["Salary"] * 0.1
df = df.withColumn("Bonus", bonus_amount)

In [None]:
# Display the DataFrame again to see the added Bonus column.
df.show()

+---+-------+---+------+-----+
| ID|   Name|Age|Salary|Bonus|
+---+-------+---+------+-----+
|  1|  Alice| 25|  5000|500.0|
|  2|    Bob| 30|  6000|600.0|
|  3|Charlie| 35|  7000|700.0|
+---+-------+---+------+-----+



In [None]:
# NOTE: importing sum, min and max by name rebinds Python's built-in functions
# of the same names for the rest of the notebook; from here on they refer to
# the Spark column aggregates.
from pyspark.sql.functions import sum, min, max, count,avg # 'add' is intentionally not imported

# Column addition is written with the '+' operator on columns, for example:
# df.select(df["Salary"] + df["Bonus"]).show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, LongType

# Obtain the Spark session for this cell.
spark = SparkSession.builder.appName("ColabPySpark").getOrCreate()

# Explicit schema (including the Bonus column); every field is nullable.
schema = (
    StructType()
    .add("ID", LongType(), True)
    .add("Name", StringType(), True)
    .add("Age", IntegerType(), True)
    .add("Salary", LongType(), True)
    .add("Bonus", LongType(), True)
)

# Starting rows.
data = [
    (1, "Alice", 25, 5000, 500),
    (2, "Bob", 30, 6000, 600),
    (3, "Charlie", 35, 7000, 700),
]

# Additional rows to append (more than one can be added at once).
new_data = [
    (4, "Eve", 40, 8000, 800),
    (5, "David", 45, 9000, 900),
]

new_df = spark.createDataFrame(new_data, schema=schema)

# Build the base frame and append the new rows; both frames share `schema`.
df = spark.createDataFrame(data, schema=schema).union(new_df)

# Display the combined result.
df.show()


+---+-------+---+------+-----+
| ID|   Name|Age|Salary|Bonus|
+---+-------+---+------+-----+
|  1|  Alice| 25|  5000|  500|
|  2|    Bob| 30|  6000|  600|
|  3|Charlie| 35|  7000|  700|
|  4|    Eve| 40|  8000|  800|
|  5|  David| 45|  9000|  900|
+---+-------+---+------+-----+



In [None]:
# Whole-table aggregates: salary total, minimum, maximum and mean, plus a
# count of IDs, each given a readable column label.
summary_columns = [
    sum("Salary").alias("Total Salary"),
    min("Salary").alias("Minimum Salary"),
    max("Salary").alias("Maximum Salary"),
    count("ID").alias("Number of Employees"),
    avg("Salary").alias("Average Salary"),
]
df.select(*summary_columns).show()

+------------+--------------+--------------+-------------------+--------------+
|Total Salary|Minimum Salary|Maximum Salary|Number of Employees|Average Salary|
+------------+--------------+--------------+-------------------+--------------+
|       35000|          5000|          9000|                  5|        7000.0|
+------------+--------------+--------------+-------------------+--------------+



If a column contains null values, PySpark's aggregate functions ignore the nulls and compute the result from the remaining non-null values.


In [24]:
from pyspark.sql import SparkSession

# Bind the session to `spark`, the name the DataFrame-building cell below uses.
spark = SparkSession.builder.appName("ColabPySpark").getOrCreate()
# `Array_df` originally received the session in this cell; the alias is kept
# so any code relying on that binding still works.
Array_df = spark

In [47]:
# Sample rows in which id, name, age and salary are each missing (None) once.
employee_rows = [
    (1, "Alice", None, 5000, "HR"),
    (None, "Bob", 30, 6000, "IT"),
    (3, "Charlie", 35, None, "Finance"),
    (4, None, 34, 7000, "Admin"),
]
employee_columns = ["id", "name", "age", "salary", "dept"]

Array_df = spark.createDataFrame(employee_rows, employee_columns)


In [48]:
# Display the rows; missing values are rendered as NULL.
Array_df.show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|NULL|  5000|     HR|
|NULL|    Bob|  30|  6000|     IT|
|   3|Charlie|  35|  NULL|Finance|
|   4|   NULL|  34|  7000|  Admin|
+----+-------+----+------+-------+



In [27]:
from pyspark.sql.functions import col, avg, when, coalesce

Array and Explode
Array: used to combine several column values into a single array column.
Explode: used to expand an array column into multiple rows, one row per element.

In [49]:
# Show the current contents of Array_df.
Array_df.show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|NULL|  5000|     HR|
|NULL|    Bob|  30|  6000|     IT|
|   3|Charlie|  35|  NULL|Finance|
|   4|   NULL|  34|  7000|  Admin|
+----+-------+----+------+-------+



Average of AGE

In [35]:
# This expression evaluates to a DataFrame (displayed only as its column
# name and type); the average itself has not been fetched yet.
Array_df.select(avg(col("age")))

DataFrame[avg(age): double]

In [36]:
# collect() returns the result as a list of Row objects.
Array_df.select(avg(col("age"))).collect()

[Row(avg(age)=32.5)]

In [37]:
# Take the first Row, then its first field, to get the bare number.
Array_df.select(avg(col("age"))).collect()[0][0]

32.5

In [40]:
# Store the average age as a plain Python value for use in later cells.
# (The variable keeps its existing spelling, `agv_age`, because later cells
# read it under that name.)
avg_age_rows = Array_df.select(avg(col("age"))).collect()
agv_age = avg_age_rows[0][0]


32.5


In [41]:
# Print the stored average age.
print(agv_age)

32.5


If name is null, replace it with "Unknown".

In [50]:
# Show the data before any null replacement is applied.
Array_df.show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|NULL|  5000|     HR|
|NULL|    Bob|  30|  6000|     IT|
|   3|Charlie|  35|  NULL|Finance|
|   4|   NULL|  34|  7000|  Admin|
+----+-------+----+------+-------+



In [51]:
# Substitute "Unknown" wherever name is null. The result is only displayed;
# Array_df is not reassigned.
name_is_missing = col("name").isNull()
name_or_unknown = when(name_is_missing, "Unknown").otherwise(col("name"))
Array_df.withColumn("name", name_or_unknown).show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|NULL|  5000|     HR|
|NULL|    Bob|  30|  6000|     IT|
|   3|Charlie|  35|  NULL|Finance|
|   4|Unknown|  34|  7000|  Admin|
+----+-------+----+------+-------+



In [53]:
# Substitute the previously computed average wherever age is null. The result
# is only displayed; Array_df is not reassigned.
age_is_missing = col("age").isNull()
age_or_average = when(age_is_missing, agv_age).otherwise(col("age"))
Array_df.withColumn("age", age_or_average).show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|32.5|  5000|     HR|
|NULL|    Bob|30.0|  6000|     IT|
|   3|Charlie|35.0|  NULL|Finance|
|   4|   NULL|34.0|  7000|  Admin|
+----+-------+----+------+-------+



In [54]:
# Array_df still contains its NULLs: the withColumn results in the earlier
# cells were displayed but never assigned back to the variable.
Array_df.show()

+----+-------+----+------+-------+
|  id|   name| age|salary|   dept|
+----+-------+----+------+-------+
|   1|  Alice|NULL|  5000|     HR|
|NULL|    Bob|  30|  6000|     IT|
|   3|Charlie|  35|  NULL|Finance|
|   4|   NULL|  34|  7000|  Admin|
+----+-------+----+------+-------+

