# PYSPARK - ASSESSMENT - 02

**Module 1: Setup & SparkSession Initialization**

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
.appName("BotCampus PySpark Practice") \
.master("local[*]") \
.getOrCreate()

In [2]:
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, columns)

df.printSchema()
df.show()

# RDD Conversion
rdd = df.rdd
print(rdd.collect())
print(rdd.map(lambda x: (x.name, x.city)).collect())

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
[('Anjali', 'Bangalore'), ('Ravi', 'Hyderabad'), ('Kavya', 'Delhi'), ('Meena', 'Chennai'), ('Arjun', 'Mumbai')]


**Module 2: RDDs & Transformations**

In [5]:
from pyspark.sql.functions import *

feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

# a
words = feedback.flatMap(lambda line: line.lower().split())

# b
stop_words = {"from", "the", "a", "had", "an", "and"}
filtered_words = words.filter(lambda word: word not in stop_words)

# c
word_count = filtered_words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a+b)

# d
top3 = word_count.takeOrdered(3, key=lambda x: -x[1])
print(top3)

[('loved', 1), ('liked', 1), ('service', 1)]


**Module 3: DataFrames & Transformation (With Joins)**

In [13]:
students = [
("Amit", "10-A", 89),
("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]

columns = ["name", "section", "marks"]

attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]

columns2 = ["name", "days_present"]

studDF = spark.createDataFrame(students, columns)
attDF = spark.createDataFrame(attendance, columns2)

# a
joinedDF = studDF.join(attDF, on="name" )
print("Joined data:\n")
joinedDF.show()

# b
newDF =  joinedDF.withColumn("attendance_rate", col('days_present') / 25)
print("Added Attendance Rate column:\n")
newDF.show()

# c
newDF = newDF.withColumn("grade",
                         when( col('marks') > 90, "A")
                         .when((col('marks') <= 90) & (col('marks') >= 80), "B")
                         .otherwise("C")
                         )
print("Added Grade column:\n")
newDF.show()

# d
filteredDF = newDF.filter(
    (col("grade").isin("A", "B")) & (col("attendance_rate") < 0.8)
    )
print("\nFiltered Data:\n")
filteredDF.show()

Joined data:

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+

Added Attendance Rate column:

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           0.96|
|Anjali|   10-A|   78|          20|            0.8|
| Kavya|   10-B|   92|          22|           0.88|
| Rohit|   10-B|   85|          25|            1.0|
| Sneha|   10-C|   80|          19|           0.76|
+------+-------+-----+------------+---------------+

Added Grade column:

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+--

**Module 4: Ingest CSV & JSON, Save to Parquet**

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
csvPath = (
    "/content/drive/MyDrive/Hexware_Training_DataEngineering/"
    "Aug05-Day12/Assessment02/4/employee.csv"
)

jsonPath = (
    "/content/drive/MyDrive/Hexware_Training_DataEngineering/"
    "Aug05-Day12/Assessment02/4/employee.json"
)

# a
emp1DF = spark.read.csv(csvPath, header=True, inferSchema=True)
emp2DF = spark.read.option("multiline","true").json(jsonPath)

print("Employee CSV schema:\n")
emp1DF.printSchema()
emp1DF.show()

print("Employee JSON schema:\n")
emp2DF.printSchema()
emp2DF.show()

# b
flattend_emp = emp2DF.select(
    col("id"),
    col("name"),
    col("contact.email").alias("email"),
    col("contact.city").alias("city"),
    explode(col("skills")).alias("skill")
)
print("Flattened Employee JSON:\n")
flattend_emp.show()

# c
csvOutPath = (
    "/content/drive/MyDrive/Hexware_Training_DataEngineering/"
    "Aug05-Day12/Assessment02/4/output/employee1.parquet"
)

jsonOutPath = (
    "/content/drive/MyDrive/Hexware_Training_DataEngineering/"
    "Aug05-Day12/Assessment02/4/0utput/employee2.parquet"
)

emp1DF.write.mode("overwrite").partitionBy("city").parquet("csvOutPath")
flattend_emp.write.mode("overwrite").partitionBy("city").parquet("jsonOutOath")
print("Files Saved Successfully")

Employee CSV schema:

root
 |-- emp_id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+

Employee JSON schema:

root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini

**Module 5: Spark SQL with Temp Views**

In [24]:
newDF.createOrReplaceTempView("students_view")
print("Student View:\n")
spark.sql("SELECT * FROM students_view").show()

# a
print("Average marks per section:\n")
spark.sql("""
SELECT section, AVG(marks) AS avg_marks
FROM students_view
GROUP BY section
ORDER BY section
""").show()

# b
print("Top Scorers:\n")
spark.sql("""
SELECT section, name, marks
FROM students_view s1
WHERE marks = (
  SELECT MAX(marks)
  FROM students_view s2
  WHERE s1.section = s2.section
)
""").show()

# c
print("Grade wise student count:\n")
spark.sql("""
SELECT grade, COUNT(name) AS student_count
FROM students_view
GROUP BY grade
ORDER BY grade
""").show()

# d
print("Students with marks above average:\n")
spark.sql("""
SELECT * FROM students_view
WHERE marks > (SELECT AVG(marks) FROM students_view)
""").show()

# e
print("Attendance-adjusted performance:\n")
spark.sql("""
select name, section, marks, attendance_rate,
round(marks * attendance_rate, 2) as adjusted_score
from students_view
order by adjusted_score desc
""").show()

Student View:

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendance_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           0.96|    B|
|Anjali|   10-A|   78|          20|            0.8|    C|
| Kavya|   10-B|   92|          22|           0.88|    A|
| Rohit|   10-B|   85|          25|            1.0|    B|
| Sneha|   10-C|   80|          19|           0.76|    B|
+------+-------+-----+------------+---------------+-----+

Average marks per section:

+-------+---------+
|section|avg_marks|
+-------+---------+
|   10-A|     83.5|
|   10-B|     88.5|
|   10-C|     80.0|
+-------+---------+

Top Scorers:

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+

Grade wise student count:

+-----+-------------+
|grade|student_count|
+-----+-------------+
|    A|            1|
|    B|  

**Module 6: Partitioned Data & Incremental Loading**

In [25]:
# Initial load
outPath = "output/students/"
newDF.write.partitionBy("section").parquet("outPath")

# Incremental Load
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet(outPath)

df_inc.printSchema()

root
 |-- name: string (nullable = true)
 |-- section: string (nullable = true)
 |-- marks: long (nullable = true)



In [28]:
import os

# a
print("Folders List:\n")
print(os.listdir(outPath))

# b
df_10A = spark.read.parquet("output/students/section=10-A")
df_10A.show()

# c
count_after = df_10A.count()
print(f"Number of students in section 10-A after incremental load: {count_after}")

Folders List:

['._SUCCESS.crc', 'section=10-A', '_SUCCESS']
+-----+-----+
| name|marks|
+-----+-----+
|Tejas|   91|
+-----+-----+

Number of students in section 10-A after incremental load: 1


**Module 7: ETL Pipeline – End to End**

In [30]:
# a
empPath = ("/content/drive/MyDrive/Hexware_Training_DataEngineering/"
          "Aug05-Day12/Assessment02/7/employee.csv"
          )
df_emp = spark.read.csv(empPath, header = True, inferSchema= True)
df_emp.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



In [32]:
# b
df_emp = df_emp.fillna({"bonus": 2000})
df_emp.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



In [33]:
# c
df_emp = df_emp.withColumn("total_ctc", expr("salary + bonus"))
df_emp.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



In [35]:
# d
df_filtered = df_emp.filter("total_ctc > 65000")
df_filtered.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



In [36]:
# e
empOutPath1 = ("/content/drive/MyDrive/Hexware_Training_DataEngineering/"
              "Aug05-Day12/Assessment02/Output1/employee.parquet"
              )
empOutPath2 = ("/content/drive/MyDrive/Hexware_Training_DataEngineering/"
              "Aug05-Day12/Assessment02/Output2/employee.json"
              )

df_emp.write.mode("overwrite").parquet(empOutPath1)
df_emp.write.mode("overwrite").json(empOutPath2)
print("Files saved successsfully")


Files saved successsfully
