**Module 1: Setup & SparkSession Initialization**

Install and configure PySpark in your local system or Colab.

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
.appName("BotCampus PySpark Practice") \
.master("local[*]") \
.getOrCreate()

Create a DataFrame from:

In [2]:
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]

df_data=spark.createDataFrame(data,columns)
df_data.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



Show schema, explain data types, and convert to RDD.

In [3]:
df_data.printSchema()

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [4]:
rdd_data=df_data.rdd

Print .collect() and df.rdd.map() output.

In [5]:
print(rdd_data.collect())
print(df_data.collect())

[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]
[Row(name='Anjali', city='Bangalore', age=24), Row(name='Ravi', city='Hyderabad', age=28), Row(name='Kavya', city='Delhi', age=22), Row(name='Meena', city='Chennai', age=25), Row(name='Arjun', city='Mumbai', age=30)]


In [6]:
rdd_data.map(lambda x:x['name']).collect()

['Anjali', 'Ravi', 'Kavya', 'Meena', 'Arjun']

Module 2: RDDs & Transformations

In [7]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

Split each line into words ( flatMap ).

In [8]:
feedback.flatMap(lambda x:x.lower().split(" ")).collect()

['ravi',
 'from',
 'bangalore',
 'loved',
 'the',
 'delivery',
 'meena',
 'from',
 'hyderabad',
 'had',
 'a',
 'late',
 'order',
 'ajay',
 'from',
 'pune',
 'liked',
 'the',
 'service',
 'anjali',
 'from',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'from',
 'mumbai',
 'gave',
 'positive',
 'feedback']

Remove stop words ( from , the , etc.).

In [9]:
stop_words = set(["from","the","had","a","an","and","is","of","to","in","for"])
feedback.flatMap(lambda x:x.lower().split(" ")).filter(lambda x:x not in stop_words).collect()

['ravi',
 'bangalore',
 'loved',
 'delivery',
 'meena',
 'hyderabad',
 'late',
 'order',
 'ajay',
 'pune',
 'liked',
 'service',
 'anjali',
 'delhi',
 'faced',
 'ui',
 'issues',
 'rohit',
 'mumbai',
 'gave',
 'positive',
 'feedback']

Count each word frequency using reduceByKey .

In [10]:
feedback.flatMap(lambda x:x.lower().split(" ")).map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).collect()

[('from', 5),
 ('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('the', 2),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('had', 1),
 ('a', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1),
 ('gave', 1)]

Find top 3 most frequent non-stop words.

In [11]:
feedback.flatMap(lambda x:x.lower().split(" ")).filter(lambda x:x not in stop_words).map(lambda x:(x,1)).reduceByKey(lambda x,y:x+y).sortBy(lambda x:x[1],ascending=False).take(3)

[('loved', 1), ('liked', 1), ('service', 1)]

**Module 3: DataFrames & Transformation (With Joins)**

In [12]:
students = [
("Amit", "10-A", 89),
("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]
df_students=spark.createDataFrame(students,columns)
df_students.show()

+------+-------+-----+
|  name|section|marks|
+------+-------+-----+
|  Amit|   10-A|   89|
| Kavya|   10-B|   92|
|Anjali|   10-A|   78|
| Rohit|   10-B|   85|
| Sneha|   10-C|   80|
+------+-------+-----+



In [13]:
attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]
columns2 = ["name", "days_present"]
df_attendance=spark.createDataFrame(attendance,columns2)
df_attendance.show()

+------+------------+
|  name|days_present|
+------+------------+
|  Amit|          24|
| Kavya|          22|
|Anjali|          20|
| Rohit|          25|
| Sneha|          19|
+------+------------+



Join both DataFrames on name .

In [14]:
df_merged=df_students.join(df_attendance,on='name',how='inner')
df_merged.show()

+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



Create a new column: attendance_rate = days_present / 25

In [15]:
df_merged=df_merged.withColumn('attendance_rate',(df_merged['days_present']/25)*100)
df_merged.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendance_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           96.0|
|Anjali|   10-A|   78|          20|           80.0|
| Kavya|   10-B|   92|          22|           88.0|
| Rohit|   10-B|   85|          25|          100.0|
| Sneha|   10-C|   80|          19|           76.0|
+------+-------+-----+------------+---------------+



Grade students using when :
A: >90, B: 80–90, C: <80.

In [16]:
from pyspark.sql import functions as F
df_students=df_students.withColumn('grade',F.when(df_students['marks']>90,'A').when((df_students['marks']>=80) & (df_students['marks']<=90),'B').otherwise('C'))
df_students.show()

+------+-------+-----+-----+
|  name|section|marks|grade|
+------+-------+-----+-----+
|  Amit|   10-A|   89|    B|
| Kavya|   10-B|   92|    A|
|Anjali|   10-A|   78|    C|
| Rohit|   10-B|   85|    B|
| Sneha|   10-C|   80|    B|
+------+-------+-----+-----+



Filter students with good grades but poor attendance (<80%).

In [17]:
df_merged=df_merged.withColumn('grade',F.when(df_merged['marks']>90,'A').when((df_merged['marks']>=80) & (df_merged['marks']<=90),'B').otherwise('C'))

df_merged.filter((df_merged['grade'].isin(['A','B'])) & (df_merged['attendance_rate']<80)).show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendance_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           76.0|    B|
+-----+-------+-----+------------+---------------+-----+



Module 4: Ingest CSV & JSON, Save to Parquet

1. Ingest CSV:

In [18]:
data="""emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000"""

with open ('employee1.csv','w') as f:
  f.write(data)

2. Ingest JSON:

In [19]:
import json

data={
"id": 201,
"name": "Nandini",
"contact": {
"email": "nandi@example.com",
"city": "Hyderabad"
},
"skills": ["Python", "Spark", "SQL"]
}


with open ('employee2.json','w') as f:
  json.dump(data,f)


Read both formats into DataFrames.

In [20]:
df_csv=spark.read.csv('employee1.csv',header=True,inferSchema=True)
df_csv.show()

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [21]:
df_json=spark.read.json('employee2.json')
df_json.show()

+--------------------+---+-------+--------------------+
|             contact| id|   name|              skills|
+--------------------+---+-------+--------------------+
|{Hyderabad, nandi...|201|Nandini|[Python, Spark, SQL]|
+--------------------+---+-------+--------------------+



Flatten nested JSON using select , col , alias , explode .

In [22]:
import pyspark.sql.functions as F
df_flat=df_json.select(F.col('id'),F.col('name'),F.col('contact.email').alias('email'),F.col('contact.city').alias('city'),F.explode(F.col('skills')).alias('skill'))
df_flat.show()

+---+-------+-----------------+---------+------+
| id|   name|            email|     city| skill|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



Save both as Parquet files partitioned by city.

In [23]:
df_csv.write.partitionBy('city').parquet('/temp/output/employee1.parquet')
df_flat.write.partitionBy('city').parquet('/temp/output/employee2.parquet')

**Module 5: Spark SQL with Temp Views**

Register the students DataFrame as students_view .

In [24]:
df_students.createOrReplaceTempView('students_view')

a) Average marks per section

In [25]:
spark.sql("select section,avg(marks) from students_view group by section").show()

+-------+----------+
|section|avg(marks)|
+-------+----------+
|   10-A|      83.5|
|   10-B|      88.5|
|   10-C|      80.0|
+-------+----------+



b) Top scorer in each section

In [26]:
spark.sql("select s1.section,s1.name,s1.marks from students_view s1 where s1.marks=(select max(s2.marks) from students_view s2 where s2.section=s1.section )").show()

+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-B|Kavya|   92|
|   10-A| Amit|   89|
|   10-C|Sneha|   80|
+-------+-----+-----+



c) Count of students in each grade category

In [27]:
spark.sql("select grade,count(*) from students_view group by grade").show()

+-----+--------+
|grade|count(1)|
+-----+--------+
|    B|       3|
|    A|       1|
|    C|       1|
+-----+--------+



d) Students with marks above class average

In [28]:
spark.sql("select * from students_view where marks>(select avg(marks) as AVG_mark from students_view)").show()

+-----+-------+-----+-----+
| name|section|marks|grade|
+-----+-------+-----+-----+
| Amit|   10-A|   89|    B|
|Kavya|   10-B|   92|    A|
|Rohit|   10-B|   85|    B|
+-----+-------+-----+-----+



e) Attendance-adjusted performance

In [29]:
df_merged.createOrReplaceTempView('merged_view')
spark.sql("select name,section,marks,days_present,(marks*(days_present/25)) as adjusted_marks from merged_view").show()

+------+-------+-----+------------+------------------+
|  name|section|marks|days_present|    adjusted_marks|
+------+-------+-----+------------+------------------+
|  Amit|   10-A|   89|          24|             85.44|
|Anjali|   10-A|   78|          20|62.400000000000006|
| Kavya|   10-B|   92|          22|             80.96|
| Rohit|   10-B|   85|          25|              85.0|
| Sneha|   10-C|   80|          19|              60.8|
+------+-------+-----+------------+------------------+



**Module 6: Partitioned Data & Incremental Loading**

Step 1: Full Load

In [30]:
df_students.write.partitionBy("section").parquet("output/students/")

Step 2: Incremental Load

In [31]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

List files in output/students/ using Python.

In [32]:
import os
print(os.listdir('output/students'))

['._SUCCESS.crc', 'section=10-A', 'section=10-B', '_SUCCESS', 'section=10-C']


Read only partition 10-A and list students.

In [33]:
df_10_A=spark.read.parquet('output/students/section=10-A')
df_10_A.show()

+------+-----+-----+
|  name|marks|grade|
+------+-----+-----+
|Anjali|   78|    C|
|  Amit|   89|    B|
| Tejas|   91| NULL|
+------+-----+-----+



Compare before/after counts for section 10-A .

In [34]:
print(df_10_A.count())

3


In [35]:
print(df_students.filter(F.col("section") == "10-A").count())

2


**Module 7: ETL Pipeline – End to End**

Given Raw Data (CSV):

In [36]:
data="""emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,"""

with open ('employees.csv','w') as f:
  f.write(data)

Load CSV with inferred schema.

In [37]:
df_employees=spark.read.csv('employees.csv',header=True,inferSchema=True)
df_employees.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



Fill null bonuses with 2000 .

In [38]:
df_employees=df_employees.fillna({'bonus':2000})
df_employees.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



Create total_ctc = salary + bonus .

In [39]:
df_employees=df_employees.withColumn('total_ctc',F.col('salary')+F.col('bonus'))
df_employees.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



Filter employees with total_ctc > 65000

In [40]:
df_employees.filter(F.col('total_ctc')>65000).show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



Save result in:

JSON format.

Parquet format partitioned by department.

In [41]:
df_employees.write.json('employees.json')
df_employees.write.partitionBy('dept').parquet('employees.parquet')