Initialize Spark with:

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("BotCampus PySpark Practice").master("local[*]").getOrCreate()

Create a DataFrame

In [5]:
data = [
("Anjali", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25),
("Arjun", "Mumbai", 30)
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data,columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Anjali|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
| Arjun|   Mumbai| 30|
+------+---------+---+



Show schema, explain data types, and convert to RDD.

In [6]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- age: long (nullable = true)



In [7]:
print(df.dtypes)

[('name', 'string'), ('city', 'string'), ('age', 'bigint')]


In [8]:
rdd = df.rdd
rdd.collect()

[Row(name='Anjali', city='Bangalore', age=24),
 Row(name='Ravi', city='Hyderabad', age=28),
 Row(name='Kavya', city='Delhi', age=22),
 Row(name='Meena', city='Chennai', age=25),
 Row(name='Arjun', city='Mumbai', age=30)]

In [9]:
rdd.map(lambda x: (x[0].upper(), x[1], x[2] + 1)).collect()

[('ANJALI', 'Bangalore', 25),
 ('RAVI', 'Hyderabad', 29),
 ('KAVYA', 'Delhi', 23),
 ('MEENA', 'Chennai', 26),
 ('ARJUN', 'Mumbai', 31)]

RDDs & Transformations

In [20]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the delivery",
"Meena from Hyderabad had a late order",
"Ajay from Pune liked the service",
"Anjali from Delhi faced UI issues",
"Rohit from Mumbai gave positive feedback"
])

Split each line into words ( flatMap ).

In [21]:
words = feedback.flatMap(lambda line: line.split()).collect()
print(words)

['Ravi', 'from', 'Bangalore', 'loved', 'the', 'delivery', 'Meena', 'from', 'Hyderabad', 'had', 'a', 'late', 'order', 'Ajay', 'from', 'Pune', 'liked', 'the', 'service', 'Anjali', 'from', 'Delhi', 'faced', 'UI', 'issues', 'Rohit', 'from', 'Mumbai', 'gave', 'positive', 'feedback']


Remove stop words ( from , the , etc.).

In [24]:
stop_words = {"from", "the", "a", "had", "gave", "and", "of"}

words = feedback.flatMap(lambda line: line.split())

filtered_words = words.filter(lambda word: word.lower() not in stop_words)

filtered_words.collect()


['Ravi',
 'Bangalore',
 'loved',
 'delivery',
 'Meena',
 'Hyderabad',
 'late',
 'order',
 'Ajay',
 'Pune',
 'liked',
 'service',
 'Anjali',
 'Delhi',
 'faced',
 'UI',
 'issues',
 'Rohit',
 'Mumbai',
 'positive',
 'feedback']

Count each word frequency using reduceByKey

In [25]:
word_pair = filtered_words.map(lambda word: (word.lower(),1))
countt = word_pair.reduceByKey(lambda a,b : a+b)
countt.collect()

[('loved', 1),
 ('liked', 1),
 ('service', 1),
 ('anjali', 1),
 ('faced', 1),
 ('issues', 1),
 ('rohit', 1),
 ('mumbai', 1),
 ('positive', 1),
 ('feedback', 1),
 ('ravi', 1),
 ('bangalore', 1),
 ('delivery', 1),
 ('meena', 1),
 ('hyderabad', 1),
 ('late', 1),
 ('order', 1),
 ('ajay', 1),
 ('pune', 1),
 ('delhi', 1),
 ('ui', 1)]

Find top 3 most frequent non-stop words.

In [26]:
top_3_words = countt.takeOrdered(3, key=lambda x: -x[1])
print(top_3_words)


[('loved', 1), ('liked', 1), ('service', 1)]


DataFrames & Transformation

In [29]:
students = [
("Amit", "10-A", 89),
("Kavya", "10-B", 92),
("Anjali", "10-A", 78),
("Rohit", "10-B", 85),
("Sneha", "10-C", 80)
]
columns = ["name", "section", "marks"]
df_students = spark.createDataFrame(students,columns)
attendance = [
("Amit", 24),
("Kavya", 22),
("Anjali", 20),
("Rohit", 25),
("Sneha", 19)
]
columns2 = ["name", "days_present"]
df_attendence = spark.createDataFrame(attendance,columns2)

df_students.show()
df_attendence.show()

+------+-------+-----+
|  name|section|marks|
+------+-------+-----+
|  Amit|   10-A|   89|
| Kavya|   10-B|   92|
|Anjali|   10-A|   78|
| Rohit|   10-B|   85|
| Sneha|   10-C|   80|
+------+-------+-----+

+------+------------+
|  name|days_present|
+------+------------+
|  Amit|          24|
| Kavya|          22|
|Anjali|          20|
| Rohit|          25|
| Sneha|          19|
+------+------------+



Join both DataFrames on name

In [32]:
joined = df_students.join(df_attendence, on='name')
joined.show()


+------+-------+-----+------------+
|  name|section|marks|days_present|
+------+-------+-----+------------+
|  Amit|   10-A|   89|          24|
|Anjali|   10-A|   78|          20|
| Kavya|   10-B|   92|          22|
| Rohit|   10-B|   85|          25|
| Sneha|   10-C|   80|          19|
+------+-------+-----+------------+



Create a new column: attendance_rate = days_present / 25 .

In [35]:
from pyspark.sql.functions import col
joined = joined.withColumn('attendence_rate',(col('days_present')/25) *100)
joined.show()

+------+-------+-----+------------+---------------+
|  name|section|marks|days_present|attendence_rate|
+------+-------+-----+------------+---------------+
|  Amit|   10-A|   89|          24|           96.0|
|Anjali|   10-A|   78|          20|           80.0|
| Kavya|   10-B|   92|          22|           88.0|
| Rohit|   10-B|   85|          25|          100.0|
| Sneha|   10-C|   80|          19|           76.0|
+------+-------+-----+------------+---------------+



Grade students using when :
A: >90, B: 80–90, C: <80.

In [37]:
from pyspark.sql.functions import when
joined = joined.withColumn('grade', when(col('marks')>90,'A')
                                      .when((col('marks')>=80)&(col('marks')<90),'B')
                                      .otherwise('C'))
joined.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendence_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           96.0|    B|
|Anjali|   10-A|   78|          20|           80.0|    C|
| Kavya|   10-B|   92|          22|           88.0|    A|
| Rohit|   10-B|   85|          25|          100.0|    B|
| Sneha|   10-C|   80|          19|           76.0|    B|
+------+-------+-----+------------+---------------+-----+



Filter students with good grades but poor attendance (<80%).

In [38]:
filtered = joined.filter((col('grade')!='C')&(col('attendence_rate')<80))
filtered.show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendence_rate|grade|
+-----+-------+-----+------------+---------------+-----+
|Sneha|   10-C|   80|          19|           76.0|    B|
+-----+-------+-----+------------+---------------+-----+



CSV & JSON

Read both formats into DataFrames.

In [40]:
csv_data = """emp_id,name,dept,city,salary
101,Anil,IT,Bangalore,80000
102,Kiran,HR,Mumbai,65000
103,Deepa,Finance,Chennai,72000"""

with open('employees.csv','w')as file:
  file.write(csv_data)

df_csv = spark.read.option("header", True).option("inferSchema", True).csv("employees.csv")
df_csv.show()

+------+-----+-------+---------+------+
|emp_id| name|   dept|     city|salary|
+------+-----+-------+---------+------+
|   101| Anil|     IT|Bangalore| 80000|
|   102|Kiran|     HR|   Mumbai| 65000|
|   103|Deepa|Finance|  Chennai| 72000|
+------+-----+-------+---------+------+



In [41]:
json_data = """
{
  "id": 201,
  "name": "Nandini",
  "contact": {
    "email": "nandi@example.com",
    "city": "Hyderabad"
  },
  "skills": ["Python", "Spark", "SQL"]
}
"""

with open("employee.json", "w") as f:
    f.write(json_data)

df_json = spark.read.option("multiline", True).json("employee.json")
df_json.printSchema()
df_json.show(truncate=False)

root
 |-- contact: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- email: string (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+------------------------------+---+-------+--------------------+
|contact                       |id |name   |skills              |
+------------------------------+---+-------+--------------------+
|{Hyderabad, nandi@example.com}|201|Nandini|[Python, Spark, SQL]|
+------------------------------+---+-------+--------------------+



Flatten nested JSON using select , col , alias , explode .

In [43]:
from pyspark.sql.functions import col,explode

flattened = df_json.select(
    col('id'),
    col('name'),
    col('contact.email').alias('email'),
    col('contact.city').alias('city'),
    explode(col('skills')).alias('skills')
)

flattened.show()

+---+-------+-----------------+---------+------+
| id|   name|            email|     city|skills|
+---+-------+-----------------+---------+------+
|201|Nandini|nandi@example.com|Hyderabad|Python|
|201|Nandini|nandi@example.com|Hyderabad| Spark|
|201|Nandini|nandi@example.com|Hyderabad|   SQL|
+---+-------+-----------------+---------+------+



Save both as Parquet files partitioned by city.

In [44]:
df_csv.write.mode("overwrite").partitionBy("city").parquet("output/employees_csv")
flattened.write.mode("overwrite").partitionBy("city").parquet("output/employees_json")

In [45]:
import shutil
from google.colab import files

shutil.make_archive("employees_csv", 'zip', "output/employees_csv")
files.download("employees_csv.zip")

shutil.make_archive("employees_json", 'zip', "output/employees_json")
files.download("employees_json.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Spark SQL

Register the students DataFrame as students_view

In [46]:
joined.show()

+------+-------+-----+------------+---------------+-----+
|  name|section|marks|days_present|attendence_rate|grade|
+------+-------+-----+------------+---------------+-----+
|  Amit|   10-A|   89|          24|           96.0|    B|
|Anjali|   10-A|   78|          20|           80.0|    C|
| Kavya|   10-B|   92|          22|           88.0|    A|
| Rohit|   10-B|   85|          25|          100.0|    B|
| Sneha|   10-C|   80|          19|           76.0|    B|
+------+-------+-----+------------+---------------+-----+



In [47]:
joined.createOrReplaceTempView('students_view')


Average marks per section

In [48]:
spark.sql("""
select section, avg(marks)
from students_view
group by section""").show()

+-------+----------+
|section|avg(marks)|
+-------+----------+
|   10-C|      80.0|
|   10-A|      83.5|
|   10-B|      88.5|
+-------+----------+



Top scorer in each section

In [50]:
spark.sql("""
    SELECT section, name, marks
    FROM (
        SELECT *, ROW_NUMBER() OVER (PARTITION BY section ORDER BY marks DESC) as rn
        FROM students_view
    ) WHERE rn = 1
""").show()


+-------+-----+-----+
|section| name|marks|
+-------+-----+-----+
|   10-A| Amit|   89|
|   10-B|Kavya|   92|
|   10-C|Sneha|   80|
+-------+-----+-----+



Count of students in each grade category

In [51]:
spark.sql("""
select grade, count(*)
from students_view
group by grade""").show()

+-----+--------+
|grade|count(1)|
+-----+--------+
|    B|       3|
|    C|       1|
|    A|       1|
+-----+--------+



Students with marks above class average

In [52]:
spark.sql("""
select *
from students_view
where marks > (select avg(marks) from students_view)""").show()

+-----+-------+-----+------------+---------------+-----+
| name|section|marks|days_present|attendence_rate|grade|
+-----+-------+-----+------------+---------------+-----+
| Amit|   10-A|   89|          24|           96.0|    B|
|Kavya|   10-B|   92|          22|           88.0|    A|
|Rohit|   10-B|   85|          25|          100.0|    B|
+-----+-------+-----+------------+---------------+-----+



Attendance-adjusted performance

In [56]:
spark.sql("""
select * , ((marks * attendence_rate) /100) as adjusted_perf
from students_view
order by adjusted_perf desc""").show()

+------+-------+-----+------------+---------------+-----+-------------+
|  name|section|marks|days_present|attendence_rate|grade|adjusted_perf|
+------+-------+-----+------------+---------------+-----+-------------+
|  Amit|   10-A|   89|          24|           96.0|    B|        85.44|
| Rohit|   10-B|   85|          25|          100.0|    B|         85.0|
| Kavya|   10-B|   92|          22|           88.0|    A|        80.96|
|Anjali|   10-A|   78|          20|           80.0|    C|         62.4|
| Sneha|   10-C|   80|          19|           76.0|    B|         60.8|
+------+-------+-----+------------+---------------+-----+-------------+



Full Load

In [58]:
joined.write.mode('overwrite').partitionBy("section").parquet("output/students/")

Incremental Load

In [59]:
incremental = [("Tejas", "10-A", 91)]
df_inc = spark.createDataFrame(incremental, ["name", "section", "marks"])
df_inc.write.mode("append").partitionBy("section").parquet("output/students/")

List files in output/students/ using Python.

In [60]:
!ls output/students/

'section=10-A'	'section=10-B'	'section=10-C'	 _SUCCESS


In [61]:
df_all = spark.read.parquet('output/students/')
df_all.show()

+------+-----+------------+---------------+-----+-------+
|  name|marks|days_present|attendence_rate|grade|section|
+------+-----+------------+---------------+-----+-------+
|  Amit|   89|          24|           96.0|    B|   10-A|
|Anjali|   78|          20|           80.0|    C|   10-A|
| Kavya|   92|          22|           88.0|    A|   10-B|
| Rohit|   85|          25|          100.0|    B|   10-B|
| Sneha|   80|          19|           76.0|    B|   10-C|
| Tejas|   91|        NULL|           NULL| NULL|   10-A|
+------+-----+------------+---------------+-----+-------+



Read only partition 10-A and list students.

In [63]:
df_10a = spark.read.parquet('output/students/section=10-A')
df_10a.show()

+------+-----+------------+---------------+-----+
|  name|marks|days_present|attendence_rate|grade|
+------+-----+------------+---------------+-----+
|  Amit|   89|          24|           96.0|    B|
|Anjali|   78|          20|           80.0|    C|
| Tejas|   91|        NULL|           NULL| NULL|
+------+-----+------------+---------------+-----+



Compare before/after counts for section 10-A

In [65]:
df_students.filter(df_students.section == '10-A').count()
upd = spark.read.parquet('output/students/')
upd.filter(upd.section == '10-A').count()

3

In [66]:
import shutil
from google.colab import files

shutil.make_archive("students_parquet", 'zip', "output/students/")

files.download("students_parquet.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ETL Pipeline

Load CSV with inferred schema.

In [67]:
data = """
emp_id,name,dept,salary,bonus
1,Arjun,IT,75000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,68000,4000
4,Ramesh,Sales,58000,
"""

with open('etl.csv','w') as file:
  file.write(data)

In [69]:
df_csv = spark.read.option('header','true').option('inferschema','true').csv('etl.csv')
df_csv.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| NULL|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| NULL|
+------+------+-------+------+-----+



Fill null bonuses with 2000 .

In [70]:
df_csv = df_csv.fillna({'bonus':2000})
df_csv.show()

+------+------+-------+------+-----+
|emp_id|  name|   dept|salary|bonus|
+------+------+-------+------+-----+
|     1| Arjun|     IT| 75000| 5000|
|     2| Kavya|     HR| 62000| 2000|
|     3| Sneha|Finance| 68000| 4000|
|     4|Ramesh|  Sales| 58000| 2000|
+------+------+-------+------+-----+



Create total_ctc = salary + bonus .

In [71]:
from pyspark.sql.functions import expr
df_csv = df_csv.withColumn('total_ctc',expr('salary+bonus'))
df_csv.show()

+------+------+-------+------+-----+---------+
|emp_id|  name|   dept|salary|bonus|total_ctc|
+------+------+-------+------+-----+---------+
|     1| Arjun|     IT| 75000| 5000|    80000|
|     2| Kavya|     HR| 62000| 2000|    64000|
|     3| Sneha|Finance| 68000| 4000|    72000|
|     4|Ramesh|  Sales| 58000| 2000|    60000|
+------+------+-------+------+-----+---------+



Filter employees with total_ctc > 65000

In [73]:
filter = df_csv.filter(col('total_ctc')>65000)
filter.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 75000| 5000|    80000|
|     3|Sneha|Finance| 68000| 4000|    72000|
+------+-----+-------+------+-----+---------+



Save result

In [76]:
filter.write.mode("overwrite").partitionBy("dept").parquet("/tmp/final_employees_parquet/")
filter.write.mode("overwrite").json("/tmp/final_employees_json/")

!zip -r /content/final_employees_parquet.zip /tmp/final_employees_parquet/
!zip -r /content/final_employees_json.zip /tmp/final_employees_json/

from google.colab import files
files.download("/content/final_employees_parquet.zip")
files.download("/content/final_employees_json.zip")

updating: tmp/final_employees_parquet/ (stored 0%)
updating: tmp/final_employees_parquet/._SUCCESS.crc (stored 0%)
updating: tmp/final_employees_parquet/_SUCCESS (stored 0%)
  adding: tmp/final_employees_parquet/dept=IT/ (stored 0%)
  adding: tmp/final_employees_parquet/dept=IT/part-00000-ef3df3f9-8835-4b2d-b61c-4c749c819f60.c000.snappy.parquet (deflated 51%)
  adding: tmp/final_employees_parquet/dept=IT/.part-00000-ef3df3f9-8835-4b2d-b61c-4c749c819f60.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/final_employees_parquet/dept=Finance/ (stored 0%)
  adding: tmp/final_employees_parquet/dept=Finance/part-00000-ef3df3f9-8835-4b2d-b61c-4c749c819f60.c000.snappy.parquet (deflated 51%)
  adding: tmp/final_employees_parquet/dept=Finance/.part-00000-ef3df3f9-8835-4b2d-b61c-4c749c819f60.c000.snappy.parquet.crc (stored 0%)
updating: tmp/final_employees_json/ (stored 0%)
updating: tmp/final_employees_json/._SUCCESS.crc (stored 0%)
updating: tmp/final_employees_json/_SUCCESS (stored 0%)
  adding

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>