Setup Spark:

In [None]:
from pyspark.sql import SparkSession

In [None]:
spark = SparkSession.builder.appName('BotCampus Intermediate Session').master('local[*]').getOrCreate()

Load starter data:

In [None]:
data = [("Ananya", "Bangalore", 24),
("Ravi", "Hyderabad", 28),
("Kavya", "Delhi", 22),
("Meena", "Chennai", 25)]
columns = ["name", "city", "age"]

In [None]:
df = spark.createDataFrame(data,columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



In [None]:
feedback = spark.sparkContext.parallelize([
"Ravi from Bangalore loved the mobile app",
"Meena from Delhi reported poor response time",
"Ajay from Pune liked the delivery speed",
"Ananya from Hyderabad had an issue with UI",
"Rohit from Mumbai gave positive feedback"
])

Count total number of words.

In [None]:
words_count = feedback.flatMap(lambda line: line.split()).count()
print("TOTAL NO.OF WORDS: ",words_count)

TOTAL NO.OF WORDS:  35


Find top 3 most common words.

In [None]:
word_count = feedback.flatMap(lambda line : line.split())
words = word_count.map(lambda word: word.lower())
word_pairs = words.map(lambda word: (word, 1))
count = word_pairs.reduceByKey(lambda a,b : a + b)
topp = count.takeOrdered(3, key=lambda x: -x[1])
print(topp)

[('from', 5), ('the', 2), ('loved', 1)]


Remove stop words ( from , with , the , etc.).

In [None]:
stop_words = {"from", "with", "the", "an", "and", "had", "gave"}
filtered = feedback.map(lambda line: ' '
                        .join([word.lower() for word in line.split()
                        if word.lower() not in stop_words]) )
filtered.collect()

['ravi bangalore loved mobile app',
 'meena delhi reported poor response time',
 'ajay pune liked delivery speed',
 'ananya hyderabad issue ui',
 'rohit mumbai positive feedback']

Create a dictionary of word → count.

In [36]:
word_count = (feedback
              .flatMap(lambda line: line.split())
              .map(lambda line: (line.lower(),1))
              .reduceByKey(lambda a,b : a+b))
dictt = dict(word_count.collect())
print(dictt)

{'from': 5, 'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'an': 1, 'issue': 1, 'with': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'the': 2, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'had': 1, 'ui': 1, 'gave': 1}


DataFrames – Transformations

Create exam_scores DataFrame:

In [60]:
scores = [
("Ravi", "Math", 88),
("Ananya", "Science", 92),
("Kavya", "English", 79),
("Ravi", "English", 67),
("Neha", "Math", 94),
("Meena", "Science", 85)
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, columns)
df_scores.show()

+------+-------+-----+
|  name|subject|score|
+------+-------+-----+
|  Ravi|   Math|   88|
|Ananya|Science|   92|
| Kavya|English|   79|
|  Ravi|English|   67|
|  Neha|   Math|   94|
| Meena|Science|   85|
+------+-------+-----+



Add grade column ( >=90 → A, 80-89 → B, 70-79 → C, else D).

In [61]:
from pyspark.sql.functions import when
df_scores = df_scores.withColumn('Grade',
                             when (df_scores.score >= 90, 'A')
                             .when((df_scores.score>= 80)&(df_scores.score<=89),'B')
                              .when((df_scores.score>= 70)&(df_scores.score<=79),'C')
                              .otherwise ('D'))

df_scores.show()

+------+-------+-----+-----+
|  name|subject|score|Grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



Group by subject, find average score.

In [62]:
graded = df_scores.groupBy('subject').avg('score')
graded.show()

+-------+----------+
|subject|avg(score)|
+-------+----------+
|Science|      88.5|
|   Math|      91.0|
|English|      73.0|
+-------+----------+



Use when and otherwise to classify subject difficulty ( Math/Science =Difficult).

In [63]:
difficult = {'Science','Math'}
df_scores = df_scores.withColumn('Difficulty level',when (df_scores.subject .isin (*difficult) , 'Difficult').otherwise('Easy'))
df_scores.show()

+------+-------+-----+-----+----------------+
|  name|subject|score|Grade|Difficulty level|
+------+-------+-----+-----+----------------+
|  Ravi|   Math|   88|    B|       Difficult|
|Ananya|Science|   92|    A|       Difficult|
| Kavya|English|   79|    C|            Easy|
|  Ravi|English|   67|    D|            Easy|
|  Neha|   Math|   94|    A|       Difficult|
| Meena|Science|   85|    B|       Difficult|
+------+-------+-----+-----+----------------+



Rank students per subject using Window function.

In [64]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
windoww = Window.partitionBy('Subject').orderBy(df_scores.score.desc())
rankk = df_scores.withColumn('Rank',rank().over(windoww))
rankk.show()

+------+-------+-----+-----+----------------+----+
|  name|subject|score|Grade|Difficulty level|Rank|
+------+-------+-----+-----+----------------+----+
| Kavya|English|   79|    C|            Easy|   1|
|  Ravi|English|   67|    D|            Easy|   2|
|  Neha|   Math|   94|    A|       Difficult|   1|
|  Ravi|   Math|   88|    B|       Difficult|   2|
|Ananya|Science|   92|    A|       Difficult|   1|
| Meena|Science|   85|    B|       Difficult|   2|
+------+-------+-----+-----+----------------+----+



Apply UDF to format names (e.g., make all uppercase).

In [65]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
def to_upper(name):
  return name.upper()

upperr = udf(to_upper,StringType())
df_scores = df_scores.withColumn('NAME',upperr(df_scores.name))
df_scores.show()

+------+-------+-----+-----+----------------+
|  NAME|subject|score|Grade|Difficulty level|
+------+-------+-----+-----+----------------+
|  RAVI|   Math|   88|    B|       Difficult|
|ANANYA|Science|   92|    A|       Difficult|
| KAVYA|English|   79|    C|            Easy|
|  RAVI|English|   67|    D|            Easy|
|  NEHA|   Math|   94|    A|       Difficult|
| MEENA|Science|   85|    B|       Difficult|
+------+-------+-----+-----+----------------+



CSV & JSON

CSV file: students.csv

In [66]:
csv_data = [
    (1, "Amit", "IT", "Bangalore", 78000),
    (2, "Kavya", "HR", "Chennai", 62000),
    (3, "Arjun", "Finance", "Hyderabad", 55000)
]

csv_columns = ["id", "name", "department", "city", "salary"]

df_csv = spark.createDataFrame(csv_data, csv_columns)

df_csv.write.mode("overwrite").option("header", "true").csv("students.csv")

JSON file employee_nested.json

In [67]:
import json
json_data = [
    {
        "id": 101,
        "name": "Sneha",
        "address": {"city": "Mumbai", "pincode": 400001},
        "skills": ["Python", "Spark"]
    }
]
with open("employee_nested.json", "w") as f:
    json.dump(json_data, f, indent=5)

Load both datasets into PySpark.

In [68]:
df_csv = spark.read.option('header','true').option('inferSchema','true').csv('students.csv')

df_json =spark.read.option('multiline','true').json('employee_nested.json')

Print schema and infer nested structure.

In [69]:
df_csv.printSchema()
df_csv.show()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: integer (nullable = true)

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
|  1| Amit|        IT|Bangalore| 78000|
+---+-----+----------+---------+------+



In [71]:
df_json.printSchema()
df_json.show(truncate= False)

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)

+----------------+---+-----+---------------+
|address         |id |name |skills         |
+----------------+---+-----+---------------+
|{Mumbai, 400001}|101|Sneha|[Python, Spark]|
+----------------+---+-----+---------------+



Flatten the JSON (use explode , select , alias ).

In [72]:
from pyspark.sql.functions import col,explode

flattened = df_json.select(
    'id','name',col('address.city').alias('city'), col('address.pincode').alias('PIN') , explode('skills').alias('skill')
)
flattened.show()

+---+-----+------+------+------+
| id| name|  city|   PIN| skill|
+---+-----+------+------+------+
|101|Sneha|Mumbai|400001|Python|
|101|Sneha|Mumbai|400001| Spark|
+---+-----+------+------+------+



Convert both to Parquet and write to /tmp/output

In [73]:
df_csv.write.mode('overwrite').parquet('/tmp/output/students_parquet')
df_json.write.mode('overwrite').parquet('/tmp/output/employee_parquet')

In [74]:
!ls

employee_nested.json  sample_data  students.csv


In [78]:
from google.colab import files
import shutil, glob

csv_part = glob.glob("students.csv/part-*.csv")[0]
shutil.copy(csv_part, "students_csv.csv")

files.download("students_csv.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [77]:
import zipfile
import os

def zip_parquet(source_dir, zip_name):
    with zipfile.ZipFile(zip_name, 'w') as zipf:
        for foldername, _, filenames in os.walk(source_dir):
            for filename in filenames:
                file_path = os.path.join(foldername, filename)
                zipf.write(file_path, os.path.relpath(file_path, source_dir))

zip_parquet("/tmp/output/students_parquet", "/tmp/students_parquet.zip")
files.download("/tmp/students_parquet.zip")

zip_parquet("/tmp/output/employee_parquet", "/tmp/employee_parquet.zip")
files.download("/tmp/employee_parquet.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Spark SQL

Create view from exam scores

In [79]:
df_scores.show()

+------+-------+-----+-----+----------------+
|  NAME|subject|score|Grade|Difficulty level|
+------+-------+-----+-----+----------------+
|  RAVI|   Math|   88|    B|       Difficult|
|ANANYA|Science|   92|    A|       Difficult|
| KAVYA|English|   79|    C|            Easy|
|  RAVI|English|   67|    D|            Easy|
|  NEHA|   Math|   94|    A|       Difficult|
| MEENA|Science|   85|    B|       Difficult|
+------+-------+-----+-----+----------------+



In [80]:
df_scores.createOrReplaceTempView('exam_scores')

Top scorer per subject

In [83]:
spark.sql("""
          select subject,name,score
          from(select * , row_number() over (partition by subject order by score desc) as rn
          from exam_scores)
          where rn = 1""").show()

+-------+------+-----+
|subject|  name|score|
+-------+------+-----+
|English| KAVYA|   79|
|   Math|  NEHA|   94|
|Science|ANANYA|   92|
+-------+------+-----+



Count of students per grade

In [85]:
spark.sql("""
select grade, count(*) as count
from exam_scores
group by grade
order by count(*) desc""").show()

+-----+-----+
|grade|count|
+-----+-----+
|    B|    2|
|    A|    2|
|    C|    1|
|    D|    1|
+-----+-----+



Students with multiple subjects

In [87]:
spark.sql("""
select name, count(distinct subject) as subject
from exam_scores
group by name
having count(distinct subject) >1""").show()

+----+-------+
|name|subject|
+----+-------+
|RAVI|      2|
+----+-------+



Subjects with average score above 85

In [90]:
spark.sql("""
select subject,avg(score) as avg
from exam_scores
group by subject
having avg(score)>85
"""
).show()

+-------+----+
|subject| avg|
+-------+----+
|Science|88.5|
|   Math|91.0|
+-------+----+



another DataFrame attendance(name, days_present)

In [91]:
attendance_data = [
    ("RAVI", 18),
    ("ANANYA", 22),
    ("KAVYA", 20),
    ("NEHA", 25),
    ("MEENA", 19)
]

columns = ["NAME", "days_present"]
df_attendance = spark.createDataFrame(attendance_data, columns)
df_attendance.show()


+------+------------+
|  NAME|days_present|
+------+------------+
|  RAVI|          18|
|ANANYA|          22|
| KAVYA|          20|
|  NEHA|          25|
| MEENA|          19|
+------+------------+



Join with scores

In [92]:
joined = df_scores.join(df_attendance, on='NAME', how='left')
joined.show()

+------+-------+-----+-----+----------------+------------+
|  NAME|subject|score|Grade|Difficulty level|days_present|
+------+-------+-----+-----+----------------+------------+
|  RAVI|   Math|   88|    B|       Difficult|          18|
|ANANYA|Science|   92|    A|       Difficult|          22|
| KAVYA|English|   79|    C|            Easy|          20|
|  NEHA|   Math|   94|    A|       Difficult|          25|
| MEENA|Science|   85|    B|       Difficult|          19|
|  RAVI|English|   67|    D|            Easy|          18|
+------+-------+-----+-----+----------------+------------+



Calculate attendance-adjusted grade:

In [94]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def downgrade_grade(grade, days):
    if days < 20:
        downgrade_map = {'A': 'B', 'B': 'C', 'C': 'D', 'D': 'D','D':'E'}
        return downgrade_map.get(grade, grade)
    else:
        return grade

adjusted_udf = udf(downgrade_grade, StringType())

adjusted = joined.withColumn("Adjusted_Grade", adjusted_udf("Grade", "days_present"))
adjusted.select("NAME", "subject", "score", "Grade", "days_present", "Adjusted_Grade").show()


+------+-------+-----+-----+------------+--------------+
|  NAME|subject|score|Grade|days_present|Adjusted_Grade|
+------+-------+-----+-----+------------+--------------+
|  RAVI|   Math|   88|    B|          18|             C|
|ANANYA|Science|   92|    A|          22|             A|
| KAVYA|English|   79|    C|          20|             C|
|  NEHA|   Math|   94|    A|          25|             A|
| MEENA|Science|   85|    B|          19|             C|
|  RAVI|English|   67|    D|          18|             E|
+------+-------+-----+-----+------------+--------------+



Partitioned Load (Full + Incremental)

Initial Load:

In [95]:
df_scores.write.partitionBy("subject").parquet("/tmp/scores/")

Incremental Load:

In [97]:
incremental = [("Meena", "Math", 93)]
columns = ["NAME", "subject", "score"]
df_inc = spark.createDataFrame(incremental, columns)
df_inc.write.mode("append").partitionBy("subject").parquet("/tmp/scores/")

List all folders inside /tmp/scores/

In [98]:
!ls /tmp/scores

'subject=English'  'subject=Math'  'subject=Science'   _SUCCESS


Read only Math partition and display all entries.

In [99]:
df_math = spark.read.parquet("/tmp/scores/subject=Math/")
df_math.show()

+-----+-----+-----+----------------+
| NAME|score|Grade|Difficulty level|
+-----+-----+-----+----------------+
| RAVI|   88|    B|       Difficult|
| NEHA|   94|    A|       Difficult|
|Meena|   93| NULL|            NULL|
+-----+-----+-----+----------------+



In [100]:
!zip -r /content/scores_partitioned.zip /tmp/scores/
from google.colab import files
files.download("/content/scores_partitioned.zip")

  adding: tmp/scores/ (stored 0%)
  adding: tmp/scores/._SUCCESS.crc (stored 0%)
  adding: tmp/scores/subject=Science/ (stored 0%)
  adding: tmp/scores/subject=Science/part-00000-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet (deflated 48%)
  adding: tmp/scores/subject=Science/.part-00000-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/scores/subject=Science/part-00001-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet (deflated 48%)
  adding: tmp/scores/subject=Science/.part-00001-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/scores/subject=English/ (stored 0%)
  adding: tmp/scores/subject=English/part-00000-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet (deflated 47%)
  adding: tmp/scores/subject=English/.part-00000-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.snappy.parquet.crc (stored 0%)
  adding: tmp/scores/subject=English/part-00001-ec5f6920-3770-4dbb-8ac3-e3134249f839.c000.sna

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

ETL: Clean, Transform, Load

Raw CSV:

In [101]:
csv_data = """emp_id,name,dept,salary,bonus
1,Arjun,IT,78000,5000
2,Kavya,HR,62000,
3,Sneha,Finance,55000,3000"""

with open("/content/employeesp6.csv", "w") as f:
    f.write(csv_data)

Load data with header.

In [102]:
df_csvp6 = spark.read.option('header','true').option('inferschema','true').csv('employeesp6.csv')
df_csvp6.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| NULL|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



Fill missing bonus with 2000.

In [103]:
df_csvp6 = df_csvp6.fillna({'bonus':2000})
df_csvp6.show()

+------+-----+-------+------+-----+
|emp_id| name|   dept|salary|bonus|
+------+-----+-------+------+-----+
|     1|Arjun|     IT| 78000| 5000|
|     2|Kavya|     HR| 62000| 2000|
|     3|Sneha|Finance| 55000| 3000|
+------+-----+-------+------+-----+



Calculate total_ctc = salary + bonus

In [106]:
from pyspark.sql.functions import expr
df_csvp6 = df_csvp6.withColumn('total_ctc',expr('salary+bonus'))
df_csvp6.show()

+------+-----+-------+------+-----+---------+
|emp_id| name|   dept|salary|bonus|total_ctc|
+------+-----+-------+------+-----+---------+
|     1|Arjun|     IT| 78000| 5000|    83000|
|     2|Kavya|     HR| 62000| 2000|    64000|
|     3|Sneha|Finance| 55000| 3000|    58000|
+------+-----+-------+------+-----+---------+



Filter where total_ctc > 60,000.

In [107]:
filter = df_csvp6.filter(col('total_ctc')>60000)
filter.show()

+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



Save final DataFrame to Parquet and JSON.

In [108]:
filter.write.mode("overwrite").parquet("/tmp/final_employees_parquet/")
filter.write.mode("overwrite").json("/tmp/final_employees_json/")


In [109]:
!zip -r /content/final_employees_parquet.zip /tmp/final_employees_parquet/
!zip -r /content/final_employees_json.zip /tmp/final_employees_json/

from google.colab import files
files.download("/content/final_employees_parquet.zip")
files.download("/content/final_employees_json.zip")

  adding: tmp/final_employees_parquet/ (stored 0%)
  adding: tmp/final_employees_parquet/._SUCCESS.crc (stored 0%)
  adding: tmp/final_employees_parquet/_SUCCESS (stored 0%)
  adding: tmp/final_employees_parquet/part-00000-52808c50-fdac-4047-a5f8-3ddda0df661b-c000.snappy.parquet (deflated 51%)
  adding: tmp/final_employees_parquet/.part-00000-52808c50-fdac-4047-a5f8-3ddda0df661b-c000.snappy.parquet.crc (stored 0%)
  adding: tmp/final_employees_json/ (stored 0%)
  adding: tmp/final_employees_json/._SUCCESS.crc (stored 0%)
  adding: tmp/final_employees_json/.part-00000-47695477-2d9c-4dc1-8046-29c9cee78be7-c000.json.crc (stored 0%)
  adding: tmp/final_employees_json/part-00000-47695477-2d9c-4dc1-8046-29c9cee78be7-c000.json (deflated 36%)
  adding: tmp/final_employees_json/_SUCCESS (stored 0%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>