In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) a local Spark session that uses every available core.
spark = (
    SparkSession.builder
    .appName("BotCampus Intermediate Session")
    .master("local[*]")
    .getOrCreate()
)



In [2]:
# Small in-memory sample: one (name, city, age) tuple per person.
data = [
    ("Ananya", "Bangalore", 24),
    ("Ravi", "Hyderabad", 28),
    ("Kavya", "Delhi", 22),
    ("Meena", "Chennai", 25),
]
columns = ["name", "city", "age"]
df = spark.createDataFrame(data, schema=columns)
df.show()

+------+---------+---+
|  name|     city|age|
+------+---------+---+
|Ananya|Bangalore| 24|
|  Ravi|Hyderabad| 28|
| Kavya|    Delhi| 22|
| Meena|  Chennai| 25|
+------+---------+---+



In [4]:
# Free-text feedback lines, distributed as an RDD of strings.
feedback = spark.sparkContext.parallelize(
    [
        "Ravi from Bangalore loved the mobile app",
        "Meena from Delhi reported poor response time",
        "Ajay from Pune liked the delivery speed",
        "Ananya from Hyderabad had an issue with UI",
        "Rohit from Mumbai gave positive feedback",
    ]
)

In [8]:
# Split each line on whitespace and count every resulting token.
tokens = feedback.flatMap(lambda text: text.split())
word_count = tokens.count()
print("Total words:", word_count)

Total words: 35


In [13]:
from collections import Counter
# Three most frequent lower-cased words, ignoring a few filler words.
# NOTE(review): every word in this sample occurs once, so which three are
# returned depends on how takeOrdered resolves ties.
ignored_words = ['from', 'with', 'the', 'an']
lowered_words = feedback.flatMap(lambda text: text.lower().split())
kept_words = lowered_words.filter(lambda token: token not in ignored_words)
word_totals = kept_words.map(lambda token: (token, 1)).reduceByKey(lambda left, right: left + right)
top_words = word_totals.takeOrdered(3, key=lambda pair: -pair[1])
print("Top 3 words:", top_words)

Top 3 words: [('loved', 1), ('app', 1), ('poor', 1)]


In [19]:
# Collect the per-word counts (filler words removed) into a driver-side dict.
word_pairs = (
    feedback
    .flatMap(lambda text: text.lower().split())
    .filter(lambda token: token not in ['from', 'with', 'the', 'an'])
    .map(lambda token: (token, 1))
)
word_dict = word_pairs.reduceByKey(lambda left, right: left + right).collectAsMap()
print("Word count dictionary:", word_dict)

Word count dictionary: {'loved': 1, 'app': 1, 'poor': 1, 'response': 1, 'liked': 1, 'speed': 1, 'ananya': 1, 'issue': 1, 'rohit': 1, 'mumbai': 1, 'positive': 1, 'feedback': 1, 'ravi': 1, 'bangalore': 1, 'mobile': 1, 'meena': 1, 'delhi': 1, 'reported': 1, 'time': 1, 'ajay': 1, 'pune': 1, 'delivery': 1, 'hyderabad': 1, 'had': 1, 'ui': 1, 'gave': 1}


In [24]:
# Exam results: one (name, subject, score) tuple per exam taken.
scores = [
    ("Ravi", "Math", 88),
    ("Ananya", "Science", 92),
    ("Kavya", "English", 79),
    ("Ravi", "English", 67),
    ("Neha", "Math", 94),
    ("Meena", "Science", 85),
]
columns = ["name", "subject", "score"]
df_scores = spark.createDataFrame(scores, schema=columns)

In [29]:
from pyspark.sql.functions import when
# Letter grade from score: 90+ -> A, 80-89 -> B, 70-79 -> C, everything else D.
# Branches are evaluated in order, so each one only needs its lower bound.
score_col = df_scores["score"]
grade_expr = (
    when(score_col >= 90, "A")
    .when(score_col >= 80, "B")
    .when(score_col >= 70, "C")
    .otherwise("D")
)
df_scores = df_scores.withColumn("grade", grade_expr)

In [37]:
# Display the scores together with the newly added grade column.
df_scores.show()

+------+-------+-----+-----+
|  name|subject|score|grade|
+------+-------+-----+-----+
|  Ravi|   Math|   88|    B|
|Ananya|Science|   92|    A|
| Kavya|English|   79|    C|
|  Ravi|English|   67|    D|
|  Neha|   Math|   94|    A|
| Meena|Science|   85|    B|
+------+-------+-----+-----+



In [38]:
# Mean score for each subject.
subject_groups = df_scores.groupBy("subject")
subject_groups.agg({"score": "avg"}).show()

+-------+----------+
|subject|avg(score)|
+-------+----------+
|Science|      88.5|
|   Math|      91.0|
|English|      73.0|
+-------+----------+



In [42]:
# Tag Math and Science rows as "Difficult"; every other subject is "Easy".
is_hard_subject = df_scores["subject"].isin(["Math", "Science"])
difficulty_expr = when(is_hard_subject, "Difficult").otherwise("Easy")
df_scores = df_scores.withColumn("difficulty", difficulty_expr)

In [45]:
# Display the scores together with the difficulty column.
df_scores.show()

+------+-------+-----+-----+----------+
|  name|subject|score|grade|difficulty|
+------+-------+-----+-----+----------+
|  Ravi|   Math|   88|    B| Difficult|
|Ananya|Science|   92|    A| Difficult|
| Kavya|English|   79|    C|      Easy|
|  Ravi|English|   67|    D|      Easy|
|  Neha|   Math|   94|    A| Difficult|
| Meena|Science|   85|    B| Difficult|
+------+-------+-----+-----+----------+



In [47]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
# Rank students within each subject, highest score first (ties share a rank).
score_desc = df_scores["score"].desc()
window_spec = Window.partitionBy("subject").orderBy(score_desc)
rank_in_subject = rank().over(window_spec)
df_scores = df_scores.withColumn("rank", rank_in_subject)

In [48]:
# Display the scores together with the per-subject rank column.
df_scores.show()

+------+-------+-----+-----+----------+----+
|  name|subject|score|grade|difficulty|rank|
+------+-------+-----+-----+----------+----+
| Kavya|English|   79|    C|      Easy|   1|
|  Ravi|English|   67|    D|      Easy|   2|
|  Neha|   Math|   94|    A| Difficult|   1|
|  Ravi|   Math|   88|    B| Difficult|   2|
|Ananya|Science|   92|    A| Difficult|   1|
| Meena|Science|   85|    B| Difficult|   2|
+------+-------+-----+-----+----------+----+



In [56]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def format_name(name):
    """Return ``name`` converted to upper case.

    Intended to be wrapped as a Spark UDF, where a null column value
    arrives as ``None``. ``None`` is passed through unchanged instead of
    raising ``AttributeError`` from ``None.upper()``.

    Args:
        name: The string to convert, or ``None``.

    Returns:
        The upper-cased string, or ``None`` when ``name`` is ``None``.
    """
    if name is None:
        return None
    return name.upper()

# Wrap format_name as a string-returning UDF and apply it to the name column.
format_name_udf = udf(f=format_name, returnType=StringType())
upper_name = format_name_udf(df_scores["name"])
df_scores = df_scores.withColumn("formatted_name", upper_name)
df_scores.show()

+------+-------+-----+-----+----------+----+--------------+
|  name|subject|score|grade|difficulty|rank|formatted_name|
+------+-------+-----+-----+----------+----+--------------+
| Kavya|English|   79|    C|      Easy|   1|         KAVYA|
|  Ravi|English|   67|    D|      Easy|   2|          RAVI|
|  Neha|   Math|   94|    A| Difficult|   1|          NEHA|
|  Ravi|   Math|   88|    B| Difficult|   2|          RAVI|
|Ananya|Science|   92|    A| Difficult|   1|        ANANYA|
| Meena|Science|   85|    B| Difficult|   2|         MEENA|
+------+-------+-----+-----+----------+----+--------------+



In [57]:
import pandas as pd


# Column-oriented sample of three students, saved to students.csv
# (no index column) for the Spark CSV read that follows.
students_data = dict(
    id=[1, 2, 3],
    name=["Amit", "Kavya", "Arjun"],
    department=["IT", "HR", "Finance"],
    city=["Bangalore", "Chennai", "Hyderabad"],
    salary=[78000, 62000, 55000],
)

df = pd.DataFrame.from_dict(students_data)
df.to_csv("students.csv", index=False)

In [58]:
import json

# One employee record with a nested address object and a list of skills,
# written to employee_nested.json as pretty-printed (4-space indent) JSON.
employee_data = dict(
    id=101,
    name="Sneha",
    address=dict(city="Mumbai", pincode=400001),
    skills=["Python", "Spark", "SQL"],
)

with open("employee_nested.json", "w") as f:
    f.write(json.dumps(employee_data, indent=4))

In [60]:
# Load students.csv using its first line as the header. No schema is given
# or inferred, so every column is read as a string.
students_df = spark.read.csv("students.csv", header=True)
students_df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- city: string (nullable = true)
 |-- salary: string (nullable = true)



In [61]:
# Display the rows loaded from students.csv.
students_df.show()

+---+-----+----------+---------+------+
| id| name|department|     city|salary|
+---+-----+----------+---------+------+
|  1| Amit|        IT|Bangalore| 78000|
|  2|Kavya|        HR|  Chennai| 62000|
|  3|Arjun|   Finance|Hyderabad| 55000|
+---+-----+----------+---------+------+



In [62]:
# The JSON document spans several lines (it was written with indent=4),
# so it must be read in multi-line mode.
employees_df = spark.read.json("employee_nested.json", multiLine=True)
employees_df.printSchema()

root
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- pincode: long (nullable = true)
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- skills: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [65]:
from pyspark.sql.functions import explode, col
# Flatten the nested record: lift the address fields to top-level columns
# and emit one output row per entry in the skills array.
flat_columns = [
    col("id"),
    col("name"),
    col("address.city").alias("city"),
    col("address.pincode").alias("pincode"),
    explode(col("skills")).alias("skill"),
]
flattened_df = employees_df.select(*flat_columns)

In [66]:
# Persist both frames as Parquet, replacing whatever a previous run left.
students_df.write.mode("overwrite").parquet("/tmp/output/students")
flattened_df.write.mode("overwrite").parquet("/tmp/output/employees")

In [67]:
# Register df_scores as the temp view "exam_scores" so it can be queried with spark.sql.
df_scores.createOrReplaceTempView("exam_scores")


In [71]:
# Highest score for each (subject, name) pair, listed subject by subject
# with the best score first.
top_score_query = """
SELECT subject, name, MAX(score) AS top_score
FROM exam_scores
GROUP BY subject, name
ORDER BY subject, top_score DESC
"""
spark.sql(top_score_query).show()

+-------+------+---------+
|subject|  name|top_score|
+-------+------+---------+
|English| Kavya|       79|
|English|  Ravi|       67|
|   Math|  Neha|       94|
|   Math|  Ravi|       88|
|Science|Ananya|       92|
|Science| Meena|       85|
+-------+------+---------+



In [74]:
# Number of exam rows that received each letter grade.
grade_count_query = "SELECT grade, COUNT(*) AS count FROM exam_scores GROUP BY grade"
spark.sql(grade_count_query).show()

+-----+-----+
|grade|count|
+-----+-----+
|    B|    2|
|    C|    1|
|    A|    2|
|    D|    1|
+-----+-----+



In [77]:
# Students who appear under more than one subject.
multi_subject_query = """
SELECT name, COUNT(subject) AS subjects
FROM exam_scores
GROUP BY name
HAVING subjects > 1
"""
spark.sql(multi_subject_query).show()

+----+--------+
|name|subjects|
+----+--------+
|Ravi|       2|
+----+--------+



In [80]:
# Subjects whose average score is above 85.
high_avg_query = """
SELECT subject, AVG(score) AS avg_score
FROM exam_scores
GROUP BY subject
HAVING avg_score > 85
"""
spark.sql(high_avg_query).show()

+-------+---------+
|subject|avg_score|
+-------+---------+
|Science|     88.5|
|   Math|     91.0|
+-------+---------+



In [84]:
# Attendance: one (name, days_present) tuple per student.
attendance = [
    ("Ravi", 18),
    ("Ananya", 21),
    ("Kavya", 20),
    ("Neha", 25),
    ("Meena", 19),
]
att_cols = ["name", "days_present"]
df_att = spark.createDataFrame(attendance, schema=att_cols)

In [85]:
# Display the attendance table.
df_att.show()

+------+------------+
|  name|days_present|
+------+------------+
|  Ravi|          18|
|Ananya|          21|
| Kavya|          20|
|  Neha|          25|
| Meena|          19|
+------+------------+



In [90]:
from pyspark.sql.functions import expr

# Attach attendance to every score row. The left join keeps score rows even
# when a student has no attendance record; days_present is then null and the
# CASE falls through to ELSE, leaving the grade unchanged.
df_joined = df_scores.join(df_att, "name", "left")

# Students present fewer than 20 days drop one letter grade (D stays D).
adjusted_grade_expr = expr("""
CASE
  WHEN days_present < 20 AND grade = 'A' THEN 'B'
  WHEN days_present < 20 AND grade = 'B' THEN 'C'
  WHEN days_present < 20 AND grade = 'C' THEN 'D'
  WHEN days_present < 20 AND grade = 'D' THEN 'D'
  ELSE grade
END
""")
df_final = df_joined.withColumn("adjusted_grade", adjusted_grade_expr)

report_columns = ["name", "subject", "score", "grade", "days_present", "adjusted_grade"]
df_final.select(report_columns).show()

+------+-------+-----+-----+------------+--------------+
|  name|subject|score|grade|days_present|adjusted_grade|
+------+-------+-----+-----+------------+--------------+
|Ananya|Science|   92|    A|          21|             A|
|  Ravi|   Math|   88|    B|          18|             C|
| Kavya|English|   79|    C|          20|             C|
|  Ravi|English|   67|    D|          18|             D|
|  Neha|   Math|   94|    A|          25|             A|
| Meena|Science|   85|    B|          19|             C|
+------+-------+-----+-----+------------+--------------+



In [91]:
# Write the scores as Parquet with one subject=<value> directory per subject,
# replacing any existing data under /tmp/scores/.
df_scores.write.mode("overwrite").partitionBy("subject").parquet("/tmp/scores/")

In [95]:
# Append one late-arriving Math result to the partitioned dataset.
# NOTE(review): append mode is not idempotent - every re-run of this cell
# adds the row again. The appended frame also carries only
# name/subject/score, not the derived columns df_scores was written with.
incremental = [("Meena", "Math", 93)]
inc_columns = ["name", "subject", "score"]
df_inc = spark.createDataFrame(incremental, schema=inc_columns)
df_inc.write.partitionBy("subject").parquet("/tmp/scores/", mode="append")

In [97]:
# Display the incremental row(s) that were appended.
df_inc.show()

+-----+-------+-----+
| name|subject|score|
+-----+-------+-----+
|Meena|   Math|   93|
+-----+-------+-----+



In [104]:
# Read back only the Math partition directory.
#
# This directory holds files written with two different schemas: the full
# df_scores frame (grade, difficulty, rank, formatted_name) and the
# incremental frame (name, score only). Without schema merging Spark takes
# the schema from a subset of the files, so the derived columns could be
# dropped silently; mergeSchema reads every footer and unions the columns.
#
# Because the partition directory itself is the read path, the `subject`
# partition column is not part of the result.
df_math = (
    spark.read
    .option("mergeSchema", "true")
    .parquet("/tmp/scores/subject=Math")
)
df_math.show()

+-----+-----+-----+----------+----+--------------+
| name|score|grade|difficulty|rank|formatted_name|
+-----+-----+-----+----------+----+--------------+
| Neha|   94|    A| Difficult|   1|          NEHA|
| Ravi|   88|    B| Difficult|   2|          RAVI|
|Meena|   93| NULL|      NULL|NULL|          NULL|
|Meena|   93| NULL|      NULL|NULL|          NULL|
|Meena|   93| NULL|      NULL|NULL|          NULL|
|Meena|   93| NULL|      NULL|NULL|          NULL|
+-----+-----+-----+----------+----+--------------+



In [106]:
import pandas as pd

# Raw employee sample for the ETL walkthrough, saved to emp_raw.csv
# (no index column). The second employee has no bonus value.
employee_data = dict(
    emp_id=[1, 2, 3],
    name=["Arjun", "Kavya", "Sneha"],
    dept=["IT", "HR", "Finance"],
    salary=[78000, 62000, 55000],
    bonus=[5000, None, 3000],  # None marks the missing bonus
)

df = pd.DataFrame.from_dict(employee_data)
df.to_csv("emp_raw.csv", index=False)

In [110]:
# Extract: load the raw CSV with its header row. No schema is supplied or
# inferred, so every column arrives as a string.
etl_df = spark.read.csv("emp_raw.csv", header=True)
etl_df.printSchema()

root
 |-- emp_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- bonus: string (nullable = true)



In [111]:
# Display the raw rows as loaded from emp_raw.csv.
etl_df.show()

+------+-----+-------+------+------+
|emp_id| name|   dept|salary| bonus|
+------+-----+-------+------+------+
|     1|Arjun|     IT| 78000|5000.0|
|     2|Kavya|     HR| 62000|  NULL|
|     3|Sneha|Finance| 55000|3000.0|
+------+-----+-------+------+------+



In [115]:
# Replace a missing bonus with the default of 2000.
etl_df = etl_df.na.fill({"bonus": 2000})

In [116]:
# Display the rows after the missing bonus has been filled in.
etl_df.show()

+------+-----+-------+------+------+
|emp_id| name|   dept|salary| bonus|
+------+-----+-------+------+------+
|     1|Arjun|     IT| 78000|5000.0|
|     2|Kavya|     HR| 62000|  2000|
|     3|Sneha|Finance| 55000|3000.0|
+------+-----+-------+------+------+



In [120]:
from pyspark.sql.types import IntegerType
# Convert the string columns read from CSV into integers.
etl_df = etl_df.withColumn("salary", etl_df["salary"].cast(IntegerType()))
# bonus holds text such as "5000.0" (the CSV stored the column as floats
# because of the missing value). A direct string -> INT cast of a decimal
# string fails when spark.sql.ansi.enabled is on, so go through double
# first; the integer result is the same.
etl_df = etl_df.withColumn(
    "bonus", etl_df["bonus"].cast("double").cast(IntegerType())
)

In [125]:
from pyspark.sql.functions import col
# Total compensation = salary + bonus.
total_ctc_expr = etl_df["salary"] + etl_df["bonus"]
etl_df = etl_df.withColumn("total_ctc", total_ctc_expr)

In [130]:
# Keep only employees whose total compensation is above 60000.
etl_df = etl_df.where(etl_df["total_ctc"] > 60000)

In [133]:
# Display the transformed rows that passed the total_ctc filter.
etl_df.show()

+------+-----+----+------+-----+---------+
|emp_id| name|dept|salary|bonus|total_ctc|
+------+-----+----+------+-----+---------+
|     1|Arjun|  IT| 78000| 5000|    83000|
|     2|Kavya|  HR| 62000| 2000|    64000|
+------+-----+----+------+-----+---------+



In [137]:
# Load: write the transformed frame as both Parquet and JSON, replacing any
# output from an earlier run.
etl_df.write.parquet("/tmp/etl_output/parquet", mode="overwrite")
etl_df.write.json("/tmp/etl_output/json", mode="overwrite")