# Apache Spark Exercises


In [ ]:
# Exercise 0: Setup SparkSession
from pyspark.sql import SparkSession

# getOrCreate() hands back the active session if one exists, otherwise it
# builds a new one with the application name given here.
spark = (
    SparkSession.builder
    .appName("Spark Exercises")
    .getOrCreate()
)
print("SparkSession created successfully")

## Exercise 1 — Create a SparkSession & DataFrame
**Goal:** Learn how to use the SparkSession started in Exercise 0 to create a DataFrame from in-memory rows.

In [ ]:
# Sample rows, one tuple per record in (id, name, age) order.
data = [(1, "Alice", 25), (2, "Bob", 30)]

# Column names are supplied as the schema; the column types are left to Spark.
df = spark.createDataFrame(data, schema=["id", "name", "age"])
df.show()

## Exercise 2 — Select & Rename Columns
**Goal:** Select specific columns and rename them using `alias`.

In [ ]:
# Load the people file, treating the first line as the header and letting
# Spark infer each column's type from the data.
people_df = spark.read.options(header=True, inferSchema=True).csv("data/people.csv")

# Plain projection of two columns.
people_df.select("name", "age").show()

# Same projection, with `name` exposed under the alias `person_name`.
people_df.select(people_df["name"].alias("person_name"), "age").show()

## Exercise 3 — Filter Rows
**Goal:** Filter rows using conditions.

In [ ]:
# where() is an alias of filter(); both take a boolean Column expression.
people_df.where(people_df["age"] > 30).show()     # rows with age above 30
people_df.where(people_df["city"] == "London").show()  # rows whose city is London

## Exercise 4 — Sort and Limit
**Goal:** Sort DataFrame rows and limit results.

In [ ]:
# sort() is an alias of orderBy().
# All rows, highest age first.
people_df.sort(people_df["age"].desc()).show()

# The two rows with the lowest age (default ordering is ascending).
people_df.sort(people_df["age"]).limit(2).show()

## Exercise 5 — Group By
**Goal:** Group data and count per group.

In [ ]:
# Number of rows per city; groupby() is an alias of groupBy().
people_df.groupby("city").count().show()

## Exercise 6 — Aggregations
**Goal:** Perform aggregations like `avg` and `max`.

In [ ]:
# Import the functions module under the conventional alias ``F`` rather than
# ``from pyspark.sql.functions import avg, max``: that form rebinds Python's
# built-in ``max`` to Spark's column aggregate for every later cell.
from pyspark.sql import functions as F

# Average and maximum age for each city.
people_df.groupBy("city").agg(
    F.avg("age").alias("avg_age"),
    F.max("age").alias("max_age"),
).show()

## Exercise 7 — Read & Write CSV
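**Goal:** Write the DataFrame that was read from CSV in Exercise 2 back out as CSV. Note that Spark writes a directory of part files rather than a single file.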

In [ ]:
# Write people_df as CSV with a header row, replacing any existing output
# at this path.
people_df.write.csv("output/people_output", mode="overwrite", header=True)
print("Data written to output/people_output/")

## Exercise 8 — Spark SQL
**Goal:** Use SQL queries on DataFrames.

In [ ]:
# Register the DataFrame under the name `people` so SQL text can refer to it.
people_df.createOrReplaceTempView("people")

# Average age per city, expressed in SQL instead of the DataFrame API.
avg_age_by_city = spark.sql("""
SELECT city, AVG(age) AS avg_age
FROM people
GROUP BY city
""")
avg_age_by_city.show()

## Exercise 9 — Join DataFrames
**Goal:** Join two DataFrames (people and sales).

In [ ]:
# Load the sales file with a header row and inferred column types.
sales_df = spark.read.options(header=True, inferSchema=True).csv("data/sales.csv")

# Inner join: keep only people that have at least one matching sales row
# (people.id == sales.person_id).
joined_df = people_df.join(
    sales_df,
    on=people_df["id"] == sales_df["person_id"],
    how="inner",
)
joined_df.select("name", "amount").show()

## Exercise 10 — User Defined Function (UDF)
**Goal:** Apply custom Python logic to a DataFrame column.

In [ ]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def age_group(age):
    """Return the age bracket label for *age*.

    Args:
        age: A numeric age, or ``None`` when the source value is null.

    Returns:
        ``"Young"`` for ages below 30, ``"Adult"`` for ages from 30 up to
        (but not including) 40, ``"Senior"`` otherwise, or ``None`` when
        *age* is ``None``.
    """
    # Spark passes SQL NULL to a Python UDF as None, and `None < 30` raises
    # TypeError in Python 3, so null ages are mapped to a null result.
    if age is None:
        return None
    if age < 30:
        return "Young"
    if age < 40:
        return "Adult"
    return "Senior"

# Wrap the Python function as a Spark UDF that produces a string column.
age_group_udf = udf(age_group, returnType=StringType())

# Add an `age_group` column computed row by row from `age`.
people_df.withColumn("age_group", age_group_udf(people_df["age"])).show()

## Cleanup
Stop the Spark session when you are done.

In [ ]:
# Shut down the session created in the setup cell. Cells that use `spark`
# need the setup cell to be run again after this.
spark.stop()
print("Spark session stopped")