In [1]:
# 🚀 Your Code Here - Exercise 1: Import Libraries
# Import all the required libraries below:

# Your imports here...
import os
import platform
import sys

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# NOTE: the star import below shadows Python builtins such as sum, min, max and round;
# prefer the explicit `F.` prefix in new code.
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

# Check versions here...
# Report the PySpark, Python and operating-system details of this environment.
print("pyspark version:", pyspark.__version__)
print("python version:", sys.version)
# os.uname() is only available on Unix-like systems; fall back to
# platform.uname() elsewhere so this cell does not raise AttributeError.
print("os version:", os.uname() if hasattr(os, "uname") else platform.uname())
print("✅ Libraries imported successfully!")

pyspark version: 4.0.0
python version: 3.12.1 (main, Jul 10 2025, 11:57:50) [GCC 13.3.0]
os version: posix.uname_result(sysname='Linux', nodename='codespaces-780a6a', release='6.8.0-1030-azure', version='#35~22.04.1-Ubuntu SMP Mon May 26 18:08:30 UTC 2025', machine='x86_64')
✅ Libraries imported successfully!


In [7]:
# 🚀 Your Code Here - Exercise 2: Create SparkSession
# Build the session with the configurations requested by the exercise.
# The chain is wrapped in parentheses so no backslash continuations are needed.
spark = (
    SparkSession.builder
    .appName("My PySpark Learning Journey")
    # Adaptive query execution and partition coalescing
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Size threshold for automatic broadcast joins
    .config("spark.sql.autoBroadcastJoinThreshold", "10MB")
    # Memory settings
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "1g")
    .getOrCreate()
)

# Keep the console quiet: only warnings and errors are logged.
spark.sparkContext.setLogLevel("WARN")

# Confirm what was created.
print("Spark version:", spark.version)
print("Application name:", spark.conf.get("spark.app.name"))


Spark version: 4.0.0
Application name: My PySpark Learning Journey


## 📝 Exercise 3: Create Your First DataFrame

**Your Mission:** Create a DataFrame with employee data using a proper schema.

**The Data to Use:**
```python
employees_data = [
    (1, "Alice Johnson", "Data Engineering", 85000, "2021-01-15"),
    (2, "Bob Smith", "Data Science", 92000, "2020-03-20"),
    (3, "Carol Davis", "Analytics", 78000, "2021-06-10"),
    (4, "David Wilson", "Engineering", 95000, "2019-11-05"),
    (5, "Eva Brown", "Data Science", 88000, "2020-09-12")
]
```

**Your Tasks:**
1. Define a schema with these fields:
   - employee_id (IntegerType)
   - name (StringType) 
   - department (StringType)
   - salary (IntegerType)
   - hire_date (StringType)

2. Create the DataFrame using the schema
3. Show the DataFrame
4. Print the schema

**Remember:** Use `StructType([StructField(...), ...])` for schema definition!

**Start coding! 👇**

In [None]:
# 🚀 Your Code Here - Exercise 3: Create DataFrame with Schema

# Source rows, one tuple per employee, in the column order
# (employee_id, name, department, salary, hire_date).
employees_data = [
    (1, "Alice Johnson", "Data Engineering", 85000, "2021-01-15"),
    (2, "Bob Smith", "Data Science", 92000, "2020-03-20"),
    (3, "Carol Davis", "Analytics", 78000, "2021-06-10"),
    (4, "David Wilson", "Engineering", 95000, "2019-11-05"),
    (5, "Eva Brown", "Data Science", 88000, "2020-09-12"),
]

# --- Scaffold: uncomment each step below once you are ready to run it ---

# Step 1 - schema (field order must match the tuples above):
# employees_schema = StructType([
#     StructField("employee_id", IntegerType(), True),
#     StructField("name", StringType(), True),
#     StructField("department", StringType(), True),
#     StructField("salary", IntegerType(), True),
#     StructField("hire_date", StringType(), True)
# ])

# Step 2 - build the DataFrame from the rows and the schema:
# employees_df = spark.createDataFrame(employees_data, employees_schema)

# Step 3 - display the rows, then the schema:
# employees_df.show()
# employees_df.printSchema()

## 📝 Exercise 4: Master All Join Types 

**The Big Challenge:** Now let's practice all join types! First, create a second DataFrame for performance data.

**Create Performance DataFrame:**
```python
performance_data = [
    (1, 95, "Excellent"),
    (2, 87, "Good"),
    (3, 92, "Excellent"), 
    (6, 88, "Good")  # Note: employee_id 6 doesn't exist in employees!
]
```

**Your Mission:**
1. Create a performance DataFrame with columns: employee_id, score, rating
2. Then perform these joins with the employees DataFrame:
   - **Inner Join** (only matching records)
   - **Left Join** (all employees + matching performance)
   - **Right Join** (all performance + matching employees) 
   - **Full Outer Join** (everything from both)
   - **Semi Join** (employees who have performance records; only the employee columns are returned)
   - **Anti Join** (employees who DON'T have performance records)

**For each join:**
- Print what the join does
- Show the result
- Print the count

**Time to master joins! 🔗**

In [None]:
# 🚀 Your Code Here - Exercise 4: Practice All Join Types
# (The header line above must be a comment; without the leading '#' the cell
# fails with a SyntaxError before any code runs.)

# 1. Create performance DataFrame first
# Rows are (employee_id, score, rating).
performance_data = [
    (1, 95, "Excellent"),
    (2, 87, "Good"),
    (3, 92, "Excellent"),
    (6, 88, "Good")  # Employee 6 doesn't exist in employees!
]

# Create performance DataFrame here:
# performance_df = spark.createDataFrame(performance_data, ["employee_id", "score", "rating"])

# 2. Now practice each join type:
# Each scaffold below needs both employees_df (Exercise 3) and performance_df
# to exist; uncomment a join only after those DataFrames have been created.

# INNER JOIN:
print("🔗 INNER JOIN - Only matching records:")
# inner_join = employees_df.join(performance_df, "employee_id", "inner")
# inner_join.show()
# print(f"Result count: {inner_join.count()}")

# LEFT JOIN:
print("\n🔗 LEFT JOIN - All employees + matching performance:")
# left_join = employees_df.join(performance_df, "employee_id", "left")
# left_join.show()
# print(f"Result count: {left_join.count()}")

# RIGHT JOIN:
print("\n🔗 RIGHT JOIN - All performance + matching employees:")
# right_join = employees_df.join(performance_df, "employee_id", "right")
# right_join.show()
# print(f"Result count: {right_join.count()}")

# FULL OUTER JOIN:
print("\n🔗 FULL OUTER JOIN - Everything from both tables:")
# full_join = employees_df.join(performance_df, "employee_id", "outer")
# full_join.show()
# print(f"Result count: {full_join.count()}")

# SEMI JOIN:
print("\n🔗 SEMI JOIN - Employees who have performance (no perf columns):")
# semi_join = employees_df.join(performance_df, "employee_id", "semi")
# semi_join.show()
# print(f"Result count: {semi_join.count()}")

# ANTI JOIN:
print("\n🔗 ANTI JOIN - Employees who DON'T have performance:")
# anti_join = employees_df.join(performance_df, "employee_id", "anti")
# anti_join.show()
# print(f"Result count: {anti_join.count()}")

## 📝 Exercise 5: Optimization Techniques Challenge

**Advanced Challenge:** Let's optimize your joins and learn performance techniques!

**Create a Department Lookup Table:**
```python
dept_data = [
    ("Data Engineering", "DE", "Technology"),
    ("Data Science", "DS", "Technology"),
    ("Analytics", "AN", "Business"),
    ("Engineering", "ENG", "Technology")
]
```

**Your Optimization Tasks:**

1. **Create Department DataFrame** with columns: dept_name, dept_code, category

2. **Practice Broadcast Join:**
   - Join employees with departments using broadcast hint
   - Use `broadcast()` function for the small dept table
   - Show the execution plan with `.explain()`

3. **Sorting Challenge:**
   - Sort employees by department, then salary (highest first)
   - Sort with null handling (add a test record with null department)

4. **Partitioning Practice:**
   - Check current number of partitions
   - Repartition by department
   - Coalesce to fewer partitions

5. **Configuration Check:**
   - Print current values for these configs:
     - `spark.sql.autoBroadcastJoinThreshold`
     - `spark.sql.shuffle.partitions`
     - `spark.sql.adaptive.enabled`

**Ready for the optimization challenge? 🚀**