In [1]:
from pyspark.sql import SparkSession, Row

# --- Spark entry point -------------------------------------------------------
# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName("EmployeeWorkData")
spark = session_builder.getOrCreate()

# --- Sample employee records -------------------------------------------------
# Field names are spelled out once; every tuple below lists one employee's
# values in exactly that order.
employee_fields = ("EmpID", "Name", "Department", "Project", "Salary", "HoursPerWeek")
employee_values = [
    (101, "Ravi", "Engineering", "AI Engine", 95000, 42),
    (102, "Sneha", "Engineering", "Data Platform", 87000, 45),
    (103, "Kabir", "Marketing", "Product Launch", 65000, 40),
    (104, "Anita", "Sales", "Client Outreach", 70000, 38),
    (105, "Divya", "Engineering", "AI Engine", 99000, 48),
    (106, "Amit", "Marketing", "Social Media", 62000, 35),
    (107, "Priya", "HR", "Policy Revamp", 58000, 37),
    (108, "Manav", "Sales", "Lead Gen", 73000, 41),
    (109, "Neha", "Engineering", "Security Suite", 91000, 46),
    (110, "Farah", "HR", "Onboarding", 60000, 36),
]

# Each Row is still built from keyword arguments, so the field names travel
# with the values exactly as if they had been written out by hand.
data = [Row(**dict(zip(employee_fields, values))) for values in employee_values]

# --- DataFrame ---------------------------------------------------------------
# No explicit schema is passed; the column names come from the Row fields.
df = spark.createDataFrame(data)

# Print the rows with full-width cell contents (truncation disabled).
df.show(truncate=False)


+-----+-----+-----------+---------------+------+------------+
|EmpID|Name |Department |Project        |Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|101  |Ravi |Engineering|AI Engine      |95000 |42          |
|102  |Sneha|Engineering|Data Platform  |87000 |45          |
|103  |Kabir|Marketing  |Product Launch |65000 |40          |
|104  |Anita|Sales      |Client Outreach|70000 |38          |
|105  |Divya|Engineering|AI Engine      |99000 |48          |
|106  |Amit |Marketing  |Social Media   |62000 |35          |
|107  |Priya|HR         |Policy Revamp  |58000 |37          |
|108  |Manav|Sales      |Lead Gen       |73000 |41          |
|109  |Neha |Engineering|Security Suite |91000 |46          |
|110  |Farah|HR         |Onboarding     |60000 |36          |
+-----+-----+-----------+---------------+------+------------+



In [2]:
# Expose the same DataFrame to SQL under two view names.
# The global temporary view is the one addressed through the `global_temp.`
# prefix in later queries.
df.createOrReplaceGlobalTempView("employees_global")

# The local temporary view is addressed by its bare name.
df.createOrReplaceTempView("employees_local")


In [3]:
# List every column for employees whose department is Engineering,
# reading from the local temporary view.
engineering_query = "SELECT * FROM employees_local WHERE Department = 'Engineering'"
spark.sql(engineering_query).show()


+-----+-----+-----------+--------------+------+------------+
|EmpID| Name| Department|       Project|Salary|HoursPerWeek|
+-----+-----+-----------+--------------+------+------------+
|  101| Ravi|Engineering|     AI Engine| 95000|          42|
|  102|Sneha|Engineering| Data Platform| 87000|          45|
|  105|Divya|Engineering|     AI Engine| 99000|          48|
|  109| Neha|Engineering|Security Suite| 91000|          46|
+-----+-----+-----------+--------------+------+------------+



In [4]:
# Name and project of everyone logging more than 40 hours per week,
# read from the global temporary view (hence the `global_temp.` qualifier).
long_hours_query = "SELECT Name, Project FROM global_temp.employees_global WHERE HoursPerWeek > 40"
spark.sql(long_hours_query).show()


+-----+--------------+
| Name|       Project|
+-----+--------------+
| Ravi|     AI Engine|
|Sneha| Data Platform|
|Divya|     AI Engine|
|Manav|      Lead Gen|
| Neha|Security Suite|
+-----+--------------+



In [8]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.utils import AnalysisException

# Step 1: Start (or reuse) the Spark session
spark = SparkSession.builder.appName("EmployeeWorkData").getOrCreate()

# Step 2: Prepare the employee data
data = [
    Row(EmpID=101, Name="Ravi", Department="Engineering", Project="AI Engine", Salary=95000, HoursPerWeek=42),
    Row(EmpID=102, Name="Sneha", Department="Engineering", Project="Data Platform", Salary=87000, HoursPerWeek=45),
    Row(EmpID=103, Name="Kabir", Department="Marketing", Project="Product Launch", Salary=65000, HoursPerWeek=40),
    Row(EmpID=104, Name="Anita", Department="Sales", Project="Client Outreach", Salary=70000, HoursPerWeek=38),
    Row(EmpID=105, Name="Divya", Department="Engineering", Project="AI Engine", Salary=99000, HoursPerWeek=48),
    Row(EmpID=106, Name="Amit", Department="Marketing", Project="Social Media", Salary=62000, HoursPerWeek=35),
    Row(EmpID=107, Name="Priya", Department="HR", Project="Policy Revamp", Salary=58000, HoursPerWeek=37),
    Row(EmpID=108, Name="Manav", Department="Sales", Project="Lead Gen", Salary=73000, HoursPerWeek=41),
    Row(EmpID=109, Name="Neha", Department="Engineering", Project="Security Suite", Salary=91000, HoursPerWeek=46),
    Row(EmpID=110, Name="Farah", Department="HR", Project="Onboarding", Salary=60000, HoursPerWeek=36),
]

# Step 3: Create DataFrame
df = spark.createDataFrame(data)

# Step 4: Create local view
df.createOrReplaceTempView("employees_local")

# Step 5: Run SQL queries
# Each entry pairs the heading printed above a result with the SQL that
# produces it; the loop below prints the heading and then shows the result.
labelled_queries = [
    (" Employees working on 'AI Engine':",
     "SELECT * FROM employees_local WHERE Project = 'AI Engine'"),
    (" Marketing employees with salary > 60000:",
     "SELECT * FROM employees_local WHERE Department = 'Marketing' AND Salary > 60000"),
    (" Average salary per department:",
     "SELECT Department, ROUND(AVG(Salary), 2) AS Avg_Salary FROM employees_local GROUP BY Department"),
    (" Top 3 highest paid employees:",
     "SELECT Name, Salary FROM employees_local ORDER BY Salary DESC LIMIT 3"),
    (" Employees working more than 40 hours/week:",
     "SELECT * FROM employees_local WHERE HoursPerWeek > 40"),
    (" Employees per project:",
     "SELECT Project, COUNT(*) AS EmployeeCount FROM employees_local GROUP BY Project"),
]
for heading, query in labelled_queries:
    print(heading)
    spark.sql(query).show()

# Step 6: Drop the local temp view
spark.catalog.dropTempView("employees_local")

# Step 7: Try accessing the dropped view.
# Only AnalysisException (the "table or view not found" family) is the expected
# outcome here; any other failure is a real problem and is allowed to surface
# instead of being reported as "View was dropped".
try:
    spark.sql("SELECT * FROM employees_local").show()
except AnalysisException as e:
    print(" View was dropped. Error:\n", e)


 Employees working on 'AI Engine':
+-----+-----+-----------+---------+------+------------+
|EmpID| Name| Department|  Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------+------+------------+
|  101| Ravi|Engineering|AI Engine| 95000|          42|
|  105|Divya|Engineering|AI Engine| 99000|          48|
+-----+-----+-----------+---------+------+------------+

 Marketing employees with salary > 60000:
+-----+-----+----------+--------------+------+------------+
|EmpID| Name|Department|       Project|Salary|HoursPerWeek|
+-----+-----+----------+--------------+------+------------+
|  103|Kabir| Marketing|Product Launch| 65000|          40|
|  106| Amit| Marketing|  Social Media| 62000|          35|
+-----+-----+----------+--------------+------+------------+

 Average salary per department:
+-----------+----------+
| Department|Avg_Salary|
+-----------+----------+
|      Sales|   71500.0|
|Engineering|   93000.0|
|  Marketing|   63500.0|
|         HR|   59000.0|
+-----------+----------+

In [9]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import avg, col, when

# Step 1: Initial Spark session
spark = SparkSession.builder.appName("GlobalViewExample").getOrCreate()

# Step 2: Create Data
data = [
    Row(EmpID=101, Name="Ravi", Department="Engineering", Project="AI Engine", Salary=95000, HoursPerWeek=42),
    Row(EmpID=102, Name="Sneha", Department="Engineering", Project="Data Platform", Salary=87000, HoursPerWeek=45),
    Row(EmpID=103, Name="Kabir", Department="Marketing", Project="Product Launch", Salary=65000, HoursPerWeek=40),
    Row(EmpID=104, Name="Anita", Department="Sales", Project="Client Outreach", Salary=70000, HoursPerWeek=38),
    Row(EmpID=105, Name="Divya", Department="Engineering", Project="AI Engine", Salary=99000, HoursPerWeek=48),
    Row(EmpID=106, Name="Amit", Department="Marketing", Project="Social Media", Salary=62000, HoursPerWeek=35),
    Row(EmpID=107, Name="Priya", Department="HR", Project="Policy Revamp", Salary=58000, HoursPerWeek=37),
    Row(EmpID=108, Name="Manav", Department="Sales", Project="Lead Gen", Salary=73000, HoursPerWeek=41),
    Row(EmpID=109, Name="Neha", Department="Engineering", Project="Security Suite", Salary=91000, HoursPerWeek=46),
    Row(EmpID=110, Name="Farah", Department="HR", Project="Onboarding", Salary=60000, HoursPerWeek=36),
]

# Step 3: Create DataFrame and Global View
# The view is registered globally, so every query below addresses it through
# the `global_temp.` qualifier.
df = spark.createDataFrame(data)
df.createOrReplaceGlobalTempView("employees_global")

# 1. HR employees working < 38 hours/week
print("1. HR employees working < 38 hrs/week:")
spark.sql("""
    SELECT * FROM global_temp.employees_global
    WHERE Department = 'HR' AND HoursPerWeek < 38
""").show()

# 2. Total salary payout for each department
print("2. Total salary per department:")
spark.sql("""
    SELECT Department, SUM(Salary) AS Total_Salary
    FROM global_temp.employees_global
    GROUP BY Department
""").show()

# 3. Add derived column Status: "Overworked" when strictly more than 45 hours,
#    "Normal" otherwise (so exactly 45 hours counts as Normal).
print("3. Add Status column (Overworked if >45 hrs):")
df_status = spark.sql("SELECT * FROM global_temp.employees_global") \
    .withColumn("Status", when(col("HoursPerWeek") > 45, "Overworked").otherwise("Normal"))
df_status.select("Name", "HoursPerWeek", "Status").show()

# 4. Count of employees per project
print("4. Employee count per project:")
spark.sql("""
    SELECT Project, COUNT(*) AS Total_Employees
    FROM global_temp.employees_global
    GROUP BY Project
""").show()

# 5. Employees with salary above average in their department:
#    compute one average per department, join it back onto each employee,
#    and keep the rows whose salary exceeds that average.
print("5. Employees with salary above department average:")
dept_avg = df.groupBy("Department").agg(avg("Salary").alias("AvgSalary"))
above_avg = df.join(dept_avg, on="Department").filter(col("Salary") > col("AvgSalary"))
above_avg.select("Name", "Department", "Salary", "AvgSalary").show()

# 6. Open a NEW Spark session and query the global temp view.
#    SparkSession.builder...getOrCreate() would simply hand back the session
#    that is already running, so it cannot demonstrate cross-session access.
#    spark.newSession() returns a separate session on the same SparkContext,
#    with its own temporary views but a shared global_temp database.
print("6. Querying global view from NEW Spark session:")
new_spark = spark.newSession()
new_spark.sql("SELECT Name, Department FROM global_temp.employees_global").show()


1. HR employees working < 38 hrs/week:
+-----+-----+----------+-------------+------+------------+
|EmpID| Name|Department|      Project|Salary|HoursPerWeek|
+-----+-----+----------+-------------+------+------------+
|  107|Priya|        HR|Policy Revamp| 58000|          37|
|  110|Farah|        HR|   Onboarding| 60000|          36|
+-----+-----+----------+-------------+------+------------+

2. Total salary per department:
+-----------+------------+
| Department|Total_Salary|
+-----------+------------+
|      Sales|      143000|
|Engineering|      372000|
|  Marketing|      127000|
|         HR|      118000|
+-----------+------------+

3. Add Status column (Overworked if >45 hrs):
+-----+------------+----------+
| Name|HoursPerWeek|    Status|
+-----+------------+----------+
| Ravi|          42|    Normal|
|Sneha|          45|    Normal|
|Kabir|          40|    Normal|
|Anita|          38|    Normal|
|Divya|          48|Overworked|
| Amit|          35|    Normal|
|Priya|          37|    Normal|

In [10]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, rank, row_number
from pyspark.sql.window import Window

# Start (or reuse) the Spark session
spark = SparkSession.builder.appName("BonusChallenges").getOrCreate()

# Sample Data
data = [
    Row(EmpID=101, Name="Ravi", Department="Engineering", Project="AI Engine", Salary=95000, HoursPerWeek=42),
    Row(EmpID=102, Name="Sneha", Department="Engineering", Project="Data Platform", Salary=87000, HoursPerWeek=45),
    Row(EmpID=103, Name="Kabir", Department="Marketing", Project="Product Launch", Salary=65000, HoursPerWeek=40),
    Row(EmpID=104, Name="Anita", Department="Sales", Project="Client Outreach", Salary=70000, HoursPerWeek=38),
    Row(EmpID=105, Name="Divya", Department="Engineering", Project="AI Engine", Salary=99000, HoursPerWeek=48),
    Row(EmpID=106, Name="Amit", Department="Marketing", Project="Social Media", Salary=62000, HoursPerWeek=35),
    Row(EmpID=107, Name="Priya", Department="HR", Project="Policy Revamp", Salary=58000, HoursPerWeek=37),
    Row(EmpID=108, Name="Manav", Department="Sales", Project="Lead Gen", Salary=73000, HoursPerWeek=41),
    Row(EmpID=109, Name="Neha", Department="Engineering", Project="Security Suite", Salary=91000, HoursPerWeek=46),
    Row(EmpID=110, Name="Farah", Department="HR", Project="Onboarding", Salary=60000, HoursPerWeek=36),
]

# Create DataFrame and register it as the temp view "employees"
df = spark.createDataFrame(data)
df.createOrReplaceTempView("employees")

# 1. Window function: rank by salary within department.
#    Rows are partitioned per department and ordered by salary, highest first,
#    so rank 1 is the top earner of each department.
window_spec = Window.partitionBy("Department").orderBy(col("Salary").desc())
df_ranked = df.withColumn("SalaryRank", rank().over(window_spec))
print("1. Rank within each department by salary:")
df_ranked.select("Name", "Department", "Salary", "SalaryRank").show()

# 2. Create new global view containing only Engineering employees
df_engineering = df.filter(col("Department") == "Engineering")
df_engineering.createOrReplaceGlobalTempView("engineering_employees")
print("2. Global view 'engineering_employees' created")

# 3. Create view of active employees (working >= 38 hours/week)
df_active = df.filter(col("HoursPerWeek") >= 38)
df_active.createOrReplaceTempView("active_employees")
print("3. Temp view 'active_employees' created")

# Optional: Show the active employees
spark.sql("SELECT * FROM active_employees").show()


1. Rank within each department by salary:
+-----+-----------+------+----------+
| Name| Department|Salary|SalaryRank|
+-----+-----------+------+----------+
|Divya|Engineering| 99000|         1|
| Ravi|Engineering| 95000|         2|
| Neha|Engineering| 91000|         3|
|Sneha|Engineering| 87000|         4|
|Farah|         HR| 60000|         1|
|Priya|         HR| 58000|         2|
|Kabir|  Marketing| 65000|         1|
| Amit|  Marketing| 62000|         2|
|Manav|      Sales| 73000|         1|
|Anita|      Sales| 70000|         2|
+-----+-----------+------+----------+

2. Global view 'engineering_employees' created
3. Temp view 'active_employees' created
+-----+-----+-----------+---------------+------+------------+
|EmpID| Name| Department|        Project|Salary|HoursPerWeek|
+-----+-----+-----------+---------------+------+------------+
|  101| Ravi|Engineering|      AI Engine| 95000|          42|
|  102|Sneha|Engineering|  Data Platform| 87000|          45|
|  103|Kabir|  Marketing| Product Launch| 65000|          40|