In [26]:
from pyspark.sql import SparkSession

In [27]:
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark  import broadcast
from pyspark.sql.window import Window

In [28]:
spark =  SparkSession.builder.appName("example_28Mar2025").getOrCreate()
print(spark)

<pyspark.sql.session.SparkSession object at 0x7bfab1781c60>


## Problem Statement 1
You are given a dataset containing customer travel details, including multiple stopovers for flight journeys. The objective is to determine the origin (starting location) and destination (final location) for each customer based on their travel history.

| customer_id | flight_id | origin   | destination |
|------------|----------|---------|-------------|
| 1          | 101      | Delhi   | Hyderabad   |
| 1          | 102      | Hyderabad | Kochi      |
| 1          | 103      | Kochi   | Mangalore   |
| 2          | 201      | Mumbai  | Ayodhya     |
| 2          | 202      | Ayodhya | Gorakhpur   |


In [29]:
df = spark.read.format("csv").option("header","True")\
    .option("inferSchema","True")\
    .load("file:///home/hdoop/notebooks/data/spark_practice/example_28Mar2025/flights.csv")

In [30]:
window_spec = Window.partitionBy("customer_id").orderBy("flight_id")

In [31]:
df.show()

+-----------+---------+---------+-----------+
|customer_id|flight_id|   origin|destination|
+-----------+---------+---------+-----------+
|          1|      101|    Delhi|  Hyderabad|
|          1|      102|Hyderabad|      Kochi|
|          1|      103|    Kochi|  Mangalore|
|          2|      201|   Mumbai|    Ayodhya|
|          2|      202|  Ayodhya|  Gorakhpur|
+-----------+---------+---------+-----------+



In [38]:
df1 =df.withColumn("origin",first("origin").over(window_spec))

In [45]:
df2 =  df1.withColumn("destination",first("destination").over(Window.partitionBy("customer_id").orderBy(df["flight_id"].desc()))).drop("flight_id").dropDuplicates()

In [46]:
df2.show()

+-----------+------+-----------+
|customer_id|origin|destination|
+-----------+------+-----------+
|          1| Delhi|  Mangalore|
|          2|Mumbai|  Gorakhpur|
+-----------+------+-----------+



# Employee Dataset Merging Task

## 📌 Task Description
You are given **two employee datasets** containing details about employees, including their **names, gender, and salaries**. Your task is to **combine these datasets** using:

1. **`union()`** - Merging datasets **without removing duplicates**.
2. **Handling duplicate records** using **`distinct()`**.
3. **`unionByName()`** - Merging datasets when column names match but the order differs.

---

## 📊 Employee Data

### **Dataset 1 (`df1`)**
| Employee_Name | Employee_Gender | Employee_Salary |
|--------------|----------------|----------------|
| Alice        | F              | 50000          |
| Bob          | M              | 60000          |
| Charles      | M              | 70000          |
| David        | M              | 80000          |
| Eve          | F              | 90000          |
| Eve          | F              | 90000          | *(Duplicate for testing `distinct()`)* |

### **Dataset 2 (`df2`)**
| Employee_Name | Employee_Gender | Employee_Salary |
|--------------|----------------|----------------|
| Frank        | M              | 55000          |
| Grace        | F              | 65000          |
| Hank         | M              | 75000          |
| Ivy          | F              | 85000          |
| Jack         | M              | 95000          |
| Eve          | F              | 90000          | *(Duplicate for testing `distinct()`)* |

---

## 🔹 **Your Tasks**
1. **Use `union()`** to merge `df1` and `df2`, keeping duplicates.
2. **Use `distinct()`** to remove duplicate records.
3. **Use `unionByName()`** to merge the datasets while handling **different column orders**.



In [47]:
df1 = spark.read.format("csv").option("header","True").option("inferSchema","True").load("file:///home/hdoop/notebooks/data/spark_practice/example_28Mar2025/employee_merging_task/emp1.csv")

In [48]:
df2 = spark.read.format("csv").option("header","True").option("inferSchema","True").load("file:///home/hdoop/notebooks/data/spark_practice/example_28Mar2025/employee_merging_task/emp2.csv")

In [51]:
df1.show()
df1.printSchema()

+-------------+---------------+---------------+
|Employee_Name|Employee_Gender|Employee_Salary|
+-------------+---------------+---------------+
|        Alice|              F|          50000|
|          Bob|              M|          60000|
|      Charles|              M|          70000|
|        David|              M|          80000|
|          Eve|              F|          90000|
|          Eve|              F|          90000|
+-------------+---------------+---------------+

root
 |-- Employee_Name: string (nullable = true)
 |-- Employee_Gender: string (nullable = true)
 |-- Employee_Salary: integer (nullable = true)



In [52]:
df2.show()
df2.printSchema()

+-------------+---------------+---------------+
|Employee_Name|Employee_Gender|Employee_Salary|
+-------------+---------------+---------------+
|        Frank|              M|          55000|
|        Grace|              F|          65000|
|         Hank|              M|          75000|
|          Ivy|              F|          85000|
|         Jack|              M|          95000|
|          Eve|              F|          90000|
+-------------+---------------+---------------+

root
 |-- Employee_Name: string (nullable = true)
 |-- Employee_Gender: string (nullable = true)
 |-- Employee_Salary: integer (nullable = true)



In [54]:
df1.union(df2).show()

+-------------+---------------+---------------+
|Employee_Name|Employee_Gender|Employee_Salary|
+-------------+---------------+---------------+
|        Alice|              F|          50000|
|          Bob|              M|          60000|
|      Charles|              M|          70000|
|        David|              M|          80000|
|          Eve|              F|          90000|
|          Eve|              F|          90000|
|        Frank|              M|          55000|
|        Grace|              F|          65000|
|         Hank|              M|          75000|
|          Ivy|              F|          85000|
|         Jack|              M|          95000|
|          Eve|              F|          90000|
+-------------+---------------+---------------+



In [56]:
df1.unionByName(df2,allowMissingColumns=True).show()

+-------------+---------------+---------------+
|Employee_Name|Employee_Gender|Employee_Salary|
+-------------+---------------+---------------+
|        Alice|              F|          50000|
|          Bob|              M|          60000|
|      Charles|              M|          70000|
|        David|              M|          80000|
|          Eve|              F|          90000|
|          Eve|              F|          90000|
|        Frank|              M|          55000|
|        Grace|              F|          65000|
|         Hank|              M|          75000|
|          Ivy|              F|          85000|
|         Jack|              M|          95000|
|          Eve|              F|          90000|
+-------------+---------------+---------------+



In [58]:
df1.unionAll(df2).dropDuplicates().show()

+-------------+---------------+---------------+
|Employee_Name|Employee_Gender|Employee_Salary|
+-------------+---------------+---------------+
|        David|              M|          80000|
|      Charles|              M|          70000|
|        Alice|              F|          50000|
|          Eve|              F|          90000|
|          Bob|              M|          60000|
|        Grace|              F|          65000|
|         Hank|              M|          75000|
|         Jack|              M|          95000|
|        Frank|              M|          55000|
|          Ivy|              F|          85000|
+-------------+---------------+---------------+



# 📌 Problem Statement 2

## 📝 Task
You are given an **Employee table** containing employee details such as:
- **Employee_ID**
- **Employee_Name**
- **Department**
- **Salary**

Your task is to **write a PySpark program** to compare the behavior of the following **three ranking functions**:  

1. **`rank()`** - Assigns a unique rank, leaving gaps if there are duplicates.  
2. **`dense_rank()`** - Similar to `rank()`, but does not leave gaps when encountering duplicate values.  
3. **`row_number()`** - Assigns a unique sequential number to each row, even for duplicate values.

---

## 📊 **Employee Dataset (Example)**

| Employee_ID | Employee_Name | Department | Salary |
|------------|--------------|------------|--------|
| 1          | Alice        | IT         | 70000  |
| 2          | Bob          | IT         | 75000  |
| 3          | Charlie      | HR         | 60000  |
| 4          | David        | IT         | 75000  |
| 5          | Eve          | HR         | 60000  |
| 6          | Frank        | Finance    | 90000  |
| 7          | Grace        | Finance    | 85000  |

---

## 🔹 **Your Tasks**
1. **Use `rank()` to assign ranks based on salary** (ordered in descending order).  
2. **Use `dense_rank()` and compare how ranks are assigned differently from `rank()`**.  
3. **Use `row_number()` to understand how it uniquely assigns numbers to each row**.  
4. **Compare the results of all three ranking functions in PySpark**.

---

## ✅ Expected Output Example

| Employee_Name | Department | Salary | Rank | Dense_Rank | Row_Number |
|--------------|------------|--------|------|------------|------------|
| Frank        | Finance    | 90000  | 1    | 1          | 1          |
| Grace        | Finance    | 85000  | 2    | 2          | 2          |
| Bob          | IT         | 75000  | 3    | 3          | 3          |
| David        | IT         | 3      | 75000 | 3          | 4          |
| Alice        | IT         | 70000  | 5    | 4          | 5          |
| Charlie      | HR         | 60000  | 6    | 5          | 6          |
| Eve          | HR         | 60000  | 6    | 5          | 7          |


In [59]:
df = spark.read.format("csv").option("header","True").option("inferSchema","True").load("file:///home/hdoop/notebooks/data/spark_practice/example_28Mar2025/emp_ps2.csv")

#### Rank(_)

In [60]:
df.printSchema()
df.show()

root
 |-- Employee_ID: integer (nullable = true)
 |-- Employee_Name: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: integer (nullable = true)

+-----------+-------------+----------+------+
|Employee_ID|Employee_Name|Department|Salary|
+-----------+-------------+----------+------+
|          1|        Alice|     Sales| 75000|
|          2|          Bob|     Sales| 75000|
|          3|      Charlie|     Sales| 75000|
|          4|        David|        IT| 70000|
|          5|          Eva|        IT| 68000|
|          6|        Frank|        IT| 68000|
|          7|        Grace|        HR| 60000|
|          8|        Henry|        HR| 60000|
|          9|          Ian|        HR| 58000|
|         10|         Jack|        HR| 56000|
+-----------+-------------+----------+------+



In [62]:
df.withColumn("rank",rank().over(Window.partitionBy("Department").orderBy("salary"))).show()

+-----------+-------------+----------+------+----+
|Employee_ID|Employee_Name|Department|Salary|rank|
+-----------+-------------+----------+------+----+
|         10|         Jack|        HR| 56000|   1|
|          9|          Ian|        HR| 58000|   2|
|          7|        Grace|        HR| 60000|   3|
|          8|        Henry|        HR| 60000|   3|
|          5|          Eva|        IT| 68000|   1|
|          6|        Frank|        IT| 68000|   1|
|          4|        David|        IT| 70000|   3|
|          1|        Alice|     Sales| 75000|   1|
|          2|          Bob|     Sales| 75000|   1|
|          3|      Charlie|     Sales| 75000|   1|
+-----------+-------------+----------+------+----+



#### dense Rank

In [63]:
df.withColumn("dense_rank",dense_rank().over(Window.partitionBy("department").orderBy("salary"))).show()

+-----------+-------------+----------+------+----------+
|Employee_ID|Employee_Name|Department|Salary|dense_rank|
+-----------+-------------+----------+------+----------+
|         10|         Jack|        HR| 56000|         1|
|          9|          Ian|        HR| 58000|         2|
|          7|        Grace|        HR| 60000|         3|
|          8|        Henry|        HR| 60000|         3|
|          5|          Eva|        IT| 68000|         1|
|          6|        Frank|        IT| 68000|         1|
|          4|        David|        IT| 70000|         2|
|          1|        Alice|     Sales| 75000|         1|
|          2|          Bob|     Sales| 75000|         1|
|          3|      Charlie|     Sales| 75000|         1|
+-----------+-------------+----------+------+----------+



# 🏏 Cricket Match Statistics - Pivoting Data in PySpark

## 📌 Problem Statement

You are given a **cricket match dataset** containing the following columns:

- **player_name** → Name of the player.
- **stadium** → The stadium where the match was played.
- **runs** → Runs scored by the player.
- **wickets** → Wickets taken by the player.
- **catches** → Catches taken by the player.

### 🎯 **Task**
- Keep **player_name** as a row index.
- Convert **stadium** values into columns.
- Compute the **sum of runs, wickets, and catches** for each stadium.

---

## 📊 **Cricket Dataset**

| player_name | stadium       | runs | wickets | catches |
|------------|--------------|------|---------|---------|
| JJA        | Wankhede     | 40   | 2       | 1       |
| JJA        | Wankhede     | 60   | 1       | 2       |
| JJA        | Eden Gardens | 25   | 3       | 0       |
| Hardik     | Eden Gardens | 55   | 2       | 1       |
| Hardik     | Eden Gardens | 30   | 1       | 2       |
| Hardik     | Eden Gardens | 45   | 2       | 1       |
| Hardik     | Wankhede     | 45   | 3       | 2       |
| Watson     | Eden Gardens | 20   | 1       | 3       |
| Watson     | Wankhede     | 50   | 4       | 2       |
