In [None]:
!pip install pyspark



In [None]:
from pyspark.sql import SparkSession

# Configure a session named "Jasmine-colab", then obtain it from the builder.
session_builder = SparkSession.builder.appName("Jasmine-colab")
spark = session_builder.getOrCreate()

# Report which Spark release this session is running on.
print("Apache Spark Version:", spark.version)

Apache Spark Version: 3.5.1


In [None]:
# Column names, listed in the same order as the fields of each row tuple.
columns = ["Name", "Age"]

# Sample rows as (name, age) tuples.
data = [
    ("Jasmine", 22),
    ("Priya", 21),
    ("Aman", 20),
]

# Build the DataFrame from the rows and the column names.
df = spark.createDataFrame(data, schema=columns)

# Print the DataFrame as a text table.
df.show()

+-------+---+
|   Name|Age|
+-------+---+
|Jasmine| 22|
|  Priya| 21|
|   Aman| 20|
+-------+---+



In [None]:
# Project only the "Name" column.
names_only = df.select("Name")
names_only.show()

# Keep the rows whose Age is greater than 20.
older_than_20 = df.filter(df["Age"] > 20)
older_than_20.show()

# Count every row in the DataFrame.
total_rows = df.count()
print("Total rows:", total_rows)

+-------+
|   Name|
+-------+
|Jasmine|
|  Priya|
|   Aman|
+-------+

+-------+---+
|   Name|Age|
+-------+---+
|Jasmine| 22|
|  Priya| 21|
+-------+---+

Total rows: 3


In [None]:
import csv
import io


def _format_employee(record):
    """Render one CSV record (a mapping keyed by column name) as a display line."""
    return f"{record['id']} - {record['name']} ({record['department']}) → ₹{record['salary']}"


# Inline CSV text: a header row followed by five employee rows.
csv_data = """id,name,department,salary
1,Rahul Sharma,IT,55000
2,Priya Singh,HR,60000
3,Aman Kumar,Finance,48000
4,Sneha Reddy,Marketing,52000
5,Arjun Mehta,IT,75000
"""

# Wrap the text in an in-memory buffer so the csv module can read it like a file.
file_like = io.StringIO(csv_data)

# DictReader takes the keys from the header row and yields one mapping per data row.
reader = csv.DictReader(file_like)

print("Employee Records:")
for row in reader:
    print(_format_employee(row))

Employee Records:
1 - Rahul Sharma (IT) → ₹55000
2 - Priya Singh (HR) → ₹60000
3 - Aman Kumar (Finance) → ₹48000
4 - Sneha Reddy (Marketing) → ₹52000
5 - Arjun Mehta (IT) → ₹75000


In [None]:
import json


def _format_student(student):
    """Render one student dictionary as a display line."""
    return f"{student['id']} - {student['name']} ({student['city']}) → Age {student['age']}"


# JSON text holding an array of three student objects.
json_data = '''
[
  { "id": 1, "name": "Rahul Sharma", "age": 21, "city": "Bangalore" },
{ "id": 2, "name": "Priya Singh", "age": 22, "city": "Delhi" },
{ "id": 3, "name": "Aman Kumar", "age": 20, "city": "Hyderabad" }
]
'''

# Decode the JSON array into a Python list of dictionaries.
students = json.loads(json_data)

# Print a heading, then one formatted line per student.
print ("Student Records:")
for s in students:
    print(_format_student(s))

Student Records:
1 - Rahul Sharma (Bangalore) → Age 21
2 - Priya Singh (Delhi) → Age 22
3 - Aman Kumar (Hyderabad) → Age 20


In [None]:
import json

# JSON text holding an array of two student objects.
json_data = '''
[
  {"id": 1, "name": "Rahul Sharma", "age": 21, "city": "Bangalore"},
  {"id": 2, "name": "Priya Singh", "age": 22, "city": "Delhi"}
]
'''

# Decode the JSON array into a Python list of dictionaries.
students = json.loads(json_data)

# Add a third record to the end of the list.
new_student = {
    "id": 3,
    "name": "Aman Kumar",
    "age": 20,
    "city": "Hyderabad",
}
students.append(new_student)

# Change the city on every record whose id is 1; all other records are skipped.
for s in students:
    if s["id"] != 1:
        continue
    s["city"] = "Pune"  # Rahul moved from Bangalore to Pune

# Encode the modified list back into JSON text, indented by two spaces.
updated_json = json.dumps(students, indent=2)

# print() separates its two arguments with a space, so the JSON text starts
# one space after the newline that ends the heading.
print("Updated JSON Data:\n", updated_json)

Updated JSON Data:
 [
  {
    "id": 1,
    "name": "Rahul Sharma",
    "age": 21,
    "city": "Pune"
  },
  {
    "id": 2,
    "name": "Priya Singh",
    "age": 22,
    "city": "Delhi"
  },
  {
    "id": 3,
    "name": "Aman Kumar",
    "age": 20,
    "city": "Hyderabad"
  }
]


In [None]:
!pip install pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName ("Employee-Analysis").getOrCreate()



In [None]:
import io

# Inline CSV text: a header row followed by six employee rows.
csv_data = """id,name,department,salary
1,Rahul Sharma,IT,55000
2,Priya Singh,HR,60000
3,Aman Kumar,Finance,48000
4,Sneha Reddy,Marketing,52000
5,Arjun Mehta,IT,75000
6,Divya Nair,Finance,67000
"""

# Write the text to employees.csv in the current working directory.
# The encoding is stated explicitly so the bytes on disk do not depend on the
# platform's default locale encoding.
with open("employees.csv", "w", encoding="utf-8") as f:
    f.write(csv_data)

In [None]:
# Load employees.csv into a DataFrame: the first line supplies the column
# names (header=True) and Spark is asked to infer the column types (inferSchema=True).
employees_path = "employees.csv"
df = spark.read.csv(
    employees_path,
    header=True,
    inferSchema=True,
)

# Print the loaded rows as a text table.
df.show()

+---+------------+----------+------+
| id|        name|department|salary|
+---+------------+----------+------+
|  1|Rahul Sharma|        IT| 55000|
|  2| Priya Singh|        HR| 60000|
|  3|  Aman Kumar|   Finance| 48000|
|  4| Sneha Reddy| Marketing| 52000|
|  5| Arjun Mehta|        IT| 75000|
|  6|  Divya Nair|   Finance| 67000|
+---+------------+----------+------+



**Transformations**

# 📝 Key Points about Transformations

* **Lazy Execution**:
  Spark doesn’t run transformations right away. Instead, it builds a **logical plan** (a DAG – Directed Acyclic Graph).
  The computation only runs when an **action** (like `.show()` or `.count()`) is called.

* **Return Type**:
  A transformation always returns a **new DataFrame or RDD**. It does **not modify the existing one**.

* **Two Types of Transformations**:

  1. **Narrow Transformations** → Each input partition contributes to only one output partition.
     (e.g., `map()`, `filter()`, `select()`)
  2. **Wide Transformations** → Each output partition may depend on several input partitions, so data is shuffled across partitions.
     (e.g., `groupBy()`, `join()`)

---

In [None]:
# Project only the name and salary columns.
name_and_salary = df.select("name", "salary")
name_and_salary.show()

# Keep the employees whose salary is greater than 60,000.
high_earners = df.filter(df["salary"] > 60000)
high_earners.show()

# Sort all employees from highest to lowest salary.
salary_descending = df["salary"].desc()
df.orderBy(salary_descending).show()

+------------+------+
|        name|salary|
+------------+------+
|Rahul Sharma| 55000|
| Priya Singh| 60000|
|  Aman Kumar| 48000|
| Sneha Reddy| 52000|
| Arjun Mehta| 75000|
|  Divya Nair| 67000|
+------------+------+

+---+-----------+----------+------+
| id|       name|department|salary|
+---+-----------+----------+------+
|  5|Arjun Mehta|        IT| 75000|
|  6| Divya Nair|   Finance| 67000|
+---+-----------+----------+------+

+---+------------+----------+------+
| id|        name|department|salary|
+---+------------+----------+------+
|  5| Arjun Mehta|        IT| 75000|
|  6|  Divya Nair|   Finance| 67000|
|  2| Priya Singh|        HR| 60000|
|  1|Rahul Sharma|        IT| 55000|
|  4| Sneha Reddy| Marketing| 52000|
|  3|  Aman Kumar|   Finance| 48000|
+---+------------+----------+------+



# 📝 What is Aggregation?

* An operation that **groups data** and applies a **summary function** (like sum, avg, count, min, max).
* Used to answer questions like:

  * *“What is the average salary per department?”*
  * *“How many employees are in each department?”*
  * *“What is the highest salary in Finance?”*

---

In [None]:
# Group the rows by department once and reuse the grouping for each summary.
by_department = df.groupBy("department")

# Average salary per department.
by_department.avg("salary").show()

# Maximum salary per department.
by_department.max("salary").show()

# Number of employees per department.
by_department.count().show()

+----------+-----------+
|department|avg(salary)|
+----------+-----------+
|        HR|    60000.0|
|   Finance|    57500.0|
| Marketing|    52000.0|
|        IT|    65000.0|
+----------+-----------+

+----------+-----------+
|department|max(salary)|
+----------+-----------+
|        HR|      60000|
|   Finance|      67000|
| Marketing|      52000|
|        IT|      75000|
+----------+-----------+

+----------+-----+
|department|count|
+----------+-----+
|        HR|    1|
|   Finance|    2|
| Marketing|    1|
|        IT|    2|
+----------+-----+



In [11]:
# Register the DataFrame under the name "employees" so SQL queries can refer to it.
df.createOrReplaceTempView("employees")

# Compute the average salary per department with a SQL query and print the result.
avg_salary_by_department = spark.sql(
    "SELECT department, AVG(salary) as avg_salary FROM employees GROUP BY department"
)
avg_salary_by_department.show()

+----------+----------+
|department|avg_salary|
+----------+----------+
|        HR|   60000.0|
|   Finance|   57500.0|
| Marketing|   52000.0|
|        IT|   65000.0|
+----------+----------+

