In [1]:
!pip install pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("Product-Order-Example").getOrCreate()



### **Creating DataFrames**

In [2]:
# Column names for the two example tables.
product_cols = ["product_id", "name", "category", "price"]
order_cols = ["order_id", "product_id", "quantity", "customer"]

# Product catalogue: one tuple per product, fields in product_cols order.
product_data = [
    (101, "Laptop", "Electronics", 55000),
    (102, "Mobile Phone", "Electronics", 25000),
    (103, "Chair", "Furniture", 5000),
    (104, "Book", "Stationery", 300),
    (105, "Headphones", "Electronics", 3000),
]
product_df = spark.createDataFrame(data=product_data, schema=product_cols)

# Orders: one tuple per order, fields in order_cols order.
# Order 207 deliberately points at product 106, which is not in the catalogue,
# so that the join examples have an unmatched row to work with.
order_data = [
    (201, 101, 2, "Rahul Sharma"),
    (202, 102, 1, "Priya Singh"),
    (203, 103, 4, "Aman Kumar"),
    (204, 104, 10, "Sneha Reddy"),
    (205, 101, 1, "Arjun Mehta"),
    (206, 105, 3, "Rahul Sharma"),
    (207, 106, 1, "Ghost Customer"),
]
order_df = spark.createDataFrame(data=order_data, schema=order_cols)

# Print the catalogue first, then the orders.
for frame in (product_df, order_df):
    frame.show()

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
|       103|       Chair|  Furniture| 5000|
|       104|        Book| Stationery|  300|
|       105|  Headphones|Electronics| 3000|
+----------+------------+-----------+-----+

+--------+----------+--------+--------------+
|order_id|product_id|quantity|      customer|
+--------+----------+--------+--------------+
|     201|       101|       2|  Rahul Sharma|
|     202|       102|       1|   Priya Singh|
|     203|       103|       4|    Aman Kumar|
|     204|       104|      10|   Sneha Reddy|
|     205|       101|       1|   Arjun Mehta|
|     206|       105|       3|  Rahul Sharma|
|     207|       106|       1|Ghost Customer|
+--------+----------+--------+--------------+



### **Transformations**

In [3]:
# Projection: keep only the name and price columns.
product_df.select(["name", "price"]).show()

# Row filter: products priced above 10,000.
product_df.where(product_df.price > 10000).show()

# Sort: most expensive product first.
product_df.orderBy("price", ascending=False).show()

+------------+-----+
|        name|price|
+------------+-----+
|      Laptop|55000|
|Mobile Phone|25000|
|       Chair| 5000|
|        Book|  300|
|  Headphones| 3000|
+------------+-----+

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
+----------+------------+-----------+-----+

+----------+------------+-----------+-----+
|product_id|        name|   category|price|
+----------+------------+-----------+-----+
|       101|      Laptop|Electronics|55000|
|       102|Mobile Phone|Electronics|25000|
|       103|       Chair|  Furniture| 5000|
|       105|  Headphones|Electronics| 3000|
|       104|        Book| Stationery|  300|
+----------+------------+-----------+-----+



### **Aggregations**

In [4]:
# Units ordered per product_id. Only order_df is consulted, so product ids
# that are missing from the catalogue still get a row.
(
    order_df
    .groupBy("product_id")
    .sum("quantity")
    .show()
)

# Number of orders placed by each customer.
(
    order_df
    .groupBy("customer")
    .count()
    .show()
)

# Mean list price within each product category.
(
    product_df
    .groupBy("category")
    .avg("price")
    .show()
)

+----------+-------------+
|product_id|sum(quantity)|
+----------+-------------+
|       103|            4|
|       101|            3|
|       102|            1|
|       104|           10|
|       106|            1|
|       105|            3|
+----------+-------------+

+--------------+-----+
|      customer|count|
+--------------+-----+
|    Aman Kumar|    1|
|  Rahul Sharma|    2|
|   Priya Singh|    1|
|   Arjun Mehta|    1|
|Ghost Customer|    1|
|   Sneha Reddy|    1|
+--------------+-----+

+-----------+------------------+
|   category|        avg(price)|
+-----------+------------------+
|Electronics|27666.666666666668|
| Stationery|             300.0|
|  Furniture|            5000.0|
+-----------+------------------+



### **Joins**

In [5]:
# Equi-join condition shared by the three examples. Because it is a Column
# expression (not a column name), the result keeps the product_id column of
# BOTH inputs.
same_product = order_df["product_id"] == product_df["product_id"]

# Inner join: only orders whose product exists in the catalogue.
order_df.join(product_df, on=same_product, how="inner").show()

# Left join: every order, with product columns left empty when there is no match.
order_df.join(product_df, on=same_product, how="left").show()

# Right join: every product, with order columns left empty when it was never ordered.
order_df.join(product_df, on=same_product, how="right").show()

+--------+----------+--------+------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|    customer|product_id|        name|   category|price|
+--------+----------+--------+------------+----------+------------+-----------+-----+
|     201|       101|       2|Rahul Sharma|       101|      Laptop|Electronics|55000|
|     205|       101|       1| Arjun Mehta|       101|      Laptop|Electronics|55000|
|     202|       102|       1| Priya Singh|       102|Mobile Phone|Electronics|25000|
|     203|       103|       4|  Aman Kumar|       103|       Chair|  Furniture| 5000|
|     204|       104|      10| Sneha Reddy|       104|        Book| Stationery|  300|
|     206|       105|       3|Rahul Sharma|       105|  Headphones|Electronics| 3000|
+--------+----------+--------+------------+----------+------------+-----------+-----+

+--------+----------+--------+--------------+----------+------------+-----------+-----+
|order_id|product_id|quantity|      customer|produc

### **SQL Queries**

In [6]:
# Make both DataFrames queryable from Spark SQL under the table names used below.
for view_name, frame in (("products", product_df), ("orders", order_df)):
    frame.createOrReplaceTempView(view_name)

# Revenue per product = sum over its orders of quantity * unit price.
# The inner JOIN drops orders whose product_id is not in `products`.
revenue_per_product = spark.sql("""
SELECT o.product_id, p.name, SUM(o.quantity * p.price) AS total_revenue
FROM orders o
JOIN products p ON o.product_id = p.product_id
GROUP BY o.product_id, p.name
""")
revenue_per_product.show()

# Query: Top 2 customers by total quantity.
# `customer` acts as a secondary sort key so customers with equal totals are
# always ranked the same way; without it, LIMIT could keep an arbitrary one of
# the tied rows and the result would differ between runs.
spark.sql("""
SELECT customer, SUM(quantity) AS total_quantity
FROM orders
GROUP BY customer
ORDER BY total_quantity DESC, customer ASC
LIMIT 2
""").show()


+----------+------------+-------------+
|product_id|        name|total_revenue|
+----------+------------+-------------+
|       101|      Laptop|       165000|
|       102|Mobile Phone|        25000|
|       103|       Chair|        20000|
|       104|        Book|         3000|
|       105|  Headphones|         9000|
+----------+------------+-------------+

+------------+--------------+
|    customer|total_quantity|
+------------+--------------+
| Sneha Reddy|            10|
|Rahul Sharma|             5|
+------------+--------------+



# **Task - 1**

In [7]:

# Column names for the three Task-1 tables.
students_cols = ["student_id", "name", "age", "city"]
courses_cols = ["course_id", "course_name", "category"]
enrollment_cols = ["student_id", "course_id", "grade"]

# Students: student 6 has no city (None), which shows up as NULL in Spark.
students_data = [
    (1, "Rahul Sharma", 20, "Bangalore"),
    (2, "Priya Singh", 21, "Delhi"),
    (3, "Aman Kumar", 19, "Hyderabad"),
    (4, "Sneha Reddy", 22, "Chennai"),
    (5, "Arjun Mehta", 23, "Mumbai"),
    (6, "Divya Nair", 20, None),
]
students_df = spark.createDataFrame(data=students_data, schema=students_cols)

# Courses on offer.
courses_data = [
    (101, "Python", "Programming"),
    (102, "Data Science", "Analytics"),
    (103, "Databases", "Technology"),
    (104, "Business Studies", "Management"),
]
courses_df = spark.createDataFrame(data=courses_data, schema=courses_cols)

# Enrollments: the last row references student 7, who does not exist in
# students_data, to give the join exercises an unmatched row.
enrollment_data = [
    (1, 101, "A"),
    (2, 101, "B"),
    (3, 102, "A"),
    (4, 103, "C"),
    (5, 102, "B"),
    (7, 104, "A"),
]
enrollment_df = spark.createDataFrame(data=enrollment_data, schema=enrollment_cols)

# Print students, then courses, then enrollments.
for frame in (students_df, courses_df, enrollment_df):
    frame.show()


+----------+------------+---+---------+
|student_id|        name|age|     city|
+----------+------------+---+---------+
|         1|Rahul Sharma| 20|Bangalore|
|         2| Priya Singh| 21|    Delhi|
|         3|  Aman Kumar| 19|Hyderabad|
|         4| Sneha Reddy| 22|  Chennai|
|         5| Arjun Mehta| 23|   Mumbai|
|         6|  Divya Nair| 20|     NULL|
+----------+------------+---+---------+

+---------+----------------+-----------+
|course_id|     course_name|   category|
+---------+----------------+-----------+
|      101|          Python|Programming|
|      102|    Data Science|  Analytics|
|      103|       Databases| Technology|
|      104|Business Studies| Management|
+---------+----------------+-----------+

+----------+---------+-----+
|student_id|course_id|grade|
+----------+---------+-----+
|         1|      101|    A|
|         2|      101|    B|
|         3|      102|    A|
|         4|      103|    C|
|         5|      102|    B|
|         7|      104|    A|
+--------

### **Transformations/Task - 1**

In [8]:
# 1. Name and city of every student.
students_df.select(["name", "city"]).show()

# 2. Students strictly older than 20.
students_df.where(students_df.age > 20).show()

# 3. Courses whose category is exactly "Analytics".
courses_df.where(courses_df.category == "Analytics").show()


+------------+---------+
|        name|     city|
+------------+---------+
|Rahul Sharma|Bangalore|
| Priya Singh|    Delhi|
|  Aman Kumar|Hyderabad|
| Sneha Reddy|  Chennai|
| Arjun Mehta|   Mumbai|
|  Divya Nair|     NULL|
+------------+---------+

+----------+-----------+---+-------+
|student_id|       name|age|   city|
+----------+-----------+---+-------+
|         2|Priya Singh| 21|  Delhi|
|         4|Sneha Reddy| 22|Chennai|
|         5|Arjun Mehta| 23| Mumbai|
+----------+-----------+---+-------+

+---------+------------+---------+
|course_id| course_name| category|
+---------+------------+---------+
|      102|Data Science|Analytics|
+---------+------------+---------+



### **Aggregations/Task - 1**

In [9]:
# Import the functions module under an alias rather than pulling `max` and
# `min` into the notebook namespace: a bare import would replace Python's
# builtin max()/min() for every later cell in this kernel.
from pyspark.sql import functions as F

# 1. Count how many students are enrolled in each course
enrollment_df.groupBy("course_id").count().show()

# 2. Find the average age of students per city
#    (students with no city form their own NULL group)
students_df.groupBy("city").agg(F.avg("age").alias("avg_age")).show()

# 3. Get the maximum and minimum age of students
students_df.agg(
    F.max("age").alias("max_age"),
    F.min("age").alias("min_age"),
).show()


+---------+-----+
|course_id|count|
+---------+-----+
|      101|    2|
|      102|    2|
|      103|    1|
|      104|    1|
+---------+-----+

+---------+-------+
|     city|avg_age|
+---------+-------+
|Bangalore|   20.0|
|    Delhi|   21.0|
|Hyderabad|   19.0|
|  Chennai|   22.0|
|     NULL|   20.0|
|   Mumbai|   23.0|
+---------+-------+

+-------+-------+
|max_age|min_age|
+-------+-------+
|     23|     19|
+-------+-------+



### **Joins / Task - 1**

In [10]:
# 1. Which student took which course: students matched to their enrollments.
students_df.join(enrollment_df, on="student_id", how="inner").show()

# 2. Every enrollment, enriched with the details of its course.
enrollment_df.join(courses_df, on="course_id", how="left").show()

# 3. Students with no enrollment at all (anti join keeps unmatched left rows).
students_df.join(enrollment_df, on="student_id", how="left_anti").show()

# 4. Courses nobody is enrolled in (same anti-join idea, keyed on course_id).
courses_df.join(enrollment_df, on="course_id", how="left_anti").show()


+----------+------------+---+---------+---------+-----+
|student_id|        name|age|     city|course_id|grade|
+----------+------------+---+---------+---------+-----+
|         1|Rahul Sharma| 20|Bangalore|      101|    A|
|         2| Priya Singh| 21|    Delhi|      101|    B|
|         3|  Aman Kumar| 19|Hyderabad|      102|    A|
|         4| Sneha Reddy| 22|  Chennai|      103|    C|
|         5| Arjun Mehta| 23|   Mumbai|      102|    B|
+----------+------------+---+---------+---------+-----+

+---------+----------+-----+----------------+-----------+
|course_id|student_id|grade|     course_name|   category|
+---------+----------+-----+----------------+-----------+
|      101|         1|    A|          Python|Programming|
|      101|         2|    B|          Python|Programming|
|      102|         3|    A|    Data Science|  Analytics|
|      103|         4|    C|       Databases| Technology|
|      104|         7|    A|Business Studies| Management|
|      102|         5|    B|   

### **SQL Queries / Task - 1**

In [11]:
# Make the three DataFrames queryable from Spark SQL under the table names
# used by the queries in this section.
for view_name, frame in (
    ("students", students_df),
    ("courses", courses_df),
    ("enrollments", enrollment_df),
):
    frame.createOrReplaceTempView(view_name)

# 1. Student name, course name and grade for every enrollment that matches
#    both a known student and a known course (inner joins on both sides).
student_course_grades = spark.sql("""
SELECT s.name, c.course_name, e.grade
FROM enrollments e
JOIN students s ON e.student_id = s.student_id
JOIN courses c ON e.course_id = c.course_id
""")
student_course_grades.show()

# 2. Per course, how many enrollments carry grade "A". Students are not joined
#    here, so enrollments of unknown students are counted as well.
grade_a_per_course = spark.sql("""
SELECT c.course_name, COUNT(*) AS grade_A_count
FROM enrollments e
JOIN courses c ON e.course_id = c.course_id
WHERE e.grade = 'A'
GROUP BY c.course_name
""")
grade_a_per_course.show()

# 3. Top city with the most students enrolled in courses.
# Several cities can share the highest count (in the sample data every city has
# exactly one enrolled student), so the city name is used as a secondary sort
# key; without it LIMIT 1 would return an arbitrary one of the tied cities.
spark.sql("""
SELECT s.city, COUNT(*) AS total_enrolled
FROM enrollments e
JOIN students s ON e.student_id = s.student_id
GROUP BY s.city
ORDER BY total_enrolled DESC, city ASC
LIMIT 1
""").show()


+------------+------------+-----+
|        name| course_name|grade|
+------------+------------+-----+
| Priya Singh|      Python|    B|
|Rahul Sharma|      Python|    A|
| Arjun Mehta|Data Science|    B|
|  Aman Kumar|Data Science|    A|
| Sneha Reddy|   Databases|    C|
+------------+------------+-----+

+----------------+-------------+
|     course_name|grade_A_count|
+----------------+-------------+
|Business Studies|            1|
|          Python|            1|
|    Data Science|            1|
+----------------+-------------+

+---------+--------------+
|     city|total_enrolled|
+---------+--------------+
|Bangalore|             1|
+---------+--------------+



In [12]:
!pip install pyspark

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("RDD-Example").getOrCreate()

# Get sparkcontext
sc = spark.sparkContext



### 📝 **What is RDD?**

**RDD** stands for **Resilient Distributed Dataset**.

It is the **core data structure of Apache Spark** — an **immutable distributed collection of objects** that can be processed in parallel across a cluster.

👉 In simple words:

* Think of an RDD as a **giant list** that, instead of sitting on one machine, is **spread across multiple machines**.
* Spark can apply functions to this distributed data in parallel, making it fast and fault-tolerant.

---

### 📝 **Key Properties of RDD**

* **Resilient** → Fault-tolerant (can recover lost data automatically using lineage).
* **Distributed** → Data is split into partitions across cluster nodes.
* **Dataset** → Collection of elements (numbers, strings, objects, rows, etc.).
* **Immutable** → Once created, cannot be changed — only transformed into new RDDs.


In [13]:
# Source data: the integers 1 through 9 as an ordinary Python list.
data = list(range(1, 10))

# Distribute the list as an RDD through the SparkContext.
rdd = sc.parallelize(data)

# collect() brings every element back to the driver so it can be printed.
print("RDD elements:", rdd.collect())

RDD elements: [1, 2, 3, 4, 5, 6, 7, 8, 9]


In [14]:
# Transformation 1 (map): replace every number by its square.
squared_rdd = rdd.map(lambda n: n ** 2)

# Transformation 2 (filter): keep only the numbers with no remainder mod 2.
even_rdd = rdd.filter(lambda n: n % 2 == 0)


In [15]:
# Bring the two derived RDDs back to the driver, then print them.
squares = squared_rdd.collect()
evens = even_rdd.collect()
print("Squared:", squares)
print("Even:", evens)

# Summary actions on the original RDD.
element_count = rdd.count()
element_sum = rdd.sum()
element_max = rdd.max()
print("Count:", element_count)
print("Sum:", element_sum)
print("Max:", element_max)

Squared: [1, 4, 9, 16, 25, 36, 49, 64, 81]
Even: [2, 4, 6, 8]
Count: 9
Sum: 45
Max: 9


In [16]:
# Input: three short lines of text.
text = [
    "hello world",
    "hello spark",
    "big data with spark",
]

# One RDD element per line.
text_rdd = sc.parallelize(text)

# flatMap flattens the per-line word lists into a single RDD of words.
words = text_rdd.flatMap(lambda sentence: sentence.split(" "))

# Pair every word with an initial count of 1: "hello" -> ("hello", 1).
word_pairs = words.map(lambda token: (token, 1))

# Add up the 1s of identical words:
# ("hello", 1) + ("hello", 1) -> ("hello", 2)
word_count = word_pairs.reduceByKey(lambda left, right: left + right)

print("Word Count:", word_count.collect())

Word Count: [('hello', 2), ('world', 1), ('big', 1), ('with', 1), ('spark', 2), ('data', 1)]


 # **RDD-Exercises 1**

In [17]:
from pyspark.sql import SparkSession

# Session and context shared by all RDD exercise cells below.
spark = (
    SparkSession.builder
    .appName("RDD-Exercises")
    .getOrCreate()
)
sc = spark.sparkContext


### **Numbers Practice/ RDD-Exercise**

In [18]:
# RDD holding the integers 1..15 (range's upper bound is exclusive).
nums = sc.parallelize(range(1, 16))

# Keep the multiples of 3.
div_by_3 = nums.filter(lambda n: n % 3 == 0)
print("Divisible by 3:", div_by_3.collect())

# Multiply every element by two.
doubled = nums.map(lambda n: n * 2)
print("Doubled:", doubled.collect())

# How many elements are strictly greater than 10.
greater_than_10 = nums.filter(lambda n: n > 10)
count_gt_10 = greater_than_10.count()
print("Count > 10:", count_gt_10)


Divisible by 3: [3, 6, 9, 12, 15]
Doubled: [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Count > 10: 5


### **String Processing/ RDD-Exercise**

In [19]:

# Fruit names, with deliberate repeats ("apple" and "banana" appear twice).
fruits = sc.parallelize(["apple", "banana", "grape", "banana", "apple", "mango"])

# Distinct fruits
print("Distinct:", fruits.distinct().collect())

# Count each fruit: pair every name with 1, then sum the 1s per name.
fruit_counts = fruits.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
print("Counts:", fruit_counts.collect())

# Longest word.
# RDD.reduce needs a commutative and associative function. Comparing on
# (length, word) gives one well-defined winner even when two different words
# have the same length, so the answer does not depend on how the data is
# partitioned.
longest = fruits.reduce(lambda a, b: a if (len(a), a) >= (len(b), b) else b)
print("Longest word:", longest)


Distinct: ['apple', 'banana', 'grape', 'mango']
Counts: [('apple', 2), ('banana', 2), ('grape', 1), ('mango', 1)]
Longest word: banana


### **Sentence Split/ RDD-Exercise**

In [20]:
# Three sample sentences, one RDD element each.
sentence_list = [
    "spark makes big data easy",
    "rdd is the core of spark",
    "python with spark",
]
sentences = sc.parallelize(sentence_list)

# Break every sentence on single spaces and flatten into one RDD of words.
words = sentences.flatMap(lambda sentence: sentence.split(" "))

# Normalise to lowercase, then drop repeated words.
lowered = words.map(str.lower)
unique_words = lowered.distinct()
print("Unique words:", unique_words.collect())

# Size of the de-duplicated vocabulary.
print("Total unique words:", unique_words.count())


Unique words: ['big', 'easy', 'rdd', 'core', 'of', 'python', 'with', 'spark', 'makes', 'data', 'is', 'the']
Total unique words: 12


### **Pair RDD Operations/ RDD-Exercise**

In [21]:

# (student, mark) pairs; Rahul and Priya each appear twice.
marks = sc.parallelize([
    ("Rahul", 85), ("Priya", 92), ("Aman", 78),
    ("Rahul", 90), ("Priya", 88)
])

# Total marks per student
total_marks = marks.reduceByKey(lambda a, b: a + b)
print("Total Marks:", total_marks.collect())

# Average marks per student:
#   mark -> (mark, 1), then sum both fields per student to get (total, count),
#   then divide total by count.
count_marks = marks.mapValues(lambda x: (x, 1)) \
                   .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1])) \
                   .mapValues(lambda x: x[0] / x[1])
print("Average Marks:", count_marks.collect())

# Student with highest marks (overall).
# RDD.reduce needs a commutative and associative function. Comparing on
# (mark, name) gives one well-defined winner even when two records carry the
# same mark, so the answer does not depend on how the data is partitioned.
highest = marks.reduce(lambda a, b: a if (a[1], a[0]) >= (b[1], b[0]) else b)
print("Highest Marks:", highest)


Total Marks: [('Rahul', 175), ('Priya', 180), ('Aman', 78)]
Average Marks: [('Rahul', 87.5), ('Priya', 90.0), ('Aman', 78.0)]
Highest Marks: ('Priya', 92)


### **Reduce & Aggregate/ RDD-Exercise**

In [22]:

# Five sample values to fold with reduce().
nums = sc.parallelize([5, 10, 15, 20, 25])

# Sum
total_sum = nums.reduce(lambda a, b: a + b)
print("Sum:", total_sum)

# Product
product = nums.reduce(lambda a, b: a * b)
print("Product:", product)

# Average (sum ÷ count).
# Reuse total_sum from above instead of running the same reduce a second time;
# only the count needs another pass over the data.
avg = total_sum / nums.count()
print("Average:", avg)


Sum: 75
Product: 375000
Average: 15.0


### **Word Length Analysis/ RDD-Exercise**

In [23]:
# Sample vocabulary for the length analysis.
words = sc.parallelize(["data", "engineering", "spark", "rdd", "pyspark", "analytics"])

# (word, length)
word_len = words.map(lambda w: (w, len(w)))
print("Word lengths:", word_len.collect())

# Longest word.
# RDD.reduce needs a commutative and associative function. Comparing on
# (length, word) gives one well-defined winner even when two different words
# have the same length, so the answer does not depend on how the data is
# partitioned.
longest = words.reduce(lambda a, b: a if (len(a), a) >= (len(b), b) else b)
print("Longest word:", longest)

# Average length: total number of characters divided by the number of words.
avg_len = words.map(lambda w: len(w)).reduce(lambda a, b: a + b) / words.count()
print("Average length:", avg_len)


Word lengths: [('data', 4), ('engineering', 11), ('spark', 5), ('rdd', 3), ('pyspark', 7), ('analytics', 9)]
Longest word: engineering
Average length: 6.5


### **Joins/ RDD-Exercise**

In [24]:
# Two pair RDDs keyed by id. Key 3 exists only in `students` and key 4 only in
# `courses`, so each join flavour below gives a different result.
students = sc.parallelize([
    (1, "Rahul"),
    (2, "Priya"),
    (3, "Aman"),
])
courses = sc.parallelize([
    (1, "Python"),
    (2, "Spark"),
    (4, "Databases"),
])

# Inner join: keys present on both sides.
inner = students.join(courses)
inner_rows = inner.collect()
print("Inner Join:", inner_rows)

# Left outer join: every student key; the course is None when there is no match.
left_outer = students.leftOuterJoin(courses)
left_rows = left_outer.collect()
print("Left Outer Join:", left_rows)

# Right outer join: every course key; the student is None when there is no match.
right_outer = students.rightOuterJoin(courses)
right_rows = right_outer.collect()
print("Right Outer Join:", right_rows)


Inner Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]
Left Outer Join: [(1, ('Rahul', 'Python')), (2, ('Priya', 'Spark')), (3, ('Aman', None))]
Right Outer Join: [(4, (None, 'Databases')), (1, ('Rahul', 'Python')), (2, ('Priya', 'Spark'))]


### **Mini Real-World/ RDD-Exercise**

In [25]:
# (customer_id, order_amount) pairs; customers 1 and 2 have two orders each.
orders = sc.parallelize([
    (1, 200), (2, 500), (3, 300),
    (1, 150), (2, 250)
])

# Total spend per customer
total_spend = orders.reduceByKey(lambda a, b: a + b)
print("Total Spend:", total_spend.collect())

# Customer with max spend.
# RDD.reduce needs a commutative and associative function. Comparing on
# (spend, customer_id) gives one well-defined winner even when two customers
# spent the same amount, so the answer does not depend on how the data is
# partitioned.
max_customer = total_spend.reduce(
    lambda a, b: a if (a[1], a[0]) >= (b[1], b[0]) else b
)
print("Max Spend Customer:", max_customer)

# Total revenue: sum of the amount field over all orders.
total_revenue = orders.map(lambda x: x[1]).sum()
print("Total Revenue:", total_revenue)


Total Spend: [(2, 750), (1, 350), (3, 300)]
Max Spend Customer: (2, 750)
Total Revenue: 1400
