In [1]:
# pip install pyspark
from pyspark import SparkContext, RDD

DIR_PATH = "Sparks_dataset/Sparks_dataset"
DIR_CONTEXT = DIR_PATH + "/Context"
DIR_DATA_FRAME = DIR_PATH + "/Data Frame"
DIR_SPARK_PANDAS = DIR_PATH + "/Spark Pandas"

# Exercise 1 - Basic Order Analysis
Tasks:

1. Load the dataset into Spark using SparkContext and textFile().
2. Count the total number of orders.
3. Calculate the total revenue from all orders.
4. Find the number of unique customers.

In [None]:
# 1. Load the dataset into Spark using SparkContext and textFile().
sc = SparkContext("local", "ST2CBD-Lab2-OrderAnalysis")

# Load the dataset
salesdata_rdd = sc.textFile(f"{DIR_CONTEXT}/salesdata.csv.txt")

In [None]:
# 2. Count the total number of orders.
# Because the file has 2 header lines, we need to remove them before counting
header = salesdata_rdd.take(2)

# We remove the header lines from the RDD
salesdata_rdd = salesdata_rdd.filter(lambda line: line != header[0] and line != header[1])

# Count the total number of orders
total_orders = salesdata_rdd.count()

total_orders

In the CSV file, we have 2 header lines. In order to count the total number of orders, we need to remove the header lines from the RDD.

Total number of orders: 10

In [None]:
# 3. Calculate the total revenue from all orders.
# Index of OrderAmount is 2

sales_revenues = salesdata_rdd.map(lambda line: float(line.split(",")[2]))

total_revenue = sales_revenues.sum()

total_orders

Total orders = 10

In [None]:
# 4. Find the number of unique customers.
# Index of CustomerID is 1

unique_customers = salesdata_rdd.map(lambda line: line.split(",")[1]).distinct()

total_unique_customers = unique_customers.count()

total_unique_customers

Total unique customers = 6

In [None]:
sc.stop()

# Exercise 2 - Department Salary Stats
Tasks:

1. Load employee data and skip header.
2. Find total number of employees.
3. Calculate average salary per department.
4. Find department with highest total salary.

In [2]:
# 1. Load employee data and skip header.
sc = SparkContext("local", "ST2CBD-Lab2-DepartmentSalaryStats")

# Load the dataset
salesdata_rdd = sc.textFile(f"{DIR_CONTEXT}/2 salary.csv")

# Skip header
header = salesdata_rdd.take(1)

salesdata_rdd = salesdata_rdd.filter(lambda line: line != header[0])

salesdata_rdd.take(1)

25/05/14 08:41:24 WARN Utils: Your hostname, MacBook-Pro-de-Lucas.local resolves to a loopback address: 127.0.0.1; using 10.101.8.226 instead (on interface en0)
25/05/14 08:41:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/14 08:41:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

["'E101,Alice,Sales,50000']"]

Result should not be the header and be the first line: ['E101,Alice,Sales,50000']

In [3]:
# 2. Find total number of employees.

total_employees = salesdata_rdd.count()

total_employees

8

Total employees = 8

In [5]:
# 3. Calculate average salary per department.

average_salary_per_department = salesdata_rdd.map(lambda line: (line.split(",")[2], float(line.split(",")[3]))) \
    .groupByKey() \
    .mapValues(lambda salaries: sum(salaries) / len(salaries)).collect()

average_salary_per_department

[('Sales', 54000.0), ('Marketing', 56500.0), ('IT', 69000.0)]

Departements average salaries:
- Sales: 54000
- Marketing: 56500
- IT: 69000

In [None]:
# 4. Find department with the highest total salary.

highest_salary_department = salesdata_rdd.map(lambda line: (line.split(",")[2], float(line.split(",")[3]))) \
    .groupByKey() \
    .mapValues(lambda salaries: sum(salaries)) \
    .sortBy(lambda x: x[1], ascending=False) \
    .first()

highest_salary_department

In [None]:
sc.stop()

# Exercise 3: Web Traffic Summary
Dataset: web_logs.txt

Tasks:
1. Load the data and parse it into fields.
2. Count total number of visits per URL.
3. Find unique users per URL.
4. Determine peak hour for traffic.

In [None]:
# 1. Load the data and parse it into fields.

sc = SparkContext("local", "ST2CBD-Lab2-WebTrafficSummary")
web_logs_rdd = sc.textFile(f"{DIR_CONTEXT}/3 web_logs.txt")

# Skip header
header = web_logs_rdd.take(1)
web_logs_rdd = web_logs_rdd.filter(lambda line: line != header[0])

# Parse the data into fields
web_logs_rdd = web_logs_rdd.map(lambda line: line.split(","))

In [None]:
# 2. Count total number of visits per URL.

url_visits = web_logs_rdd.map(lambda fields: (fields[1], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

url_visits

We take the URL and the number of visits -> Reduce by key to count visits (line = 1 visit).

Result: [('/home', 6), ('/product', 2), ('/contact', 1), ('/about', 1)]

In [None]:
# 3. Find unique users per URL.

unique_users_per_url = web_logs_rdd.map(lambda fields: (fields[1], fields[0])) \
    .distinct() \
    .map(lambda x: (x[0], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

unique_users_per_url

First we take the URL and the user ID -> Remove duplicates -> Reduce by key to count unique users.

Result: [('/home', 4), ('/product', 2), ('/contact', 1), ('/about', 1)]

In [None]:
# 4. Determine peak hour for traffic.
from datetime import datetime

hour_counts = web_logs_rdd.map(lambda fields: (datetime.strptime(fields[2], "%Y-%m-%d %H:%M:%S").hour, 1)) \
    .reduceByKey(lambda a, b: a + b)

peak_hour = hour_counts.sortBy(lambda x: x[1], ascending=False).first()

peak_hour

We take the field index 2 (Timestamp) and convert it to a datetime object -> Get the hour -> Reduce by key to count visits per hour.

After that we sort by the number of visits and take the first one.

Result: (13, 2) -> 13:00 w/ 2 visits

In [None]:
sc.stop()

# Exercise 4: Product Rating Analysis
Tasks: Dataset: product_reviews.csv
1. Load and parse the review data.
2. Calculate the average rating per product.
3. Count number of reviews per product.
4. Find products with all 5-star reviews.

In [None]:
def parse_rdd_csv(file_path, header_lines=1) -> RDD[str]:
    rdd = sc.textFile(file_path)
    header = rdd.take(header_lines)
    rdd = rdd.filter(lambda line: line not in header)

    rdd = rdd.map(lambda line: line.split(","))
    return rdd

Created a function for parsing input files. It takes the file path and the number of header lines to skip (header). It returns an RDD with the parsed data.

In [None]:
# 1. Load and parse the review data.
sc = SparkContext("local", "ST2CBD-Lab2-ProductRatingAnalysis")
production_reviews_rdd = parse_rdd_csv(f"{DIR_CONTEXT}/4 product_reviews.csv")

In [None]:
# 2. Calculate the average rating per product
average_rating_per_product = production_reviews_rdd.map(lambda fields: (fields[1], float(fields[2]))) \
    .groupByKey() \
    .mapValues(lambda ratings: sum(ratings) / len(ratings)) \
    .collect()

average_rating_per_product

We take the second (product) and third field (score) -> Group by product -> Calculate the average score.

In [None]:
# 3. Count number of reviews per product

reviews_per_product = production_reviews_rdd.map(lambda fields: (fields[1], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

reviews_per_product

We take the second field (product) and count the number of reviews (each line correspond to one product review).

In [None]:
# 4. Find products with all 5-star reviews.

five_star_products = production_reviews_rdd.map(lambda fields: (fields[1], float(fields[2]))) \
    .groupByKey() \
    .filter(lambda x: all(rating == 5.0 for rating in x[1])) \
    .map(lambda x: x[0]) \
    .collect()

five_star_products

We take the second field (product) and the third field (score) -> Group by product -> Filter products with all 5-star reviews

In [None]:
sc.stop()

# Exercise 5: Movie Ratings Analytics
Objective: Learn to read a CSV file, initialize SparkContext, and perform basic operations.

Tasks:
1. Load the movies.csv using SparkContext.textFile.
2. Count the total number of movies.
3. Filter movies with rating >= 4.
4. Display the top 5 movies by rating.

In [40]:
# 1. Load the movies.csv using SparkContext.textFile.
sc = SparkContext("local", "ST2CBD-Lab2-MovieRatingsAnalytics")
movies_rdd = parse_rdd_csv(f"{DIR_CONTEXT}/5 movies.csv")

In [None]:
# 2. Count the total number of movies.
total_movies = movies_rdd.count()
total_movies

1 line = 1 movie

Total movies = 5

In [None]:
# 3. Filter movies with rating >= 4.
movies_rated_4_or_plus = movies_rdd.filter(lambda fields: float(fields[3]) >= 4.0) \
    .map(lambda fields: (fields[1], fields[2], float(fields[3]))) \
    .collect()

movies_rated_4_or_plus

We filter lines with rating >= 4.0 -> We take the second field (movie name), third field (genre) and fourth field (rating).

In [42]:
# 4. Display the top 5 movies by rating. (Sort + Limit)
top_5_movies = movies_rdd.map(lambda fields: (fields[1], fields[2], float(fields[3]))) \
    .sortBy(lambda fields: float(fields[2]), ascending=False) \
    .take(5)

top_5_movies

[('The Dark Knight', 'Action', 4.9),
 ('Inception', 'Sci-Fi', 4.8),
 ('Interstellar', 'Sci-Fi', 4.7),
 ('The Notebook', 'Romance', 4.2),
 ('Fast & Furious', 'Action', 3.9)]

We take the second field (movie name), third field (genre) and fourth field (rating) -> Sort by rating -> Take the top 5 movies.

In [47]:
sc.stop()

# Exercise 6: Student Scores Analysis
Objective: Understand RDD transformations and actions.

Tasks:
1. Load data using SparkContext.textFile.
2. Map each line to a key-value pair (name, score).
3. Filter students who scored above 80.
4. Count how many students scored above 80.

In [48]:
# 1. Load data using SparkContext.textFile.
sc = SparkContext("local", "ST2CBD-Lab2-StudentScoresAnalysis")
students_rdd = parse_rdd_csv(f"{DIR_CONTEXT}/6 analysis.csv")

students_rdd.collect()

[['Alice', 'Math', '85'],
 ['Bob', 'Math', '75'],
 ['Charlie', 'Math', '90'],
 ['Diana', 'Math', '88'],
 ['Eve', 'Math', '60']]

In [49]:
# 2. Map each line to a key-value pair (name, score).
students_scores_rdd = students_rdd.map(lambda fields: (fields[0], float(fields[2])))

students_scores_rdd.collect()

[('Alice', 85.0),
 ('Bob', 75.0),
 ('Charlie', 90.0),
 ('Diana', 88.0),
 ('Eve', 60.0)]

We take the first field (name) and the third field (score) -> We create a tuple with the name and score.

In [65]:
# 3. Filter students who scored above 80.
students_above_80 = students_rdd.filter(lambda fields: float(fields[2]) > 80.0) \
    .map(lambda fields: (fields[0], fields[1])) \
    .collect()

students_above_80

[('Alice', 'Math'), ('Charlie', 'Math'), ('Diana', 'Math')]

We filter by score > 80.0 (field n°3) -> We take the first field (name) and the second field (subject).

In [66]:
# 4. Count how many students scored above 80.
count_students_above_80 = students_rdd.filter(lambda fields: float(fields[2]) > 80.0) \
    .count()

print(count_students_above_80)

# or
count_students_above_80 = len(students_above_80)
count_students_above_80

3


3

In [73]:
sc.stop()

# Exercise 7: Word Count from News Articles
Objective: Classic Word Count example.

Tasks:
1. Load the news data file.
2. Split lines into words.
3. Map each word to (word, 1).
4. Use reduceByKey to get word counts

In [74]:
# 1. Load the news data file.
sc = SparkContext("local", "ST2CBD-Lab2-WordCount")
news_rdd = sc.textFile(f"{DIR_CONTEXT}/7 wordcount.csv", 0)

news_rdd.collect()

['Spark is fast. Big data is booming. Spark handles big data with ease.']

In [91]:
# 2. Split lines into words.
words_rdd = news_rdd.flatMap(lambda line: line.replace('.', '').split(" "))

words_rdd.collect()

['Spark',
 'is',
 'fast',
 'Big',
 'data',
 'is',
 'booming',
 'Spark',
 'handles',
 'big',
 'data',
 'with',
 'ease']

Split words by space

In [93]:
# 3. Map each word to (word, 1).
word_pairs_rdd = words_rdd.map(lambda word: (word, 1))

word_pairs_rdd.collect()

[('Spark', 1),
 ('is', 1),
 ('fast', 1),
 ('Big', 1),
 ('data', 1),
 ('is', 1),
 ('booming', 1),
 ('Spark', 1),
 ('handles', 1),
 ('big', 1),
 ('data', 1),
 ('with', 1),
 ('ease', 1)]

We take each word and create a tuple with the word and 1 (1 occurrence).

In [94]:
# 4. Use reduceByKey to get word counts
word_counts_rdd = word_pairs_rdd.reduceByKey(lambda a, b: a + b)

word_counts_rdd.collect()

[('Spark', 2),
 ('is', 2),
 ('fast', 1),
 ('Big', 1),
 ('data', 2),
 ('booming', 1),
 ('handles', 1),
 ('big', 1),
 ('with', 1),
 ('ease', 1)]

In [96]:
sc.stop()

# Exercise 8: Product Sales Tracker
Objective: Group and aggregate product sales.

Tasks:
1. Load product sales CSV.
2. Map to (product, revenue).
3. Use reduceByKey to get total revenue.
4. Filter products with revenue > 1000.

In [97]:
# 1. Load product sales CSV.
sc = SparkContext("local", "ST2CBD-Lab2-ProductSalesTracker")
product_sales_rdd = parse_rdd_csv(f"{DIR_CONTEXT}/8 product.csv")

product_sales_rdd.collect()

[['1', 'Mobile', '100', '10'],
 ['2', 'Laptop', '10', '900'],
 ['3', 'Mouse', '200', '5'],
 ['4', 'Keyboard', '150', '10']]

In [105]:
# 2. Map to (product, revenue).
product_revenue_rdd = product_sales_rdd.map(lambda fields: (fields[1], float(fields[2]) * float(fields[3])))

product_revenue_rdd.collect()

[('Mobile', 1000.0),
 ('Laptop', 9000.0),
 ('Mouse', 1000.0),
 ('Keyboard', 1500.0)]

We take the second field (product) and the third field (revenue) -> We create a tuple with the product and revenue.

In [108]:
# 3. Use reduceByKey to get total revenue.
total_revenue_rdd = product_revenue_rdd.reduceByKey(lambda a, b: a + b) # ??

total_revenue_rdd.collect()

# Getwhole total revenue
total_revenue = total_revenue_rdd.map(lambda fields: fields[1]).sum()

total_revenue

12500.0

In this case, the usage of reduceByKey is not necessary since there's no duplicate keys.

To get the total revenue, we can simply sum values.

In [109]:
# 4. Filter products with revenue > 1000.
filtered_revenue_rdd = total_revenue_rdd.filter(lambda fields: fields[1] > 1000)

filtered_revenue_rdd.collect()

[('Laptop', 9000.0), ('Keyboard', 1500.0)]

We filter the products with revenue > 1000.

In [111]:
sc.stop()

# Exercise 9: Temperature Data Monitoring
Objective: Work with sensor data and aggregate.

Tasks:
1. Load temperature log.
2. Extract date and temperature.
3. Calculate daily average temperature.
4. Filter days with average temperature > 30°C.

In [112]:
# 1. Load temperature log.
sc = SparkContext("local", "ST2CBD-Lab2-TemperatureDataMonitoring")
temperature_rdd = parse_rdd_csv(f"{DIR_CONTEXT}/9 temp.csv")

temperature_rdd.collect()

[['2025-05-01', 'S1', '32'],
 ['2025-05-01', 'S2', '31'],
 ['2025-05-02', 'S1', '28'],
 ['2025-05-02',
  'S2',
  '30                                                     ']]

In [114]:
# 2. Extract date and temperature.
temperature_data_rdd = temperature_rdd.map(lambda fields: (fields[0], float(fields[2])))

temperature_data_rdd.collect()

[('2025-05-01', 32.0),
 ('2025-05-01', 31.0),
 ('2025-05-02', 28.0),
 ('2025-05-02', 30.0)]

We take the first field (date) and the third field (temperature) -> We create a tuple with the date and temperature.

In [115]:
# 3. Calculate daily average temperature.
daily_avg_temp_rdd = temperature_data_rdd.groupByKey() \
    .mapValues(lambda temps: sum(temps) / len(temps))

daily_avg_temp_rdd.collect()

[('2025-05-01', 31.5), ('2025-05-02', 29.0)]

We use groupByKey to group by date -> We calculate the average temperature for each date.

In [None]:
# 4. Filter days with average temperature > 30°C.
filtered_temp_rdd = daily_avg_temp_rdd.filter(lambda fields: fields[1] > 30)

filtered_temp_rdd.collect()

We take the average temperature (produced previously) and filter the days with average temperature > 30°C.

In [None]:
sc.stop()