In [3]:
import pyspark
from pyspark.sql import SparkSession

# Redis connection settings, handed to Spark below as spark.redis.* options.
redis_host = "redis"
redis_port = "6379"

# Build (or reuse) a local Spark session with the Redis host/port and the
# spark-redis package coordinates set as session configuration.
spark = (
    SparkSession.builder
    .master("local")
    .appName('jupyter-pyspark')
    .config("spark.redis.host", redis_host)
    .config("spark.redis.port", redis_port)
    .config("spark.jars.packages", "com.redislabs:spark-redis_2.12:3.0.0")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict its log level to ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

# Echo the connection settings so the cell output records what was used.
print(f"host {redis_host}, port {redis_port}")


host redis, port 6379


In [9]:
# 7. Use Spark to load the exam scores data set `/home/jovyan/datasets/exam-scores/*.csv`
# into Redis under the namespace examscores. Use Spark to demonstrate the
# data is there by querying it back out.

# Glob over every CSV file in the exam-scores folder on the local file system.
file_path = "file:///home/jovyan/datasets/exam-scores/*.csv"

# Read with the first line as the header and let Spark infer column types.
exam_scores = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the inferred schema, then the path that was read.
exam_scores.printSchema()
print(file_path)

root
 |-- Class_Section: string (nullable = true)
 |-- Exam_Version: string (nullable = true)
 |-- Completion_Time: integer (nullable = true)
 |-- Made_Own_Study_Guide: string (nullable = true)
 |-- Did_Exam_Prep Assignment: string (nullable = true)
 |-- Studied_In_Groups: string (nullable = true)
 |-- Student_Score: integer (nullable = true)
 |-- Percentage: string (nullable = true)
 |-- Letter_Grade: string (nullable = true)

file:///home/jovyan/datasets/exam-scores/*.csv


In [17]:
# Looking for a column combination that could serve as a unique row key.
# Counts noted during exploration:
#   exam_scores.count()                                                    -> 65
#   exam_scores.select("Class_Section", "Exam_Version").distinct().count() -> 8
# (Class_Section, Exam_Version) has fewer distinct values than there are rows,
# so that pair cannot identify a row uniquely.

# Register the CSV-backed DataFrame as the temp view "examscores".
exam_scores.createOrReplaceTempView("examscores")
# Last expression of the cell: convert to pandas so the notebook renders a table.
exam_scores.toPandas()

Unnamed: 0,Class_Section,Exam_Version,Completion_Time,Made_Own_Study_Guide,Did_Exam_Prep Assignment,Studied_In_Groups,Student_Score,Percentage,Letter_Grade
0,M02,A,20,N,N,N,16,53.30%,D
1,M02,A,25,?,?,?,17,56.70%,D
2,M02,A,30,N,Y,Y,24,80.00%,B
3,M02,A,35,?,?,?,22,73.30%,C+
4,M02,A,40,?,?,?,27,90.00%,A-
...,...,...,...,...,...,...,...,...,...
60,M01,D,40,?,?,?,20,66.70%,C
61,M01,D,55,Y,N,N,20,66.70%,C
62,M01,D,60,Y,Y,N,19,63.30%,C-
63,M01,D,60,Y,N,Y,21,70.00%,C+


In [18]:
# Persist the exam scores to Redis through the spark-redis data source,
# replacing whatever is already stored under the "examscores" table.
(
    exam_scores.write
    .format("org.apache.spark.sql.redis")
    .mode("overwrite")
    .option("table", "examscores")
    .save()
)

# To inspect the result from the Redis command line:
#   keys examscores:*                   -> lists the keys written for the table
#   hgetall examscores:<one-of-those>   -> shows every field of a single row

In [21]:
# Load the "examscores" table back out of Redis into a new DataFrame.
exam_scores2 = (
    spark.read
    .format("org.apache.spark.sql.redis")
    .option("table", "examscores")
    .load()
)

In [22]:
# Render the Redis-loaded frame as a pandas table to confirm the data round-tripped.
# Rows come back in a different order than the CSV-backed frame was displayed in.
exam_scores2.toPandas()

Unnamed: 0,Class_Section,Exam_Version,Completion_Time,Made_Own_Study_Guide,Did_Exam_Prep Assignment,Studied_In_Groups,Student_Score,Percentage,Letter_Grade
0,M02,A,50,?,?,?,27,90.00%,A-
1,M01,D,60,?,?,?,26,86.70%,B+
2,M02,B,60,N,N,Y,22,73.30%,C+
3,M01,D,40,?,?,?,20,66.70%,C
4,M01,A,30,Y,Y,Y,30,100.00%,A
...,...,...,...,...,...,...,...,...,...
60,M01,B,20,N,N,Y,13,43.30%,F
61,M01,D,60,Y,Y,N,19,63.30%,C-
62,M02,A,40,?,?,?,27,90.00%,A-
63,M01,C,45,Y,N,N,22,73.30%,C+


In [25]:
# In Spark SQL, read the Redis examscores data into a temp view and get the min,
# max, and average exam score across all students. Write the data back out to
# Redis as examscoresummary; finally query the key in Redis showing all
# values in the hash!

# Read the scores back from Redis (rather than reusing the CSV-backed frame) so
# the summary is computed from the data actually persisted under "examscores".
# A dedicated view name keeps the CSV-backed "examscores" temp view untouched.
(
    spark.read
    .format("org.apache.spark.sql.redis")
    .option("table", "examscores")
    .load()
    .createOrReplaceTempView("examscores_redis")
)

# Aggregating in Spark SQL needs no wildcard import of pyspark.sql.functions,
# which would shadow Python's built-in min/max for the rest of the notebook.
exam_summary = spark.sql("""
    SELECT MIN(Student_Score) AS min_score,
           AVG(Student_Score) AS avg_score,
           MAX(Student_Score) AS max_score
    FROM examscores_redis
""")
exam_summary.show()

+---------+-----------------+---------+
|min_score|        avg_score|max_score|
+---------+-----------------+---------+
|       13|22.73846153846154|       30|
+---------+-----------------+---------+



In [26]:
# Write the one-row summary to Redis under the "examsummary" table, replacing
# any previous contents.
# NOTE(review): the exercise text asks for "examscoresummary"; the table name
# used here and in the Redis CLI notes is "examsummary" — confirm which is wanted.
(
    exam_summary.write
    .format("org.apache.spark.sql.redis")
    .mode("overwrite")
    .option("table", "examsummary")
    .save()
)

In [None]:
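# In the Redis CLI, list the keys written for the summary table:
#   keys examsummary:*
# Then show every field of the summary hash, substituting a key from that listing:
#   hgetall examsummary:<row-id>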