EXTRACT DATA


In [1]:
from pyspark.sql import SparkSession

import os

# Connection settings are read from environment variables so that credentials,
# hosts and machine-specific paths do not have to live in the notebook itself.
# The fallbacks reproduce the original local-development values; set the
# variables below to override them.
jdbc_url = os.environ.get(
    "PG_JDBC_URL", "jdbc:postgresql://192.168.20.11:5432/demo_db"
)
properties = {
    "user": os.environ.get("PG_USER", "postgres"),
    "password": os.environ.get("PG_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    # Passed through to the JDBC reader as its fetch-size option.
    "fetchsize": "10000",
}
# Location of the PostgreSQL JDBC driver jar handed to Spark via `spark.jars`.
postgres_driver_path = os.environ.get(
    "POSTGRES_JDBC_JAR", "C:/Users/denve/Downloads/postgresql-42.7.5.jar"
)

def extract(jdbc_url, table_name, properties, postgres_driver_path,
            column=None, lower_bound=None, upper_bound=None, num_partitions=None):
    """Read a PostgreSQL table into a Spark DataFrame over JDBC.

    Args:
        jdbc_url: JDBC connection URL of the database.
        table_name: Name of the table to read.
        properties: Dict of JDBC connection properties (user, password,
            driver, ...), passed to the reader unchanged.
        postgres_driver_path: Path to the PostgreSQL JDBC driver jar; it is
            set as ``spark.jars`` on the session builder.
        column: Optional column to split the read on. When given,
            ``lower_bound``, ``upper_bound`` and ``num_partitions`` must be
            given too, and the read is split into ``num_partitions`` parallel
            queries. When omitted (the default) the table is read exactly as
            before, through a single unpartitioned query.
        lower_bound: Lower bound of ``column`` used to compute partition strides.
        upper_bound: Upper bound of ``column`` used to compute partition strides.
        num_partitions: Number of partitions for the partitioned read.

    Returns:
        tuple: ``(df, spark)`` - the DataFrame for the table and the
        SparkSession that produced it.

    Raises:
        ValueError: If only some of the four partitioning arguments are given.
    """
    partition_spec = {
        "column": column,
        "lowerBound": lower_bound,
        "upperBound": upper_bound,
        "numPartitions": num_partitions,
    }
    supplied = [name for name, value in partition_spec.items() if value is not None]
    # Validate before starting Spark so a bad call fails fast.
    if supplied and len(supplied) != len(partition_spec):
        raise ValueError(
            "A partitioned read needs column, lower_bound, upper_bound and "
            "num_partitions together; only got: " + ", ".join(supplied)
        )

    # Build the session, or get the active one.
    # NOTE(review): getOrCreate() hands back an already-running session when
    # there is one; the memory settings below presumably only take effect when
    # this call is what starts the session - confirm for your environment.
    spark = (
        SparkSession.builder
        .appName("Postgres Connection")
        .config("spark.jars", postgres_driver_path)
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "8g")
        .config("spark.executor.memoryOverhead", "2g")
        .config("spark.driver.memoryOverhead", "2g")
        .getOrCreate()
    )

    read_kwargs = {"url": jdbc_url, "table": table_name, "properties": properties}
    if supplied:
        read_kwargs.update(partition_spec)

    # Read the table through the JDBC data source.
    df = spark.read.jdbc(**read_kwargs)

    return df, spark

In [None]:
# Load the raw grades table a single time; raw_df and spark are the names the
# rest of the notebook works with.
raw_df, spark = extract(
    jdbc_url=jdbc_url,
    table_name="raw_student_grades",
    properties=properties,
    postgres_driver_path=postgres_driver_path,
)


CHECK FOR SPECIAL CHARACTERS AND CATEGORICAL DATA IN grade_final COLUMN

In [5]:
# Register the frame as a SQL view so it can be queried by name.
raw_df.createOrReplaceTempView('raw_student_grades')

# Count rows per distinct grade_final value, sorted by grade_final descending.
grade_final_counts_sql = 'SELECT grade_final, COUNT(*) AS occurences FROM raw_student_grades GROUP BY grade_final ORDER BY grade_final DESC'
show_data = spark.sql(grade_final_counts_sql)
show_data.show(n=22)

NameError: name 'raw_df' is not defined

TRIM LEADING AND TRAILING WHITESPACE FROM grade_final AND grade_reexam

In [459]:
from pyspark.sql.functions import trim

# Strip surrounding whitespace from both grade columns, overwriting each in place.
for grade_column in ('grade_final', 'grade_reexam'):
    raw_df = raw_df.withColumn(grade_column, trim(grade_column))


In [460]:
# Refresh the SQL view so it points at the current raw_df.
raw_df.createOrReplaceTempView('raw_student_grades')

# Count rows per distinct grade_final value, sorted by grade_final descending.
grade_final_counts_sql = "SELECT grade_final, COUNT(*) AS occurences FROM raw_student_grades GROUP BY grade_final ORDER BY grade_final DESC"
show_data = spark.sql(grade_final_counts_sql)
show_data.show(n=22)

+-----------+----------+
|grade_final|occurences|
+-----------+----------+
|        inc|         1|
|     Passed|       255|
|          P|        27|
|         OG|      1581|
|        INC|     60401|
|          F|         1|
|        DRP|      2907|
|       5.00|     15360|
|       4.00|       142|
|       3.50|         1|
|       3.00|    118143|
|       2.75|     90548|
|       2.50|    177290|
|       2.25|    160248|
|       2.00|    289306|
|       1.75|    259871|
|       1.50|    452504|
|       1.25|    317692|
|       1.00|     63427|
|         --|        14|
|          -|        21|
+-----------+----------+



In [461]:
# For rows whose grade_final is '--', count each grade_reexam value
# (ascending by count).
double_dash_sql = "SELECT grade_final, grade_reexam, COUNT(*) AS occurences FROM raw_student_grades WHERE grade_final = '--' GROUP BY grade_final, grade_reexam ORDER BY occurences"
show_data = spark.sql(double_dash_sql)
show_data.show()

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|         --|          --|        14|
+-----------+------------+----------+



In [462]:
# For rows whose grade_final is '-', count each grade_reexam value
# (ascending by count).
single_dash_sql = "SELECT grade_final, grade_reexam, COUNT(*) AS occurences FROM raw_student_grades WHERE grade_final = '-' GROUP BY grade_final, grade_reexam ORDER BY occurences"
show_data = spark.sql(single_dash_sql)
show_data.show()

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|          -|        1.00|         1|
|          -|        1.50|         1|
|          -|        1.25|         1|
|          -|        2.00|         1|
|          -|           -|        17|
+-----------+------------+----------+



ASSIGN NUMERIC VALUES TO THE CATEGORICAL DATA (DRP = 0, INC = -1, OG = -2, P = -3, Passed = -4, F = -5)

In [463]:
from pyspark.sql.functions import when, col, upper

# Replace each categorical grade label (matched case-insensitively) with its
# numeric code and let every other value pass through unchanged.
#
# The codes are written as strings on purpose: the fallback branch is the
# string column grade_final, so the resulting column is a string column either
# way. Using string codes avoids mixing integer literals with a string column
# and therefore does not depend on Spark's implicit type-coercion rules, which
# differ between ANSI and non-ANSI mode.
grade_codes = {
    "DRP": "0",
    "INC": "-1",
    "OG": "-2",
    "P": "-3",
    "PASSED": "-4",
    "F": "-5",
}

# Upper-case once so 'inc' and 'INC' (or 'Passed' and 'PASSED') share a code.
normalized_grade = upper(col('grade_final'))

coded_grade = None
for label, code in grade_codes.items():
    matches_label = normalized_grade == label
    coded_grade = (
        when(matches_label, code)
        if coded_grade is None
        else coded_grade.when(matches_label, code)
    )

raw_df = raw_df.withColumn(
    'new_grade_final',
    coded_grade.otherwise(col('grade_final')),
)

In [464]:
# Refresh the SQL view so it includes the new_grade_final column.
raw_df.createOrReplaceTempView('raw_student_grades')

# Count rows per new_grade_final value, most frequent first.
new_grade_counts_sql = "SELECT new_grade_final, COUNT(*) AS occurences FROM raw_student_grades GROUP BY new_grade_final ORDER BY occurences DESC"
show_data = spark.sql(new_grade_counts_sql)
show_data.show()

+---------------+----------+
|new_grade_final|occurences|
+---------------+----------+
|           1.50|    452504|
|           1.25|    317692|
|           2.00|    289306|
|           1.75|    259871|
|           2.50|    177290|
|           2.25|    160248|
|           3.00|    118143|
|           2.75|     90548|
|           1.00|     63427|
|             -1|     60402|
|           5.00|     15360|
|              0|      2907|
|             -2|      1581|
|             -4|       255|
|           4.00|       142|
|             -3|        27|
|              -|        21|
|             --|        14|
|           3.50|         1|
|             -5|         1|
+---------------+----------+



In [465]:
# For rows whose grade_final is '-', count each (new_grade_final, grade_reexam,
# status) combination, least frequent first.
single_dash_status_sql = "SELECT new_grade_final, grade_reexam, status, COUNT(*) AS occurences FROM raw_student_grades WHERE grade_final = '-' GROUP BY new_grade_final, grade_reexam, status ORDER BY occurences ASC"
show_data = spark.sql(single_dash_status_sql)
show_data.show()

+---------------+------------+------+----------+
|new_grade_final|grade_reexam|status|occurences|
+---------------+------------+------+----------+
|              -|        1.25|PASSED|         1|
|              -|        2.00|PASSED|         1|
|              -|        1.50|PASSED|         1|
|              -|        1.00|PASSED|         1|
|              -|           -|FAILED|        17|
+---------------+------------+------+----------+



In [466]:
# For rows whose new_grade_final is '--' (or ' -- '), count each grade_reexam
# value, least frequent first.
double_dash_new_grade_sql = "SELECT new_grade_final, grade_reexam, COUNT(*) AS occurences FROM raw_student_grades WHERE new_grade_final = '--' OR new_grade_final = ' -- ' GROUP BY new_grade_final, grade_reexam ORDER BY occurences ASC"
show_data = spark.sql(double_dash_new_grade_sql)
show_data.show()

+---------------+------------+----------+
|new_grade_final|grade_reexam|occurences|
+---------------+------------+----------+
|             --|          --|        14|
+---------------+------------+----------+



FILTER OUT ROWS WHERE BOTH new_grade_final AND grade_reexam ARE "-", OR BOTH ARE "--"

In [468]:
# A row is kept unless new_grade_final and grade_reexam are both "-" or both "--".
# Each condition below is the De Morgan form of "not (both equal the placeholder)".
not_single_dash_pair = (col('new_grade_final') != "-") | (col('grade_reexam') != '-')
not_double_dash_pair = (col('new_grade_final') != "--") | (col('grade_reexam') != '--')

raw_df = raw_df.filter(not_single_dash_pair & not_double_dash_pair)


In [469]:
# Refresh the SQL view so it points at the filtered raw_df.
raw_df.createOrReplaceTempView('raw_student_grades')

# Count rows per new_grade_final value, most frequent first.
new_grade_counts_sql = "SELECT new_grade_final, COUNT(*) AS occurences FROM raw_student_grades GROUP BY new_grade_final ORDER BY occurences DESC"
show_data = spark.sql(new_grade_counts_sql)
show_data.show()

+---------------+----------+
|new_grade_final|occurences|
+---------------+----------+
|           1.50|    452504|
|           1.25|    317692|
|           2.00|    289306|
|           1.75|    259871|
|           2.50|    177290|
|           2.25|    160248|
|           3.00|    118143|
|           2.75|     90548|
|           1.00|     63427|
|             -1|     60402|
|           5.00|     15360|
|              0|      2907|
|             -2|      1581|
|             -4|       255|
|           4.00|       142|
|             -3|        27|
|              -|         4|
|           3.50|         1|
|             -5|         1|
+---------------+----------+



In [470]:
# For the remaining rows whose grade_final is '-', count each (new_grade_final,
# grade_reexam, status) combination, least frequent first.
remaining_dash_sql = "SELECT new_grade_final, grade_reexam, status, COUNT(*) AS occurences FROM raw_student_grades WHERE grade_final = '-' GROUP BY new_grade_final, grade_reexam, status ORDER BY occurences ASC"
show_data = spark.sql(remaining_dash_sql)
show_data.show()

+---------------+------------+------+----------+
|new_grade_final|grade_reexam|status|occurences|
+---------------+------------+------+----------+
|              -|        1.25|PASSED|         1|
|              -|        2.00|PASSED|         1|
|              -|        1.50|PASSED|         1|
|              -|        1.00|PASSED|         1|
+---------------+------------+------+----------+



In [471]:
# Recode the rows whose grade_final is still "-" to the code -1; every other
# row keeps its current new_grade_final.
#
# The code is written as the string "-1" because new_grade_final holds strings
# at this point, so no implicit int/string coercion is involved. "-" has no
# upper/lower case, so it is compared directly instead of through upper().
is_lone_dash = col('grade_final') == "-"

raw_df = raw_df.withColumn(
    'new_grade_final',
    when(is_lone_dash, "-1").otherwise(col('new_grade_final')),
)

In [474]:
# Refresh the SQL view so it points at the current raw_df.
raw_df.createOrReplaceTempView('raw_student_grades')

# Count rows per new_grade_final value, most frequent first.
new_grade_counts_sql = """
    SELECT new_grade_final, COUNT(*) AS occurrences 
    FROM raw_student_grades 
    GROUP BY new_grade_final 
    ORDER BY occurrences DESC
"""
show_data = spark.sql(new_grade_counts_sql)
show_data.show()


+---------------+-----------+
|new_grade_final|occurrences|
+---------------+-----------+
|           1.50|     452504|
|           1.25|     317692|
|           2.00|     289306|
|           1.75|     259871|
|           2.50|     177290|
|           2.25|     160248|
|           3.00|     118143|
|           2.75|      90548|
|           1.00|      63427|
|             -1|      60406|
|           5.00|      15360|
|              0|       2907|
|             -2|       1581|
|             -4|        255|
|           4.00|        142|
|             -3|         27|
|           3.50|          1|
|             -5|          1|
+---------------+-----------+



CAST new_grade_final FROM STRING TO DOUBLE

In [477]:
# Swap the column for its double-typed version, keeping the same column name.
grade_as_double = col("new_grade_final").cast("double")
raw_df = raw_df.withColumn("new_grade_final", grade_as_double)

In [478]:
# Print each column's name, type and nullability, e.g. to check the type of
# new_grade_final.
raw_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: integer (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: integer (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- new_grade_final: double (nullable = true)



In [1]:
# Preview the first 20 rows with long cell values truncated (the show() defaults,
# spelled out).
raw_df.show(n=20, truncate=True)

NameError: name 'raw_df' is not defined