In [1]:
from pyspark.sql import SparkSession

import os

# JDBC endpoint of the PostgreSQL database the notebook reads from.
jdbc_url = "jdbc:postgresql://192.168.20.11:5432/demo_db"

# Connection properties handed to Spark's JDBC reader.
# Credentials may be supplied through the DEMO_DB_USER / DEMO_DB_PASSWORD
# environment variables so real secrets never have to be committed with the
# notebook; when they are unset the original values are used.
properties = {
    "user": os.environ.get("DEMO_DB_USER", "postgres"),
    "password": os.environ.get("DEMO_DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000",  # JDBC fetch size: rows requested per round trip
}

# Local path to the PostgreSQL JDBC driver jar. The jar is published by the
# PostgreSQL JDBC project (jdbc.postgresql.org), not by Oracle.
# Raw string: "\p" is not a valid escape sequence, and a plain literal
# triggers a SyntaxWarning on recent Python versions.
postgres_driver_path = r"C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """Read one PostgreSQL table into a Spark DataFrame over JDBC.

    Args:
        jdbc_url: JDBC connection URL passed to the reader.
        table_name: Name of the table to load.
        properties: Dict of JDBC connection properties (user, password,
            driver, ...), forwarded unchanged to ``spark.read.jdbc``.
        postgres_driver_path: Path to the JDBC driver jar, registered with
            the session through ``spark.jars``.

    Returns:
        A ``(DataFrame, SparkSession)`` tuple: the DataFrame for the table
        and the session it was read with.
    """
    # Session settings, applied in this order before the session is obtained.
    # NOTE(review): getOrCreate() may hand back an already-running session,
    # in which case these settings may not take effect — confirm if the
    # notebook is re-run without restarting the kernel.
    session_settings = [
        ("spark.jars", postgres_driver_path),
        ("spark.driver.memory", "8g"),
        ("spark.executor.memory", "8g"),
        ("spark.executor.memoryOverhead", "2g"),
        ("spark.driver.memoryOverhead", "2g"),
    ]

    builder = SparkSession.builder.appName("Postgres Connection")
    for setting_key, setting_value in session_settings:
        builder = builder.config(setting_key, setting_value)
    session = builder.getOrCreate()

    # Load the requested table through the JDBC data source.
    table_df = session.read.jdbc(url=jdbc_url, table=table_name, properties=properties)

    return table_df, session

# Load the source table a single time; `raw_df` and `spark` are reused by
# the cells that follow.
raw_df, spark = extract(
    jdbc_url=jdbc_url,
    table_name="filtered_data_with_id",
    properties=properties,
    postgres_driver_path=postgres_driver_path,
)

# Report how many rows the table holds
row_count = raw_df.count()
print(f"Number of rows: {row_count}")

# Show column names, types and nullability
raw_df.printSchema()

Number of rows: 1999679
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = true)



In [None]:
from pyspark.sql.functions import split, col, trim, when

# Unique school-year labels present in the raw data
distinct_years_df = raw_df.select("schoolyear").distinct()

# Display every distinct label without truncating the values
distinct_years_df.show(truncate=False)

# Integer starting year = the text before the first "-" in the label
# (labels are expected to look like "2021-2022")
start_year_col = split(col("schoolyear"), "-")[0].cast("int")
distinct_years_df = distinct_years_df.withColumn("start_year", start_year_col)

# No lower bound on start_year is applied here: an earlier ">= 2006" filter
# is disabled, so every distinct year is kept and only sorted chronologically.
valid_years_df = distinct_years_df.orderBy("start_year")

# Pull the ordered labels back to the driver as a plain Python list
valid_schoolyears = [record.schoolyear for record in valid_years_df.collect()]

print("Valid schoolyears:", valid_schoolyears)


+----------+
|schoolyear|
+----------+
|2022-2023 |
|2021-2022 |
|2011-2012 |
|2012-2013 |
|2013-2014 |
|2016-2017 |
|2010-2011 |
|2014-2015 |
|2007-2008 |
|2006-2007 |
|2017-2018 |
|2019-2020 |
|2018-2019 |
|2023-2024 |
|2009-2010 |
|2024-2025 |
|2008-2009 |
|2020-2021 |
|2015-2016 |
+----------+

Valid schoolyears: ['2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


In [16]:
from pyspark.sql.functions import split, col, when, concat_ws

# Semesters kept in the analysis
semesters = ["FIRST", "SECOND", "SUMMER"]

# Row predicate: known school year, known semester, and a numeric grade present
keep_row = (
    col("schoolyear").isin(valid_schoolyears)
    & col("semester").isin(semesters)
    & col("grade_numeric").isNotNull()
)

# Sort key for semesters so they order FIRST < SECOND < SUMMER rather than
# alphabetically
sem_rank = (
    when(col("semester") == "FIRST", 1)
    .when(col("semester") == "SECOND", 2)
    .when(col("semester") == "SUMMER", 3)
)

# Filtered rows plus three helper columns:
#   start_year - integer before the first "-" of schoolyear (e.g. "2021-2022" -> 2021)
#   sem_order  - numeric semester rank defined above
#   yearsem    - schoolyear and semester joined with "-" (e.g. "2021-2022-FIRST")
filtered_df = (
    raw_df.filter(keep_row)
    .withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))
    .withColumn("sem_order", sem_rank)
    .withColumn("yearsem", concat_ws("-", col("schoolyear"), col("semester")))
)

# Keep only the reporting columns, ordered per student chronologically and
# then by course description. The sort keys start_year and sem_order are not
# among the selected columns; they are taken from filtered_df.
output_df = (
    filtered_df
    .select("srcode", "yearsem", "description", "grade_numeric")
    .orderBy("srcode", "start_year", "sem_order", "description")
)

# Preview the first 500 rows without truncating long descriptions
output_df.show(500, truncate=False)

+------+----------------+-----------------------------------------------------------------------------------------------+-------------+
|srcode|yearsem         |description                                                                                    |grade_numeric|
+------+----------------+-----------------------------------------------------------------------------------------------+-------------+
|100005|2020-2021-FIRST |Educational Innovations and Technology                                                         |1.00         |
|100005|2020-2021-FIRST |Educational Leadership (with Ethical, Professional and Spiritual Principles)                   |1.25         |
|100005|2020-2021-FIRST |Financial Management of Educational Institutions in the Philippines                            |1.00         |
|100005|2020-2021-SECOND|Educational Administration and Supervision (with Educational Philosophy and Legislations)      |1.00         |
|100005|2020-2021-SECOND|Human and Material Reso

In [17]:
from pyspark.sql.types import DecimalType

# Target type of the derived percentage columns: DecimalType(5, 2)
decimal_type = DecimalType(5, 2)

# grade_numeric value -> (min, mid, max) percentage equivalents.
# A single table replaces three hand-copied when() chains, so each grade's
# three numbers sit side by side and a new grade is added in one place.
GRADE_PERCENT_BANDS = {
    1.00: (98, 99, 100),
    1.25: (94, 95.5, 97),
    1.50: (90, 91.5, 93),
    1.75: (88, 88.5, 89),
    2.00: (85, 86, 87),
    2.25: (83, 83.5, 84),
    2.50: (80, 81, 82),
    2.75: (78, 78.5, 79),
    3.00: (75, 76, 77),
    5.00: (74, 74, 74),
}


def _band_column(position):
    """Build the when/otherwise column for one slot of GRADE_PERCENT_BANDS.

    Args:
        position: Index into each band tuple (0 = min, 1 = mid, 2 = max).

    Returns:
        A Column that yields the band value whose grade equals
        ``grade_numeric``, or null when no grade in the table matches,
        cast to ``decimal_type``.
    """
    chain = None
    for grade, band in GRADE_PERCENT_BANDS.items():
        matches = col("grade_numeric") == grade
        # The first branch starts the chain with functions.when(); every
        # later branch extends it with Column.when().
        chain = when(matches, band[position]) if chain is None else chain.when(matches, band[position])
    return chain.otherwise(None).cast(decimal_type)


# Attach the three percentage columns to the ordered grade listing
final_df = (
    output_df
    .withColumn("min", _band_column(0))
    .withColumn("mid", _band_column(1))
    .withColumn("max", _band_column(2))
)

# Show results and schema
final_df.show()
final_df.printSchema()

+------+----------------+--------------------+-------------+-----+-----+------+
|srcode|         yearsem|         description|grade_numeric|  min|  mid|   max|
+------+----------------+--------------------+-------------+-----+-----+------+
|100005| 2020-2021-FIRST|Educational Innov...|         1.00|98.00|99.00|100.00|
|100005| 2020-2021-FIRST|Educational Leade...|         1.25|94.00|95.50| 97.00|
|100005| 2020-2021-FIRST|Financial Managem...|         1.00|98.00|99.00|100.00|
|100005|2020-2021-SECOND|Educational Admin...|         1.00|98.00|99.00|100.00|
|100005|2020-2021-SECOND|Human and Materia...|         1.00|98.00|99.00|100.00|
|100005|2020-2021-SECOND|Supervision of In...|         1.25|94.00|95.50| 97.00|
|100005| 2021-2022-FIRST|Comparative Educa...|         1.00|98.00|99.00|100.00|
|100005| 2021-2022-FIRST|Educational Manag...|         1.00|98.00|99.00|100.00|
|100005|2021-2022-SECOND|Educational Measu...|         1.00|98.00|99.00|100.00|
|100005|2021-2022-SECOND|Research Method