- FILTER SEMESTER (FIRST, SECOND, SUMMER, SUMMER2)
- grade_final should only have numerical values such as (1.00 -5.00 excluding 3.50), INC, and DRP.
- create new column, numerical_grade (Ilipat ang grade_reexam to grade_finals if INC). 

COLUMNS TO BE CLEANED:
- grade_reexam
- grade_finals

CLEANED / FILTERED
- semester 
- schoolyear (with _x)
- drop schoolyear without - EXAMPLE ("2004")

EXTRACT DATA

In [205]:
from pyspark.sql import SparkSession

jdbc_url = "jdbc:postgresql://192.168.20.11:5432/demo_db"
#jdbc_url = "jdbc:postgresql://localhost:5432/local_student_grades"
properties = {
    "user": "postgres", 
    "password": "postgres",  
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000"
}
postgres_driver_path = "C:\postgresql-42.7.5.jar"



def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """ Extract data from PostgreSQL database using Spark."""
    # Initialize Spark session
    spark = SparkSession.builder \
        .appName("Postgres Connection") \
        .config("spark.jars", postgres_driver_path) \
        .config("spark.driver.memory", "8g") \
        .config("spark.executor.memory", "8g") \
        .config("spark.executor.memoryOverhead", "2g") \
        .config("spark.driver.memoryOverhead", "2g") \
        .getOrCreate()

    # Extracts data from PostgreSQL database
    df = spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=properties,
    )
    
    return df, spark

In [206]:
# Extract data once
raw_df, spark = extract(jdbc_url, "grades_with_updated_id", properties, postgres_driver_path)

# Check the number of rows
row_count = raw_df.count()
print(f'Number of rows: {row_count}')

# Print schema
raw_df.printSchema()

Number of rows: 2013589
root
 |-- id: long (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- yearlevel: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- class_section: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)



In [207]:
raw_df.show()

+-------+----------+--------+--------+--------------------+-----+-------------+------------------+------+----------------+-------+--------------------+--------------------+---------+----------+-------------+-----------+------------+------+
|     id|schoolyear|semester|    code|         description|units|instructor_id|   instructor_name|srcode|        fullname| campus|             program|               major|yearlevel|curriculum|class_section|grade_final|grade_reexam|status|
+-------+----------+--------+--------+--------------------+-----+-------------+------------------+------+----------------+-------+--------------------+--------------------+---------+----------+-------------+-----------+------------+------+
|1987625| 2022-2023|  SECOND| BPO 201|Fundamentals of B...|    3|       -12544|LN-12544, FN-12544| 77590|LN77590, FN77590|NASUGBU|BS Business Admin...|Financial Management|   SECOND| 2018-2019|  FINMGT-2202|       1.50|           -|PASSED|
|1987626| 2022-2023|  SECOND|Fili 102|Fi

In [208]:
total_count = raw_df.count()
print(f"Total instances: {total_count}")

Total instances: 2013589


In [209]:
from pyspark.sql.functions import col, upper

# Uppercase all supposed to be uppercase data just to be sure
df = raw_df.withColumn('grade_final', upper(col('grade_final')))\
                .withColumn('campus', upper(col('campus')))\
                .withColumn('semester', upper(col('semester')))\
                .withColumn('schoolyear', upper(col('schoolyear'))       
)

CHECK FOR TYPO ERROR AND UNWANTED VALUES UNDER semester COLUMN

In [210]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT semester, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY semester""")
show_data.show(22)

+--------+----------+
|semester|occurences|
+--------+----------+
|  SECOND|    771773|
| SUMMER2|        49|
|  SUMMER|     56690|
|   FIRST|   1185077|
+--------+----------+



FILTER TYPO ERRORS AND INSTANCES FOR SECOND SEMESTER, AND REMOVE ROWS WITH INVALID semester VALUES

In [211]:
from pyspark.sql.functions import when

valid_semesters = ["FIRST", "SECOND", "SUMMER", "SUMMER2"]
df = df.withColumn(
    "semester",
    when(df["semester"].isin("SECOND_X", "SECOND SEMESTER"), "SECOND")
    .when(df["semester"].isin(valid_semesters), df["semester"])
    .otherwise(None)  # Assign None to invalid semesters so they can be filtered
).filter(col("semester").isNotNull())  # Remove rows where semester is None (invalid)

In [212]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT semester, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY semester""")
show_data.show()

+--------+----------+
|semester|occurences|
+--------+----------+
|  SECOND|    771773|
| SUMMER2|        49|
|  SUMMER|     56690|
|   FIRST|   1185077|
+--------+----------+



CHECK FOR SPECIAL CHARACTERS AND CATEGORICAL DATA IN grade_final COLUMN

In [213]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY grade_final 
                         ORDER BY grade_final DESC""")
show_data.show(22)

+-----------+----------+
|grade_final|occurences|
+-----------+----------+
|          P|        27|
|         OG|      1831|
|        INC|     60481|
|          F|         1|
|        DRP|      2996|
|       5.00|     15332|
|       4.00|       144|
|       3.50|         1|
|       3.00|    118212|
|       2.75|     90544|
|       2.50|    177264|
|       2.25|    160188|
|       2.00|    289297|
|       1.75|    260043|
|       1.50|    453661|
|       1.25|    319564|
|       1.00|     63967|
|         --|         2|
|          -|        22|
|        -- |        12|
+-----------+----------+



REMOVE WHITE SPACES FROM ALL THE COLUMNS

In [214]:
from pyspark.sql.functions import trim

df = df.withColumn('schoolyear', trim('schoolyear'))\
.withColumn('semester', trim('semester'))\
.withColumn('code', trim('code'))\
.withColumn('description', trim('description'))\
.withColumn('units', trim('units'))\
.withColumn('instructor_id', trim('instructor_id'))\
.withColumn('instructor_name', trim('instructor_name'))\
.withColumn('srcode', trim('srcode'))\
.withColumn('fullname', trim('fullname'))\
.withColumn('campus', trim('campus'))\
.withColumn('program', trim('program'))\
.withColumn('grade_final', trim('grade_final'))\
.withColumn('grade_reexam', trim('grade_reexam'))\
.withColumn('status', trim('status'))



In [215]:
df.createOrReplaceTempView('sample_data_ai_with_id')

show_data = spark.sql("SELECT * FROM sample_data_ai_with_id")
show_data.show(22)

+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+------+--------------------+-----+---------+----------+---------------+-----------+------------+------+
|     id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|campus|             program|major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|
+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+------+--------------------+-----+---------+----------+---------------+-----------+------------+------+
|1998116| 2021-2022|   FIRST|NSTP 111CW|NSTP - Civic Welf...|    3|       -13053|LN-13053, FN-13053| 77892|LN77892, FN77892|    PB|      BS Accountancy| null|    FIRST| 2018-2019|NSTP111CW-MA-14|       1.50|           -|PASSED|
|1998117| 2021-2022|   FIRST|    PE 101|Physical Fitness,...|    2|       -12650|LN-1265

In [216]:
df.createOrReplaceTempView('sample_data_ai_with_id')

show_data = spark.sql("""SELECT schoolyear, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY schoolyear 
                         ORDER BY schoolyear DESC""")

show_data.show(50)

+----------+----------+
|schoolyear|occurences|
+----------+----------+
| 2024-2025|    393107|
| 2023-2024|    699469|
| 2022-2023|    551989|
| 2021-2022|    306594|
| 2020-2021|     46056|
| 2019-2022|         1|
| 2019-2020|     10621|
| 2018-2019|      2690|
| 2017-2018|       894|
| 2016-2017|       700|
| 2015-2016|       744|
| 2014-2015|       287|
| 2013-2014|       106|
| 2012-2013|        99|
| 2011-2012|        58|
| 2010-2011|        36|
| 2009-2010|        21|
| 2008-2009|        18|
| 2007-2008|        21|
| 2006-2007|         2|
| 2005-2006|         3|
| 2004-2005|        23|
|      2004|         1|
| 2003-2004|        21|
| 2002-2003|        12|
| 2001-2002|         4|
| 2000-2001|         8|
| 1999-2000|         4|
+----------+----------+



FILTERING schoolyear AND REMOVE/REPLACE UNWANTED VALUES

In [217]:
from pyspark.sql.functions import regexp_extract, col

# 1. Extract the YYYY-YYYY pattern (handles cases with trailing _X)
df = df.withColumn(
    "schoolyear", regexp_extract(col("schoolyear"), r"(\d{4}-\d{4})", 1)
)

# 2. Filter out rows WITHOUT the YYYY-YYYY pattern
schoolyear_regex = r"^\d{4}-\d{4}$"  # Regex for exact YYYY-YYYY format
df = df.filter(col("schoolyear").rlike(schoolyear_regex))  # Keep only matching rows

In [218]:
df.createOrReplaceTempView('sample_data_ai_with_id')

show_data = spark.sql("""SELECT schoolyear, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY schoolyear 
                         ORDER BY schoolyear DESC""")

show_data.show(50)

+----------+----------+
|schoolyear|occurences|
+----------+----------+
| 2024-2025|    393107|
| 2023-2024|    699469|
| 2022-2023|    551989|
| 2021-2022|    306594|
| 2020-2021|     46056|
| 2019-2022|         1|
| 2019-2020|     10621|
| 2018-2019|      2690|
| 2017-2018|       894|
| 2016-2017|       700|
| 2015-2016|       744|
| 2014-2015|       287|
| 2013-2014|       106|
| 2012-2013|        99|
| 2011-2012|        58|
| 2010-2011|        36|
| 2009-2010|        21|
| 2008-2009|        18|
| 2007-2008|        21|
| 2006-2007|         2|
| 2005-2006|         3|
| 2004-2005|        23|
| 2003-2004|        21|
| 2002-2003|        12|
| 2001-2002|         4|
| 2000-2001|         8|
| 1999-2000|         4|
+----------+----------+



In [219]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, grade_reexam, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY grade_final, grade_reexam 
                         ORDER BY occurences DESC""")
show_data.show(80)

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|       1.50|           -|    453614|
|       1.25|           -|    319500|
|       2.00|           -|    289276|
|       1.75|           -|    260019|
|       2.50|           -|    177235|
|       2.25|           -|    160176|
|       3.00|           -|    118185|
|       2.75|           -|     90519|
|       1.00|           -|     63953|
|        INC|        3.00|     35198|
|        INC|        5.00|     16183|
|       5.00|           -|     15324|
|        DRP|           -|      2991|
|         OG|           -|      1831|
|        INC|           -|      1787|
|        INC|        2.50|      1474|
|        INC|        2.75|      1443|
|        INC|        2.00|      1307|
|        INC|        1.75|       918|
|        INC|        2.25|       902|
|        INC|        1.50|       878|
|        INC|        1.25|       325|
|       4.00|        3.00|       119|
|       1.25

In [220]:
total_count = df.count()
print(f"Total instances: {total_count}")

Total instances: 2013588


CREATING grade_numeric and grade_classification COLUMNS and inserting values accordingly

In [221]:
from pyspark.sql.functions import when, col, regexp_extract, lit, coalesce, isnull, format_number

# 1. Filter out rows with invalid grade_final values (excluding "-", "--")
invalid_grades = ["PASSED", "P", "OG", "F"]
df = df.filter(~col("grade_final").isin(invalid_grades))

# 2. Create the 'grade_numeric' column
df = df.withColumn(
    "grade_numeric",
    when(
        (df["grade_reexam"].isNotNull()) & (~df["grade_reexam"].isin(invalid_grades)) & (df["grade_reexam"].rlike(r"^\d+\.?\d*$")),
        regexp_extract(df["grade_reexam"], r"(\d+\.?\d*)", 1)
    ).when(
        (~df["grade_final"].isin("DRP", "INC")) & (df["grade_final"].rlike(r"^\d+\.?\d*$")),
        regexp_extract(df["grade_final"], r"(\d+\.?\d*)", 1)
    ).when(
       (df["grade_final"] == "INC") & (df["grade_reexam"].isNotNull()) & (~df["grade_reexam"].isin(invalid_grades)) & (df["grade_reexam"].rlike(r"^\d+\.?\d*$")),
        regexp_extract(df["grade_reexam"], r"(\d+\.?\d*)", 1)
    ).when(df["grade_final"] == "DRP", 0
    )
    .otherwise(None)
).withColumn("grade_numeric", col("grade_numeric").cast("double")).fillna({"grade_numeric": 5})


# 3. Filter valid numeric grades
valid_numeric_grades = ['1.00','1.25','1.50','1.75','2.00','2.25','2.50','2.75','3.00','4.00','5.00']
df = df.filter(
    when(~df["grade_numeric"].isin(0.0), df["grade_numeric"].isin([float(x) for x in valid_numeric_grades])).otherwise(True)
)


# Format grade_numeric to two decimal places x.xx
df = df.withColumn("grade_numeric", format_number("grade_numeric", 2))

# 4. Create the 'grade_classification' column
df = df.withColumn(
    "grade_classification",
    when(
        (df["grade_final"].rlike(r"^\d+\.\d+$")) & (isnull(df["grade_reexam"]) | df["grade_reexam"].isin(invalid_grades)),
        'NORMAL')
    .when(
        (df["grade_final"].isin(["-", "--"])) & (df["grade_reexam"].rlike(r"^\d+\.\d+$")),
        'NORMAL')
    .when(
        (df["grade_final"] == "INC") & (df["grade_reexam"].rlike(r"^\d+\.\d+$")),
        'INC')
    .when(
        df["grade_final"] == "DRP",
        'DROP')
    .when(
        ((df["grade_final"].rlike(r"^\d+\.\d+$")) & (df["grade_reexam"] == "INC")) |
        ((df["grade_final"].rlike(r"^\d+\.\d+$")) & (df["grade_reexam"].isin(invalid_grades))),
        'INC')
    .when(
        (df["grade_final"].rlike(r"^\d+\.\d+$")) & (~df["grade_reexam"].rlike(r"^\d+\.\d+$")),
        'NORMAL')
    .when(
        (df["grade_final"].rlike(r"^\d+\.\d+$")) & (df["grade_reexam"].rlike(r"^\d+\.\d+$")) &
        (col("grade_final").cast("double") > col("grade_reexam").cast("double")),
        'IMPROVED(REEXAM)')
    .when(
        (df["grade_final"].rlike(r"^\d+\.\d+$")) & (df["grade_reexam"].rlike(r"^\d+\.\d+$")) &
        (col("grade_final").cast("double") < col("grade_reexam").cast("double")),
        'FAILED(REEXAM)')
            
    .otherwise("INVALID")
)


In [222]:
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: string (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- yearlevel: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- class_section: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: string (nullable = true)
 |-- grade_classification: string (nullable = false)



In [223]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, grade_reexam, grade_numeric, grade_classification, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY grade_final, grade_reexam, grade_numeric, grade_classification
                         ORDER BY occurences DESC""")
show_data.show(80)

+-----------+------------+-------------+--------------------+----------+
|grade_final|grade_reexam|grade_numeric|grade_classification|occurences|
+-----------+------------+-------------+--------------------+----------+
|       1.50|           -|         1.50|              NORMAL|    453614|
|       1.25|           -|         1.25|              NORMAL|    319500|
|       2.00|           -|         2.00|              NORMAL|    289276|
|       1.75|           -|         1.75|              NORMAL|    260019|
|       2.50|           -|         2.50|              NORMAL|    177235|
|       2.25|           -|         2.25|              NORMAL|    160176|
|       3.00|           -|         3.00|              NORMAL|    118185|
|       2.75|           -|         2.75|              NORMAL|     90519|
|       1.00|           -|         1.00|              NORMAL|     63953|
|        INC|        3.00|         3.00|                 INC|     35198|
|        INC|        5.00|         5.00|           

In [224]:
total_count = df.count()
print(f"Total instances: {total_count}")

Total instances: 2011728


FILTER OUT (OG, P, Passed, F, DRP)

In [225]:
# Filter out the data that is not included on the list
allowed_data = ['1.00','1.25','1.50',
                '1.75','2.00','2.25',
                '2.50','2.75','3.00',
                '4.00','5.00', 'INC', 'DRP']

In [226]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT program, count(*)
                         FROM sample_data_ai_with_id
                         WHERE program = 'NULL'
                         GROUP BY program """)
show_data.show(50)

+-------+--------+
|program|count(1)|
+-------+--------+
|   NULL|    8423|
+-------+--------+



DROP programs with "NULL" strings

In [227]:
df = df.filter(col('program') != 'NULL')

CHECK FOR NULL STRINGS IN program COLUMN

In [228]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT program, count(*)
                         FROM sample_data_ai_with_id
                         WHERE program = 'NULL'
                         GROUP BY program """)
show_data.show(50)

+-------+--------+
|program|count(1)|
+-------+--------+
+-------+--------+



In [229]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, grade_reexam, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         WHERE grade_reexam = '-' OR grade_reexam = '--' OR grade_reexam = 'NULL' OR grade_reexam IS NULL
                         GROUP BY grade_final, grade_reexam 
                         ORDER BY occurences DESC""")
show_data.show(50)

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|       1.50|           -|    451708|
|       1.25|           -|    318034|
|       2.00|           -|    288104|
|       1.75|           -|    258262|
|       2.50|           -|    176806|
|       2.25|           -|    159492|
|       3.00|           -|    118091|
|       2.75|           -|     90327|
|       1.00|           -|     63339|
|       5.00|           -|     15324|
|        DRP|           -|      2991|
|        INC|           -|      1786|
|       1.25|          --|        36|
|       1.50|          --|        29|
|       2.50|        null|        22|
|       3.00|        null|        18|
|       2.75|        null|        14|
|       1.00|          --|        11|
|       2.00|        null|        11|
|          -|           -|        11|
|       2.25|        null|         9|
|       3.00|          --|         8|
|       1.75|          --|         8|
|        INC

ONLY ALLOW NUMERICAL STRINGS IN GRADE_REEXAM COLUMN

In [230]:
df = df.withColumn(
    "grade_reexam", when(col("grade_reexam").rlike("^[0-9]*\.?[0-9]+$"), col("grade_reexam")).otherwise(None)
)

In [231]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, grade_reexam, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         WHERE grade_final = 'INC' OR grade_final = 'DRP' 
                         GROUP BY grade_final, grade_reexam 
                         ORDER BY occurences DESC""")
show_data.show(22)

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|        INC|        3.00|     35192|
|        INC|        5.00|     16182|
|        DRP|        null|      2994|
|        INC|        null|      1797|
|        INC|        2.50|      1472|
|        INC|        2.75|      1439|
|        INC|        2.00|      1305|
|        INC|        1.75|       917|
|        INC|        2.25|       902|
|        INC|        1.50|       876|
|        INC|        1.25|       324|
|        INC|        1.00|        50|
+-----------+------------+----------+



In [232]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_final, grade_reexam, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id
                         GROUP BY grade_final, grade_reexam 
                         ORDER BY occurences DESC""")
show_data.show(22)

+-----------+------------+----------+
|grade_final|grade_reexam|occurences|
+-----------+------------+----------+
|       1.50|        null|    451744|
|       1.25|        null|    318071|
|       2.00|        null|    288120|
|       1.75|        null|    258276|
|       2.50|        null|    176832|
|       2.25|        null|    159503|
|       3.00|        null|    118117|
|       2.75|        null|     90348|
|       1.00|        null|     63350|
|        INC|        3.00|     35192|
|        INC|        5.00|     16182|
|       5.00|        null|     15332|
|        DRP|        null|      2994|
|        INC|        null|      1797|
|        INC|        2.50|      1472|
|        INC|        2.75|      1439|
|        INC|        2.00|      1305|
|        INC|        1.75|       917|
|        INC|        2.25|       902|
|        INC|        1.50|       876|
|        INC|        1.25|       324|
|       4.00|        3.00|       118|
+-----------+------------+----------+
only showing

In [233]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT schoolyear, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         WHERE schoolyear = '2023-2024_x' 
                         GROUP BY schoolyear
                         ORDER BY occurences DESC""")
show_data.show(22)

+----------+----------+
|schoolyear|occurences|
+----------+----------+
+----------+----------+



In [234]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT campus, COUNT(*) AS occurences 
                         FROM sample_data_ai_with_id 
                         GROUP BY campus
                         ORDER BY occurences DESC""")

show_data.show()

+---------+----------+
|   campus|occurences|
+---------+----------+
|ALANGILAN|    745223|
|       PB|    534792|
|  NASUGBU|    239573|
|   MALVAR|    188907|
|     LIPA|    155496|
|  ROSARIO|     41678|
|   LEMERY|     36762|
|  BALAYAN|     27499|
| SAN JUAN|     16040|
|     LOBO|     12597|
|   MABINI|      4735|
|     null|         3|
+---------+----------+



In [236]:
from pyspark.sql.types import DecimalType

filtered_df = df.withColumn("units", df.units.cast("int"))\
    .withColumn("grade_numeric", df.grade_numeric.cast(DecimalType(5, 2)))


In [237]:
filtered_df.createOrReplaceTempView('data')
show_data = spark.sql("""SELECT * 
                         FROM data
                        WHERE schoolyear = '2024-2025' AND grade_final = 'INC' AND grade_reexam is NULL
                        """)

show_data.show()

+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+--------------------+---------+----------+---------------+-----------+------------+------+-------------+--------------------+
|     id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|   campus|             program|               major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|
+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+--------------------+---------+----------+---------------+-----------+------------+------+-------------+--------------------+
|2013649| 2024-2025|   FIRST|   ChE 422|Chemical Engineer...|    2|       -10621|LN-10621, FN-10621| 78369|LN78369, FN78369|ALANGILAN|BS Chemical Engin...|  

DROP RECORDS WITH 0.00 as grade_numeric AND RECORDS WITH INC-NULL AS grade_numeric WITH THE CURRENT YEAR IN SCHOOLYEAR

In [238]:
from datetime import datetime

current_year = datetime.now().year

# Drop rows where grade_final is INC and reexam is null
filtered_df = filtered_df.filter(
    ~((col('grade_final') == 'INC') & (col('grade_reexam').isNull()) & (col('schoolyear').contains(str(current_year))))
)

filtered_df = filtered_df.filter((col('grade_numeric') != 0))

In [239]:
filtered_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- yearlevel: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- class_section: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = false)



In [240]:
filtered_df.show(20)

+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+
|     id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|   campus|             program|major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|
+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+
|2048547| 2023-2024|   FIRST|   GEd 108|    Art Appreciation|    3|       -12537|LN-12537, FN-12537| 20663|LN20663, FN20663|       PB|          BS Nursing| null|   SECOND| 2018-2019|       BSN-2108|    

In [241]:
filtered_df.createOrReplaceTempView('data')
show_data = spark.sql("""SELECT * 
                         FROM data
                        WHERE schoolyear = '2024-2025' AND grade_final = 'INC' AND grade_reexam is NULL
                        """)

show_data.show()

+---+----------+--------+----+-----------+-----+-------------+---------------+------+--------+------+-------+-----+---------+----------+-------------+-----------+------------+------+-------------+--------------------+
| id|schoolyear|semester|code|description|units|instructor_id|instructor_name|srcode|fullname|campus|program|major|yearlevel|curriculum|class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|
+---+----------+--------+----+-----------+-----+-------------+---------------+------+--------+------+-------+-----+---------+----------+-------------+-----------+------------+------+-------------+--------------------+
+---+----------+--------+----+-----------+-----+-------------+---------------+------+--------+------+-------+-----+---------+----------+-------------+-----------+------------+------+-------------+--------------------+



In [242]:
filtered_df.count()

1998696

GET VALID SCHOOLYEAR

In [243]:
from pyspark.sql.functions import split, col, trim, when

# Get distinct schoolyear values
distinct_years_df = raw_df.select("schoolyear").distinct()

# Optionally, show all distinct school years
distinct_years_df.show(truncate=False)

# Extract the starting year from the 'schoolyear' column
# Assumes format like "2021-2022"
distinct_years_df = distinct_years_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))

# Filter for school years starting from 2006 (or later)
#valid_years_df = distinct_years_df.filter(col("start_year") >= 2006)

valid_years_df = distinct_years_df.orderBy("start_year")

# Collect the valid schoolyear values as a list
valid_schoolyears = [row.schoolyear for row in valid_years_df.collect()]
print("Valid schoolyears:", valid_schoolyears)

+----------+
|schoolyear|
+----------+
|2022-2023 |
|2021-2022 |
|1999-2000 |
|2000-2001 |
|2003-2004 |
|2019-2022 |
|2011-2012 |
|2012-2013 |
|2004-2005 |
|2013-2014 |
|2016-2017 |
|2010-2011 |
|2007-2008 |
|2014-2015 |
|2004      |
|2006-2007 |
|2017-2018 |
|2002-2003 |
|2019-2020 |
|2018-2019 |
+----------+
only showing top 20 rows

Valid schoolyears: ['1999-2000', '2000-2001', '2001-2002', '2002-2003', '2003-2004', '2004-2005', '2004', '2005-2006', '2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2022', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


CREATE AND ALLOCATE YEARSEM column

In [244]:
from pyspark.sql.functions import split, col, when, concat_ws

# Define the semesters you're interested in
semesters = ["FIRST", "SECOND", "SUMMER"]

# Filter the DataFrame on the valid schoolyears, semesters, and ensure grade_numeric is not null
filtered_df = filtered_df.filter(
    (col("schoolyear").isin(valid_schoolyears)) &
    (col("semester").isin(semesters)) &
    (col("grade_numeric").isNotNull())
)

# Extract the starting year from the 'schoolyear' column (assumes format "2021-2022")
filtered_df = filtered_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))


# Create the 'yearsem' column by concatenating schoolyear and semester (e.g., "2021-2022-FIRST")
filtered_df = filtered_df.withColumn("yearsem", concat_ws("-", col("schoolyear"), col("semester")))


# Show a preview of the results
filtered_df.show(500, truncate=False)

+-------+----------+--------+------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+-------------+------------------+------+----------------+---------+---------------------------------------------+-------------------------+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+
|id     |schoolyear|semester|code        |description                                                                                                                                                                  |units|instructor_id|instructor_name   |srcode|fullname        |campus   |program                                      |major                    |yearlevel|curriculum|class_section  |grade_final|grade_reexam|status|grade_numeric|grade_classification|start_year|yearsem         |
+-------+---

ADD PROGRAM ID

In [245]:
# First, read the CSV file with program mappings
program_mapping_df = spark.read.csv("C:\LEONAIDAS\program_with_id.csv", header=True)

# Convert DataFrame to dictionary with program_id as key (convert to int)
program_dict = {int(row['program_id']): row['program'] for row in program_mapping_df.collect()}

# Create a reverse dictionary for lookup, handling multiple program names
reverse_program_dict = {}
for program_id, program_name in program_dict.items():
    if isinstance(program_name, list):
        # If program_name is a list, add each name separately
        for name in program_name:
            reverse_program_dict[name] = program_id
    else:
        # Single program name
        reverse_program_dict[program_name] = program_id

# Add the special case for program_id 101
program_names_101 = [
    "Master of Arts in Education major in Pagtuturo ng Filipino",
    "Master of Arts in Education major in Filipino"
]
for name in program_names_101:
    reverse_program_dict[name] = 101

# Create a mapping function
def get_program_id(program):
    return reverse_program_dict.get(program)

# Register the UDF (User Defined Function)
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

get_program_id_udf = udf(get_program_id, IntegerType())

# Add the new column to your DataFrame
filtered_df = filtered_df.withColumn("program_id", get_program_id_udf(col("program")))

# Show the results
filtered_df.select("srcode", "yearsem", "grade_numeric", "program", "program_id").show(5)

# Optionally verify the mapping
filtered_df.groupBy("program", "program_id").count().orderBy("program_id").show(500, truncate=False)

+------+----------------+-------------+--------------------+----------+
|srcode|         yearsem|grade_numeric|             program|program_id|
+------+----------------+-------------+--------------------+----------+
| 21258|2023-2024-SECOND|         1.50|BS Business Admin...|        32|
| 21258|2022-2023-SECOND|         1.50|BS Business Admin...|        32|
| 21258|2022-2023-SECOND|         1.50|BS Business Admin...|        32|
| 21258|2022-2023-SECOND|         1.50|BS Business Admin...|        32|
| 21258|2022-2023-SECOND|         1.50|BS Business Admin...|        32|
+------+----------------+-------------+--------------------+----------+
only showing top 5 rows

+---------------------------------------------------------------------------------+----------+------+
|program                                                                          |program_id|count |
+---------------------------------------------------------------------------------+----------+------+
|Bachelor of Arts  in

In [246]:
filtered_df.show(100)

+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
|     id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|   campus|             program|major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|start_year|         yearsem|program_id|
+-------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
| 130458| 2024-2025|   FIRST|   GEd 108|    Art Appreciation|    3|       -13686|LN-1

In [247]:
# If you want a single CSV file instead of multiple parts
filtered_df.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/FINAL CLEANED GRADES WITH ID.csv")

CREATE A TEMPORARY DATABASE THAT WRITES THE FILTERED_DF INTO POSTGRESQL

In [175]:
filtered_df.write \
        .format("jdbc") \
        .option("url", "jdbc:postgresql://localhost:5432/local_student_grades") \
        .option("dbtable", "cleaned_grades_with_updated_id") \
        .option("user", "postgres") \
        .option("password", "password") \
        .option("driver", properties["driver"]) \
        .mode("overwrite") \
        .save()

In [70]:
df.createOrReplaceTempView('sample_data_ai_with_id')
show_data = spark.sql("""SELECT grade_numeric
                         FROM sample_data_ai_with_id """)
show_data.show(22)

+-------------+
|grade_numeric|
+-------------+
|         2.00|
|         1.50|
|         1.00|
|         1.25|
|         1.50|
|         1.75|
|         1.50|
|         1.25|
|         1.25|
|         1.25|
|         1.50|
|         1.00|
|         1.25|
|         1.50|
|         1.50|
|         1.25|
|         2.50|
|         2.50|
|         2.75|
|         2.25|
|         2.50|
|         3.00|
+-------------+
only showing top 22 rows



In [71]:
filtered_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = false)



PSEUDOCODE FOR CREATING TRAINING DFs

In [None]:

converted_df = df.franz

for program in converted_df.program:
    for srcode in converted_df.srcode:
        for schoolyear_semester in converted_df.schoolyear_semester:
            for avg_type in converted_df.avg_type:
                #compute GWA
                if min:
                    #use min_grades
                    #insert min_avg
                    pass
                elif mid:
                    #use mid_grades
                    #insert mid_avg
                    pass
                else:
                    #use max_grades
                    #insert max_avg
                    pass
                
    df.create()

In [248]:
parquet_file_path = r"C:\Users\denve\Downloads\f54b49af-2cb2-435e-90e1-1d60cccd23b5-0_0-404-450_20250304102259405.parquet"
parquet_df = spark.read.parquet(parquet_file_path)

In [249]:
parquet_df.show()

+-------------------+--------------------+------------------+----------------------+--------------------+------+---------------+---------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|srcode|total_semesters|final_gwa|program_id|
+-------------------+--------------------+------------------+----------------------+--------------------+------+---------------+---------+----------+
|  20250304102259405|20250304102259405...|            100005|                      |f54b49af-2cb2-435...|100005|             10|   1.0750|        98|
|  20250304102259405|20250304102259405...|            100010|                      |f54b49af-2cb2-435...|100010|             68|   2.7530|        51|
|  20250304102259405|20250304102259405...|            100016|                      |f54b49af-2cb2-435...|100016|             49|   2.1141|        65|
|  20250304102259405|20250304102259405...|            100108|                      |f54b49af-2cb2-43

In [None]:
parquet_df.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/converted_parquet_to_csv")