In [60]:
from pyspark.sql import SparkSession

#jdbc_url = "jdbc:postgresql://localhost:5432/local_student_grades"
jdbc_url = "jdbc:postgresql://192.168.20.11:5432/demo_db"
properties = {
    "user": "postgres", 
    "password": "postgres",  
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000"
}
postgres_driver_path = "C:\postgresql-42.7.5.jar"



def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """ Extract data from PostgreSQL database using Spark."""
    # Initialize Spark session
    spark = SparkSession.builder \
        .appName("Postgres Connection") \
        .config("spark.jars", postgres_driver_path) \
        .config("spark.driver.memory", "8g") \
        .config("spark.executor.memory", "8g") \
        .config("spark.executor.memoryOverhead", "2g") \
        .config("spark.driver.memoryOverhead", "2g") \
        .getOrCreate()

    # Extracts data from PostgreSQL database
    df = spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=properties,
    )
    
    return df, spark

In [61]:
# Extract data once
df, spark = extract(jdbc_url, "filtered_data_with_id", properties, postgres_driver_path)

# Check the number of rows
row_count = df.count()
print(f'Number of rows: {row_count}')

# Print schema
df.printSchema()

Number of rows: 1999679
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = true)



In [62]:
df.createOrReplaceTempView('filtered_data_with_id')
show_data = spark.sql("""select * from filtered_data_with_id where srcode = '107764'""")
show_data.show(50)

+-------+----------+--------+----------+--------------------+-------+-------------+--------------+------+------------------+---------+--------------------+--------------------+-----------+------------+------+-------------+--------------------+
|     id|schoolyear|semester|      code|         description|credits|instructor_id|    instructor|srcode|          fullname|   campus|             college|             program|grade_final|grade_reexam|status|grade_numeric|grade_classification|
+-------+----------+--------+----------+--------------------+-------+-------------+--------------+------+------------------+---------+--------------------+--------------------+-----------+------------+------+-------------+--------------------+
|1444611| 2023-2024|  SECOND|   CpE 401|Computer Programm...|      1|         4067|LN4067, FN4067|107764|LN107764, FN107764|ALANGILAN|College of Engine...|BS Electrical Eng...|       1.00|        null|PASSED|         1.00|              NORMAL|
|1444608| 2023-2024|  SE

REMOVE ALL RECORDS ASSOCIATED WITH PREVIOUS PROGRAMS (SHIFTERS)

In [63]:
from pyspark.sql.functions import col, count, max, row_number
from pyspark.sql.window import Window

def remove_previous_programs(df):
    #Remove records of previous programs for shifters.
    
    # Create a window spec partitioned by srcode ordered by schoolyear desc
    window_spec = Window.partitionBy("srcode").orderBy(col("schoolyear").desc())
    
    # Get the most recent program for each student
    latest_programs = df.withColumn("row_number", row_number().over(window_spec)) \
        .filter(col("row_number") == 1) \
        .select("srcode", "program", "schoolyear") \
        .withColumnRenamed("srcode", "latest_srcode") \
        .withColumnRenamed("program", "latest_program") \
        .withColumnRenamed("schoolyear", "latest_schoolyear")
    
    # Join with original dataframe to keep only records with the most recent program
    filtered_df = df.join(
        latest_programs,
        (df.srcode == latest_programs.latest_srcode) & 
        (df.program == latest_programs.latest_program),
        "inner"
    ).drop("latest_srcode", "latest_program", "latest_schoolyear")
    
    return filtered_df

# Identify shifters (students with multiple programs)
def get_shifters(df):
    """Get list of students who have shifted programs."""
    return df.groupBy("srcode") \
        .agg(
            count("program").alias("program_count"),
            max("program").alias("current_program")
        ) \
        .filter(col("program_count") > 1)

# Example usage:
shifters = get_shifters(df)
cleaned_df = remove_previous_programs(df)

# Show statistics
print("Number of shifters:", shifters.count())
shifters.show()

Number of shifters: 58329
+------+-------------+------------------------------------------------------------+
|srcode|program_count|current_program                                             |
+------+-------------+------------------------------------------------------------+
|100005|10           |Master of Arts in Education major in Educational Management |
|100010|72           |BS Hospitality Management                                   |
|100016|50           |BS Psychology                                               |
|100108|12           |Master of Arts in Education major in Social Studies Teaching|
|100113|45           |BS Psychology                                               |
+------+-------------+------------------------------------------------------------+
only showing top 5 rows



In [69]:
cleaned_df.createOrReplaceTempView('filtered_data_with_id')
show_data = spark.sql("""select * from filtered_data_with_id where srcode = '107764' order by schoolyear, semester asc""")
show_data.show(50)

+-------+----------+--------+----------+--------------------+-------+-------------+--------------+------+------------------+---------+--------------------+--------------------+-----------+------------+------+-------------+--------------------+
|     id|schoolyear|semester|      code|         description|credits|instructor_id|    instructor|srcode|          fullname|   campus|             college|             program|grade_final|grade_reexam|status|grade_numeric|grade_classification|
+-------+----------+--------+----------+--------------------+-------+-------------+--------------+------+------------------+---------+--------------------+--------------------+-----------+------------+------+-------------+--------------------+
|1446516| 2021-2022|  SECOND|  ENGG 403|Computer-Aided De...|      1|            2|      LN2, FN2|107764|LN107764, FN107764|ALANGILAN|College of Engine...|BS Electrical Eng...|       1.00|        null|PASSED|         1.00|              NORMAL|
|1446514| 2021-2022|  SE

In [65]:
'''from pyspark.sql.functions import when
# Initial count
initial_count = df.count()
print(f"Initial number of records: {initial_count}")

# After removing previous programs
cleaned_df = remove_previous_programs(df)
after_shifters_count = cleaned_df.count()
shifters_removed = initial_count - after_shifters_count
print(f"Records removed after handling shifters: {shifters_removed}")

# After semester cleaning
valid_semesters = ["FIRST", "SECOND", "SUMMER", "SUMMER2"]
cleaned_df = cleaned_df.withColumn(
    "semester",
    when(cleaned_df["semester"].isin("SECOND_X", "SECOND SEMESTER"), "SECOND")
    .when(cleaned_df["semester"].isin(valid_semesters), cleaned_df["semester"])
    .otherwise(None)
).filter(col("semester").isNotNull())

after_semester_count = cleaned_df.count()
semester_removed = after_shifters_count - after_semester_count
print(f"Records removed after cleaning semesters: {semester_removed}")

# After removing NULL programs
cleaned_df = cleaned_df.filter(col("program").isNotNull())
final_count = cleaned_df.count()
null_program_removed = after_semester_count - final_count
print(f"Records removed with NULL programs: {null_program_removed}")

# Total records removed
total_removed = initial_count - final_count
print(f"\nTotal records removed: {total_removed}")
print(f"Final number of records: {final_count}")
print(f"Percentage of data retained: {(final_count/initial_count)*100:.2f}%")'''

'from pyspark.sql.functions import when\n# Initial count\ninitial_count = df.count()\nprint(f"Initial number of records: {initial_count}")\n\n# After removing previous programs\ncleaned_df = remove_previous_programs(df)\nafter_shifters_count = cleaned_df.count()\nshifters_removed = initial_count - after_shifters_count\nprint(f"Records removed after handling shifters: {shifters_removed}")\n\n# After semester cleaning\nvalid_semesters = ["FIRST", "SECOND", "SUMMER", "SUMMER2"]\ncleaned_df = cleaned_df.withColumn(\n    "semester",\n    when(cleaned_df["semester"].isin("SECOND_X", "SECOND SEMESTER"), "SECOND")\n    .when(cleaned_df["semester"].isin(valid_semesters), cleaned_df["semester"])\n    .otherwise(None)\n).filter(col("semester").isNotNull())\n\nafter_semester_count = cleaned_df.count()\nsemester_removed = after_shifters_count - after_semester_count\nprint(f"Records removed after cleaning semesters: {semester_removed}")\n\n# After removing NULL programs\ncleaned_df = cleaned_df.filt

In [67]:
cleaned_df.count()

1950114