In [1]:
from pyspark.sql import SparkSession

import os

# JDBC connection URL of the demo PostgreSQL database.
jdbc_url = "jdbc:postgresql://192.168.20.11:5432/demo_db"

# JDBC connection properties handed to spark.read.jdbc.
# Credentials are read from the environment when set so they do not have to be
# stored in the notebook; the fallbacks keep the previous defaults working.
properties = {
    "user": os.environ.get("DEMO_DB_USER", "postgres"),
    "password": os.environ.get("DEMO_DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    # Number of rows the JDBC driver fetches per round trip.
    "fetchsize": "10000",
}

# Path to the PostgreSQL JDBC driver JAR (published at https://jdbc.postgresql.org/).
# Raw string: the backslash must stay literal, and "\p" is not a valid escape
# sequence in a normal string literal.
postgres_driver_path = r"C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path,
            partition_column=None, lower_bound=None, upper_bound=None,
            num_partitions=None):
    """Read a PostgreSQL table into a Spark DataFrame over JDBC.

    Builds (or reuses) a SparkSession configured with the JDBC driver JAR and
    fixed memory settings, then loads ``table_name`` with ``spark.read.jdbc``.

    Args:
        jdbc_url: JDBC connection URL of the database.
        table_name: Name of the table to read.
        properties: Dict of JDBC connection properties (user, password,
            driver, fetchsize, ...).
        postgres_driver_path: Path to the PostgreSQL JDBC driver JAR; passed
            to Spark as ``spark.jars``.
        partition_column: Optional column used to split the read into several
            range queries that run in parallel. When omitted, the table is
            read exactly as before, without partitioning options.
        lower_bound: Lower bound of ``partition_column`` used to compute the
            partition stride. Required together with ``partition_column``.
        upper_bound: Upper bound of ``partition_column`` used to compute the
            partition stride. Required together with ``partition_column``.
        num_partitions: Number of partitions (parallel JDBC reads). Required
            together with ``partition_column``.

    Returns:
        tuple: ``(df, spark)`` - the loaded DataFrame and the SparkSession
        that produced it.

    Raises:
        ValueError: If only some of the four partitioning arguments are given.
    """
    # Fail fast, before a Spark session is started, on a half-specified
    # partitioning request: the four options only make sense together.
    partition_args = (partition_column, lower_bound, upper_bound, num_partitions)
    supplied = [arg is not None for arg in partition_args]
    if any(supplied) and not all(supplied):
        raise ValueError(
            "partition_column, lower_bound, upper_bound and num_partitions "
            "must be provided together"
        )

    # NOTE: getOrCreate() reuses an already-active session when there is one;
    # JVM-level settings such as memory sizes and spark.jars are only
    # guaranteed to take effect when the session is first created.
    spark = (
        SparkSession.builder
        .appName("Postgres Connection")
        .config("spark.jars", postgres_driver_path)
        .config("spark.driver.memory", "8g")
        .config("spark.executor.memory", "8g")
        .config("spark.executor.memoryOverhead", "2g")
        .config("spark.driver.memoryOverhead", "2g")
        .getOrCreate()
    )

    # With all partitioning arguments left as None this is the same call as a
    # plain url/table/properties read.
    df = spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        column=partition_column,
        lowerBound=lower_bound,
        upperBound=upper_bound,
        numPartitions=num_partitions,
        properties=properties,
    )

    return df, spark

# Load the source table a single time; raw_df and spark are reused by later cells.
raw_df, spark = extract(
    jdbc_url=jdbc_url,
    table_name="filtered_data_with_id",
    properties=properties,
    postgres_driver_path=postgres_driver_path,
)

# Report how many rows were loaded.
row_count = raw_df.count()
print(f"Number of rows: {row_count}")

# Display the column names and types.
raw_df.printSchema()

Number of rows: 1999679
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = true)



In [2]:
from pyspark.sql.functions import col, count, max, row_number
from pyspark.sql.window import Window

def remove_previous_programs(df):
    """Keep only the rows that belong to each student's most recent program.

    For every ``srcode`` the most recent record is located and its ``program``
    is taken as the student's current program. Rows of that student recorded
    under any other program (e.g. before shifting) are dropped.

    "Most recent" is decided by ``schoolyear`` descending, then semester
    (SUMMER after SECOND after FIRST; any other value ranks lowest), then
    ``id`` descending. The tie-breakers make the choice deterministic:
    a student can have many rows in the same school year, and ``row_number``
    over rows that tie on the ordering picks an arbitrary one.

    Args:
        df: Spark DataFrame containing at least the ``srcode``, ``program``,
            ``schoolyear``, ``semester`` and ``id`` columns.

    Returns:
        A DataFrame with the same columns as ``df``, restricted to rows whose
        ``program`` equals the program of the student's most recent record.
    """
    # Local import: `when` is not among the names imported in this cell.
    from pyspark.sql.functions import when

    # Chronological rank of the semester inside one school year.
    semester_rank = (
        when(col("semester") == "FIRST", 1)
        .when(col("semester") == "SECOND", 2)
        .when(col("semester") == "SUMMER", 3)
        .otherwise(0)
    )

    # Newest record first within each student.
    window_spec = Window.partitionBy("srcode").orderBy(
        col("schoolyear").desc(),
        semester_rank.desc(),
        col("id").desc(),
    )

    # One row per student: the program of their most recent record.
    latest_programs = (
        df.withColumn("row_number", row_number().over(window_spec))
        .filter(col("row_number") == 1)
        .select(
            col("srcode").alias("latest_srcode"),
            col("program").alias("latest_program"),
        )
    )

    # Keep only rows matching the student's latest program, then remove the
    # helper columns so the output schema equals the input schema.
    filtered_df = df.join(
        latest_programs,
        (df.srcode == latest_programs.latest_srcode)
        & (df.program == latest_programs.latest_program),
        "inner",
    ).drop("latest_srcode", "latest_program")

    return filtered_df

# Drop each student's rows from programs other than their most recent one.
cleaned_df = remove_previous_programs(raw_df)

In [4]:
from pyspark.sql.functions import split, col, trim, when

# Distinct school years present in the raw extract
distinct_years_df = raw_df.select("schoolyear").distinct()

# Preview every distinct value without truncating it
distinct_years_df.show(truncate=False)

# Integer starting year of each school year; assumes values look like "2021-2022"
start_year_expr = split(col("schoolyear"), "-").getItem(0).cast("int")
distinct_years_df = distinct_years_df.withColumn("start_year", start_year_expr)

# A cutoff on start_year (>= 2006) used to be applied here and is currently
# disabled, so every school year is kept and only the ordering is applied.
valid_years_df = distinct_years_df.orderBy("start_year")

# Bring the ordered school years back to the driver as a plain Python list
valid_schoolyears = [year_row["schoolyear"] for year_row in valid_years_df.collect()]
print("Valid schoolyears:", valid_schoolyears)

+----------+
|schoolyear|
+----------+
|2022-2023 |
|2021-2022 |
|2011-2012 |
|2012-2013 |
|2013-2014 |
|2016-2017 |
|2010-2011 |
|2014-2015 |
|2007-2008 |
|2006-2007 |
|2017-2018 |
|2019-2020 |
|2018-2019 |
|2023-2024 |
|2009-2010 |
|2024-2025 |
|2008-2009 |
|2020-2021 |
|2015-2016 |
+----------+

Valid schoolyears: ['2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


In [7]:
from pyspark.sql.functions import split, col, when, concat_ws

# Semesters kept in the analysis
semesters = ["FIRST", "SECOND", "SUMMER"]

# Row-level conditions: known school year, known semester, numeric grade present
has_valid_year = col("schoolyear").isin(valid_schoolyears)
has_valid_semester = col("semester").isin(semesters)
has_grade = col("grade_numeric").isNotNull()

# Rank of each semester within a school year (FIRST, SECOND, SUMMER)
sem_order_expr = (
    when(col("semester") == "FIRST", 1)
    .when(col("semester") == "SECOND", 2)
    .when(col("semester") == "SUMMER", 3)
)

filtered_df = (
    cleaned_df
    .filter(has_valid_year & has_valid_semester & has_grade)
    # Integer starting year; assumes schoolyear looks like "2021-2022"
    .withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))
    .withColumn("sem_order", sem_order_expr)
    # Combined label, e.g. "2021-2022-FIRST"
    .withColumn("yearsem", concat_ws("-", col("schoolyear"), col("semester")))
)

# Sort per student by year, then semester rank, then course description,
# and keep only the columns needed downstream
output_df = (
    filtered_df
    .orderBy("srcode", "start_year", "sem_order", "description")
    .select("srcode", "yearsem", "grade_numeric", "program")
)

# Preview the first 500 rows without truncating long values
output_df.show(500, truncate=False)

+------+----------------+-------------+------------------------------------------------------------+
|srcode|yearsem         |grade_numeric|program                                                     |
+------+----------------+-------------+------------------------------------------------------------+
|100005|2020-2021-FIRST |1.00         |Master of Arts in Education major in Educational Management |
|100005|2020-2021-FIRST |1.25         |Master of Arts in Education major in Educational Management |
|100005|2020-2021-FIRST |1.00         |Master of Arts in Education major in Educational Management |
|100005|2020-2021-SECOND|1.00         |Master of Arts in Education major in Educational Management |
|100005|2020-2021-SECOND|1.00         |Master of Arts in Education major in Educational Management |
|100005|2020-2021-SECOND|1.25         |Master of Arts in Education major in Educational Management |
|100005|2021-2022-FIRST |1.00         |Master of Arts in Education major in Educational Man

In [16]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Read the CSV file that maps each program name to its numeric id.
# Raw string: the Windows path contains backslashes, and "\L" / "\p" are not
# valid escape sequences in a normal string literal.
program_mapping_df = spark.read.csv(r"C:\LEONAIDAS\program_with_id.csv", header=True)

# program_id (converted to int) -> program name
program_dict = {int(row['program_id']): row['program'] for row in program_mapping_df.collect()}

# program name -> program_id, used by the lookup below
reverse_program_dict = {v: k for k, v in program_dict.items()}


def get_program_id(program):
    """Return the id mapped to ``program``, or None when the name is not in the mapping."""
    return reverse_program_dict.get(program)


# Wrap the lookup as a Spark UDF producing an integer column
get_program_id_udf = udf(get_program_id, IntegerType())

# Add the program_id column next to the program name
output_df = output_df.withColumn("program_id", get_program_id_udf(col("program")))

# Show a small sample of the result
output_df.select("srcode", "yearsem", "grade_numeric", "program", "program_id").show(5)

# Verify the mapping: row count per (program, program_id) pair
output_df.groupBy("program", "program_id").count().orderBy("program_id").show(500, truncate=False)

+------+----------------+-------------+--------------------+----------+
|srcode|         yearsem|grade_numeric|             program|program_id|
+------+----------------+-------------+--------------------+----------+
|100005| 2020-2021-FIRST|         1.00|Master of Arts in...|        98|
|100005| 2020-2021-FIRST|         1.25|Master of Arts in...|        98|
|100005| 2020-2021-FIRST|         1.00|Master of Arts in...|        98|
|100005|2020-2021-SECOND|         1.00|Master of Arts in...|        98|
|100005|2020-2021-SECOND|         1.00|Master of Arts in...|        98|
+------+----------------+-------------+--------------------+----------+
only showing top 5 rows

+---------------------------------------------------------------------------------+----------+------+
|program                                                                          |program_id|count |
+---------------------------------------------------------------------------------+----------+------+
|Master of Arts in Ed

In [15]:


# Replace the entry for id 101 with a list of two program names; the other
# values visible in the displayed dict are single strings.
# NOTE(review): only program_dict is changed here. reverse_program_dict (the
# name -> id lookup built from program_dict) is not updated by this cell, and
# a list cannot serve as a dict key if that reverse mapping is rebuilt from
# this dict. Confirm how the second name is meant to resolve to id 101.
program_dict[101] = ["Master of Arts in Education major in Pagtuturo ng Filipino", "Master of Arts in Education major in Filipino"]
# Display the resulting mapping
program_dict

{1: 'Bachelor of Arts  in Communication',
 2: 'Bachelor of Arts in English Language Studies',
 3: 'Bachelor of Automotive Engineering Technology',
 4: 'Bachelor of Civil Engineering Technology',
 5: 'Bachelor of Computer Engineering Technology',
 6: 'Bachelor of Drafting Engineering Technology',
 7: 'Bachelor of Early Childhood Education',
 8: 'Bachelor of Electrical Engineering Technology',
 9: 'Bachelor of Electronics Engineering Technology',
 10: 'Bachelor of Elementary Education',
 11: 'Bachelor of Fine Arts and Design',
 12: 'Bachelor of Food Engineering Technology',
 13: 'Bachelor of Industrial Technology',
 14: 'Bachelor of Instrumentation and Control Engineering Technology',
 15: 'Bachelor of Laws',
 16: 'Bachelor of Mechanical Engineering Technology',
 17: 'Bachelor of Mechatronics Engineering Technology',
 18: 'Bachelor of Physical Education',
 19: 'Bachelor of Public Administration',
 20: 'Bachelor of Secondary Education',
 21: 'Bachelor of Technical-Vocational Teacher Educa