In [4]:
from pyspark.sql import SparkSession

warehouse_path = "file:///C:/tmp/spark_warehouse"

jdbc_url = "jdbc:postgresql://192.168.20.11:5432/caist_db_v1"
#jdbc_url = "jdbc:postgresql://localhost:5432/local_student_grades"
properties = {
    "user": "postgres", 
    "password": "postgres",  
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000"
}

#DOWNLOAD FROM ORACLE
postgres_driver_path = "C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """ Extract data from PostgreSQL database using Spark."""
    # Initialize Spark session
    # Initialize Spark session
    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Hudi Batch Write") \
        .config("spark.jars", postgres_driver_path) \
        .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.sql.hive.convertMetastoreParquet", "false") \
        .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .config("spark.executor.memoryOverhead", "1g") \
        .config("spark.driver.memoryOverhead", "1g") \
        .config("spark.sql.warehouse.dir", warehouse_path) \
        .getOrCreate()

    # Extracts data from PostgreSQL database
    df = spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=properties,
    )
    
    return df, spark

# Extract data once
raw_df, spark = extract(jdbc_url, "processed_grades", properties, postgres_driver_path)

# Check the number of rows
row_count = raw_df.count()
print(f'Number of rows: {row_count}')

# Print schema
raw_df.printSchema()

Number of rows: 1998648
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- yearlevel: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- class_section: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(10,2) (nullable = true)
 |-- grade_classification: string (nullable = true)
 |-- start_year: integer (nullable = true)
 |-- year_sem: string (nullable = true)
 |-- program_

In [5]:
from pyspark.sql.functions import col, count, max, row_number
from pyspark.sql.window import Window

def remove_previous_programs(df):
    #Remove records of previous programs for shifters.
    
    # Create a window spec partitioned by srcode ordered by schoolyear desc
    window_spec = Window.partitionBy("srcode").orderBy(col("schoolyear").desc())
    
    # Get the most recent program for each student
    latest_programs = df.withColumn("row_number", row_number().over(window_spec)) \
        .filter(col("row_number") == 1) \
        .select("srcode", "program", "schoolyear") \
        .withColumnRenamed("srcode", "latest_srcode") \
        .withColumnRenamed("program", "latest_program") \
        .withColumnRenamed("schoolyear", "latest_schoolyear")
    
    # Join with original dataframe to keep only records with the most recent program
    filtered_df = df.join(
        latest_programs,
        (df.srcode == latest_programs.latest_srcode) & 
        (df.program == latest_programs.latest_program),
        "inner"
    ).drop("latest_srcode", "latest_program", "latest_schoolyear")
    
    return filtered_df

cleaned_df = remove_previous_programs(raw_df)

In [6]:
cleaned_df.show()

+------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
|    id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|   campus|             program|major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|start_year|        year_sem|program_id|
+------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+---------+--------------------+-----+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
|181503| 2023-2024|  SUMMER|   GEd 103|Life and Works of...|    3|       -11299|LN-11299

In [7]:
from pyspark.sql.functions import split, col, trim, when

# Get distinct schoolyear values
distinct_years_df = raw_df.select("schoolyear").distinct()

# Optionally, show all distinct school years
distinct_years_df.show(truncate=False)

# Extract the starting year from the 'schoolyear' column
# Assumes format like "2021-2022"
distinct_years_df = distinct_years_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))

# Filter for school years starting from 2006 (or later)
#valid_years_df = distinct_years_df.filter(col("start_year") >= 2006)

valid_years_df = distinct_years_df.orderBy("start_year")

# Collect the valid schoolyear values as a list
valid_schoolyears = [row.schoolyear for row in valid_years_df.collect()]
print("Valid schoolyears:", valid_schoolyears)

+----------+
|schoolyear|
+----------+
|2022-2023 |
|2021-2022 |
|2011-2012 |
|2012-2013 |
|2013-2014 |
|2016-2017 |
|2010-2011 |
|2014-2015 |
|2007-2008 |
|2006-2007 |
|2017-2018 |
|2019-2020 |
|2018-2019 |
|2023-2024 |
|2009-2010 |
|2024-2025 |
|2008-2009 |
|2020-2021 |
|2015-2016 |
+----------+

Valid schoolyears: ['2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


In [8]:
from pyspark.sql.functions import split, col, when, concat_ws

# Define the semesters you're interested in
semesters = ["FIRST", "SECOND", "SUMMER"]

# Filter the DataFrame on the valid schoolyears, semesters, and ensure grade_numeric is not null
filtered_df = cleaned_df.filter(
    (col("schoolyear").isin(valid_schoolyears)) &
    (col("semester").isin(semesters)) &
    (col("grade_numeric").isNotNull())
)

# Extract the starting year from the 'schoolyear' column (assumes format "2021-2022")
filtered_df = filtered_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))

# Create a custom order for the semester column
filtered_df = filtered_df.withColumn(
    "sem_order",
    when(col("semester") == "FIRST", 1)
    .when(col("semester") == "SECOND", 2)
    .when(col("semester") == "SUMMER", 3)
)

# Create the 'yearsem' column by concatenating schoolyear and semester (e.g., "2021-2022-FIRST")
filtered_df = filtered_df.withColumn("yearsem", concat_ws("-", col("schoolyear"), col("semester")))

# Order by start_year and then by sem_order to get FIRST, SECOND, SUMMER in that order for each year
output_df = filtered_df.select("srcode", "yearsem", "grade_numeric", "program").orderBy("srcode","start_year", "sem_order", "description")

# Show a preview of the results
output_df.show(500, truncate=False)

+------+----------------+-------------+--------------------------------------------------+
|srcode|yearsem         |grade_numeric|program                                           |
+------+----------------+-------------+--------------------------------------------------+
|20001 |2023-2024-FIRST |1.50         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |1.50         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |1.75         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |1.50         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |2.25         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |2.50         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |2.50         |Bachelor of Technical-Vocational Teacher Education|
|20001 |2023-2024-FIRST |2.00         |Bachelor of Technical-Vocational Teacher Education|

In [9]:
# First, read the CSV file with program mappings
program_mapping_df = spark.read.csv("C:\LEONAIDAS\program_with_id.csv", header=True)

# Convert DataFrame to dictionary with program_id as key (convert to int)
program_dict = {int(row['program_id']): row['program'] for row in program_mapping_df.collect()}

# Create a reverse dictionary for lookup, handling multiple program names
reverse_program_dict = {}
for program_id, program_name in program_dict.items():
    if isinstance(program_name, list):
        # If program_name is a list, add each name separately
        for name in program_name:
            reverse_program_dict[name] = program_id
    else:
        # Single program name
        reverse_program_dict[program_name] = program_id

# Add the special case for program_id 101
program_names_101 = [
    "Master of Arts in Education major in Pagtuturo ng Filipino",
    "Master of Arts in Education major in Filipino"
]
for name in program_names_101:
    reverse_program_dict[name] = 101

# Create a mapping function
def get_program_id(program):
    return reverse_program_dict.get(program)

# Register the UDF (User Defined Function)
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

get_program_id_udf = udf(get_program_id, IntegerType())

# Add the new column to your DataFrame
output_df = output_df.withColumn("program_id", get_program_id_udf(col("program")))

# Show the results
output_df.select("srcode", "yearsem", "grade_numeric", "program", "program_id").show(5)

# Optionally verify the mapping
output_df.groupBy("program", "program_id").count().orderBy("program_id").show(500, truncate=False)

+------+---------------+-------------+--------------------+----------+
|srcode|        yearsem|grade_numeric|             program|program_id|
+------+---------------+-------------+--------------------+----------+
| 20001|2023-2024-FIRST|         1.50|Bachelor of Techn...|        21|
| 20001|2023-2024-FIRST|         1.50|Bachelor of Techn...|        21|
| 20001|2023-2024-FIRST|         1.75|Bachelor of Techn...|        21|
| 20001|2023-2024-FIRST|         1.50|Bachelor of Techn...|        21|
| 20001|2023-2024-FIRST|         2.25|Bachelor of Techn...|        21|
+------+---------------+-------------+--------------------+----------+
only showing top 5 rows

+---------------------------------------------------------------------------------+----------+------+
|program                                                                          |program_id|count |
+---------------------------------------------------------------------------------+----------+------+
|Bachelor of Arts  in Communic

In [10]:
from pyspark.sql.functions import avg, row_number, dense_rank
from pyspark.sql.window import Window

# First, calculate average grades per student per yearsem
avg_grades_df = output_df.groupBy("srcode", "yearsem", "program_id") \
    .agg(avg("grade_numeric").alias("sem_average"))

# Create a window spec to generate sequential semester numbers
window_spec = Window.partitionBy("srcode").orderBy("yearsem")

# Add sequential semester numbers and select final columns
final_df = avg_grades_df \
    .withColumn("semester", dense_rank().over(window_spec)) \
    .select("srcode", "semester", "sem_average", "program_id") \
    .orderBy("srcode", "semester")

# Show the results
final_df.show(500)

+------+--------+-----------+----------+
|srcode|semester|sem_average|program_id|
+------+--------+-----------+----------+
| 20001|       1|   2.075000|        21|
| 20001|       2|   2.175000|        21|
| 20001|       3|   2.416667|        21|
| 20001|       4|   1.775000|        21|
| 20002|       1|   1.611111|        36|
| 20003|       1|   1.475000|        20|
| 20003|       2|   1.400000|        20|
| 20003|       3|   1.500000|        20|
| 20003|       4|   1.472222|        20|
| 20003|       5|   1.343750|        20|
| 20003|       6|   1.250000|        20|
| 20003|       7|   1.062500|        20|
| 20004|       1|   1.857143|        19|
| 20005|       1|   1.875000|        62|
| 20006|       1|   1.583333|        32|
| 20006|       2|   1.333333|        32|
| 20006|       3|   1.535714|        32|
| 20006|       4|   1.464286|        32|
| 20006|       5|   2.035714|        32|
| 20006|       6|   1.958333|        32|
| 20006|       7|   2.166667|        32|
| 20007|       1

In [11]:
final_df.show(20)

+------+--------+-----------+----------+
|srcode|semester|sem_average|program_id|
+------+--------+-----------+----------+
| 20001|       1|   2.075000|        21|
| 20001|       2|   2.175000|        21|
| 20001|       3|   2.416667|        21|
| 20001|       4|   1.775000|        21|
| 20002|       1|   1.611111|        36|
| 20003|       1|   1.475000|        20|
| 20003|       2|   1.400000|        20|
| 20003|       3|   1.500000|        20|
| 20003|       4|   1.472222|        20|
| 20003|       5|   1.343750|        20|
| 20003|       6|   1.250000|        20|
| 20003|       7|   1.062500|        20|
| 20004|       1|   1.857143|        19|
| 20005|       1|   1.875000|        62|
| 20006|       1|   1.583333|        32|
| 20006|       2|   1.333333|        32|
| 20006|       3|   1.535714|        32|
| 20006|       4|   1.464286|        32|
| 20006|       5|   2.035714|        32|
| 20006|       6|   1.958333|        32|
+------+--------+-----------+----------+
only showing top

In [12]:
'''from pyspark.sql.functions import monotonically_increasing_id, current_timestamp
from pyspark.sql.window import Window

# Method 1: Using monotonically_increasing_id()
final_df = final_df.withColumn("id", monotonically_increasing_id())


final_df_df =   final_df.withColumn("ts", current_timestamp())
final_df.show()'''

'from pyspark.sql.functions import monotonically_increasing_id, current_timestamp\nfrom pyspark.sql.window import Window\n\n# Method 1: Using monotonically_increasing_id()\nfinal_df = final_df.withColumn("id", monotonically_increasing_id())\n\n\nfinal_df_df =   final_df.withColumn("ts", current_timestamp())\nfinal_df.show()'

In [13]:
final_df.show(20)

+------+--------+-----------+----------+
|srcode|semester|sem_average|program_id|
+------+--------+-----------+----------+
| 20001|       1|   2.075000|        21|
| 20001|       2|   2.175000|        21|
| 20001|       3|   2.416667|        21|
| 20001|       4|   1.775000|        21|
| 20002|       1|   1.611111|        36|
| 20003|       1|   1.475000|        20|
| 20003|       2|   1.400000|        20|
| 20003|       3|   1.500000|        20|
| 20003|       4|   1.472222|        20|
| 20003|       5|   1.343750|        20|
| 20003|       6|   1.250000|        20|
| 20003|       7|   1.062500|        20|
| 20004|       1|   1.857143|        19|
| 20005|       1|   1.875000|        62|
| 20006|       1|   1.583333|        32|
| 20006|       2|   1.333333|        32|
| 20006|       3|   1.535714|        32|
| 20006|       4|   1.464286|        32|
| 20006|       5|   2.035714|        32|
| 20006|       6|   1.958333|        32|
+------+--------+-----------+----------+
only showing top

In [14]:
'''db_name = "persem_db"
table_name = "persem_dataframe"
program_name = 'BSIT'
path = f"{warehouse_path}/{db_name}/{table_name}/{program_name}"

# Define Hudi write options
hudi_options = {
    "hoodie.table.name": f"{table_name}",
    "hoodie.datasource.write.recordkey.field": "id",  
    "hoodie.datasource.write.operation": "insert",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
}

filtered_df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save(path)'''

'db_name = "persem_db"\ntable_name = "persem_dataframe"\nprogram_name = \'BSIT\'\npath = f"{warehouse_path}/{db_name}/{table_name}/{program_name}"\n\n# Define Hudi write options\nhudi_options = {\n    "hoodie.table.name": f"{table_name}",\n    "hoodie.datasource.write.recordkey.field": "id",  \n    "hoodie.datasource.write.operation": "insert",\n    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"\n}\n\nfiltered_df.write.format("hudi")     .options(**hudi_options)     .mode("overwrite")     .save(path)'

In [15]:
'''# If you have program_id column:
filtered_df = final_df.filter(col("program_id") == 53)
filtered_df.show(50)'''

'# If you have program_id column:\nfiltered_df = final_df.filter(col("program_id") == 53)\nfiltered_df.show(50)'

In [16]:
filtered_df.count()

1949557

In [17]:
# If you want a single CSV file instead of multiple parts
'''filtered_df.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/bsit_student_semester_data_single.csv")'''

'filtered_df.coalesce(1)     .write     .option("header", "true")     .mode("overwrite")     .csv("C:/tmp/spark_warehouse/bsit_student_semester_data_single.csv")'

In [18]:
'''db_name = "persem_db"
table_name = "persem_dataframe"
program_name = 'BSIT'
path = f"{warehouse_path}/{db_name}/{table_name}/{program_name}"

# Define Hudi write options
hudi_options = {
    "hoodie.table.name": f"{table_name}",
    "hoodie.datasource.write.recordkey.field": "id",  
    "hoodie.datasource.write.operation": "insert",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
}

filtered_df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save(path)'''

'db_name = "persem_db"\ntable_name = "persem_dataframe"\nprogram_name = \'BSIT\'\npath = f"{warehouse_path}/{db_name}/{table_name}/{program_name}"\n\n# Define Hudi write options\nhudi_options = {\n    "hoodie.table.name": f"{table_name}",\n    "hoodie.datasource.write.recordkey.field": "id",  \n    "hoodie.datasource.write.operation": "insert",\n    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"\n}\n\nfiltered_df.write.format("hudi")     .options(**hudi_options)     .mode("overwrite")     .save(path)'

In [19]:
from pyspark.sql.functions import count, max, desc

def get_common_semesters(df):
    """
    Get the most common maximum semester number taken by students in each program.
    Returns a DataFrame with columns: program_id, program, common_max_semester, student_count
    """
    # First get the max semester per student per program
    student_max_semesters = df.groupBy("srcode", "program_id") \
        .agg(max("semester").alias("max_semester"))
    
    # Then find the most common max semester per program by counting occurrences
    program_semester_stats = student_max_semesters.groupBy("program_id", "max_semester") \
        .agg(count("*").alias("semester_count")) \
        .orderBy(col("semester_count").desc()) \
        .groupBy("program_id") \
        .agg(
            max("max_semester").alias("common_max_semester"),
            max("semester_count").alias("student_count")
        )
    
    # Join with program names for better readability
    program_names_df = output_df.select("program", "program_id").distinct()
    final_stats = program_semester_stats.join(
        program_names_df,
        "program_id",
        "left"
    ).orderBy("program_id")
    
    return final_stats

# Example usage
semester_stats = get_common_semesters(final_df)
semester_stats.show(500, truncate=False)

+----------+-------------------+-------------+---------------------------------------------------------------------------------+
|program_id|common_max_semester|student_count|program                                                                          |
+----------+-------------------+-------------+---------------------------------------------------------------------------------+
|1         |13                 |433          |Bachelor of Arts  in Communication                                               |
|2         |10                 |142          |Bachelor of Arts in English Language Studies                                     |
|3         |1                  |221          |Bachelor of Automotive Engineering Technology                                    |
|4         |1                  |200          |Bachelor of Civil Engineering Technology                                         |
|5         |1                  |403          |Bachelor of Computer Engineering Technology        

In [20]:
# First get the max semester per program
program_max_semesters = final_df.groupBy("program_id") \
    .agg(max("semester").alias("program_max_semester"))

# Add the program_max_semester column to final_df
final_df_with_max = final_df.join(
    program_max_semesters,
    ["program_id"],
    "left"
)

# Show the results with the new column
final_df_with_max.select(
    "srcode", 
    "semester", 
    "sem_average", 
    "program_id", 
    "program_max_semester"
).show(500)

+------+--------+-----------+----------+--------------------+
|srcode|semester|sem_average|program_id|program_max_semester|
+------+--------+-----------+----------+--------------------+
| 20008|       1|   2.031250|        65|                  11|
| 20008|       2|   1.812500|        65|                  11|
| 20008|       3|   2.527778|        65|                  11|
| 20008|       4|   2.464286|        65|                  11|
| 20008|       5|   2.875000|        65|                  11|
| 20038|       1|   1.500000|        65|                  11|
| 20038|       2|   1.593750|        65|                  11|
| 20038|       3|   1.972222|        65|                  11|
| 20038|       4|   2.142857|        65|                  11|
| 20038|       5|   1.916667|        65|                  11|
| 20048|       1|   1.656250|        65|                  11|
| 20048|       2|   1.906250|        65|                  11|
| 20048|       3|   1.750000|        65|                  11|
| 20048|

In [21]:
final_df_with_max.show(500)

+----------+------+--------+-----------+--------------------+
|program_id|srcode|semester|sem_average|program_max_semester|
+----------+------+--------+-----------+--------------------+
|        21| 20001|       1|   2.075000|                  12|
|        21| 20001|       2|   2.175000|                  12|
|        21| 20001|       3|   2.416667|                  12|
|        21| 20001|       4|   1.775000|                  12|
|        36| 20002|       1|   1.611111|                  15|
|        20| 20003|       1|   1.475000|                  14|
|        20| 20003|       2|   1.400000|                  14|
|        20| 20003|       3|   1.500000|                  14|
|        20| 20003|       4|   1.472222|                  14|
|        20| 20003|       5|   1.343750|                  14|
|        20| 20003|       6|   1.250000|                  14|
|        20| 20003|       7|   1.062500|                  14|
|        19| 20004|       1|   1.857143|                   8|
|       

In [22]:
persem_df = final_df_with_max.select("srcode", "semester", "sem_average", "program_id")


In [23]:
persem_df.show(500)

+------+--------+-----------+----------+
|srcode|semester|sem_average|program_id|
+------+--------+-----------+----------+
| 20008|       1|   2.031250|        65|
| 20008|       2|   1.812500|        65|
| 20008|       3|   2.527778|        65|
| 20008|       4|   2.464286|        65|
| 20008|       5|   2.875000|        65|
| 20038|       1|   1.500000|        65|
| 20038|       2|   1.593750|        65|
| 20038|       3|   1.972222|        65|
| 20038|       4|   2.142857|        65|
| 20038|       5|   1.916667|        65|
| 20048|       1|   1.656250|        65|
| 20048|       2|   1.906250|        65|
| 20048|       3|   1.750000|        65|
| 20048|       4|   1.607143|        65|
| 20048|       5|   2.000000|        65|
| 20048|       6|   2.000000|        65|
| 20048|       7|   1.000000|        65|
| 20049|       1|   1.500000|        65|
| 20049|       2|   1.687500|        65|
| 20049|       3|   1.527778|        65|
| 20064|       1|   1.687500|        65|
| 20064|       2

In [32]:
# Import required functions
from pyspark.sql.functions import current_timestamp, monotonically_increasing_id

# Add a timestamp column for Hudi
#persem_df = persem_df.withColumn("ts", current_timestamp())

# Generate a unique identifier
persem_df = persem_df.withColumn("uuid", monotonically_increasing_id())



In [33]:
persem_df.show(100)

+------+--------+-----------+----------+----+--------------------+
|srcode|semester|sem_average|program_id|uuid|                  ts|
+------+--------+-----------+----------+----+--------------------+
| 20385|       1|   1.916667|        31|   0|2025-03-12 13:58:...|
| 20385|       2|   2.000000|        31|   1|2025-03-12 13:58:...|
| 20385|       3|   1.750000|        31|   2|2025-03-12 13:58:...|
| 20385|       4|   2.500000|        31|   3|2025-03-12 13:58:...|
| 20385|       5|   2.107143|        31|   4|2025-03-12 13:58:...|
| 20385|       6|   1.250000|        31|   5|2025-03-12 13:58:...|
| 20385|       7|   1.562500|        31|   6|2025-03-12 13:58:...|
| 23565|       1|   1.944444|        31|   7|2025-03-12 13:58:...|
| 23565|       2|   1.972222|        31|   8|2025-03-12 13:58:...|
| 23565|       3|   1.916667|        31|   9|2025-03-12 13:58:...|
| 23565|       4|   2.305556|        31|  10|2025-03-12 13:58:...|
| 26196|       1|   1.750000|        31|  11|2025-03-12 13:58:

In [37]:
print(persem_df.rdd.getNumPartitions())  

5


In [38]:
persem_df = persem_df.coalesce(1)

In [39]:
print(persem_df.rdd.getNumPartitions())  

1


In [42]:
# Import required functions
db_name = "finalforecasting_db"
table_name = "avg_persem_allprograms_dataframe"
path = f"{warehouse_path}/{db_name}/{table_name}"

# Define Hudi write options
hudi_options = {
    "hoodie.table.name": f"{table_name}",
    "hoodie.datasource.write.recordkey.field": "uuid",  
    'hoodie.datasource.write.operation': 'bulk_insert',
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
}

persem_df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save(path)

In [43]:
from pyspark.sql.functions import avg, count, round

# Group by srcode and calculate averages and semester count
student_summary = final_df_with_max.groupBy("srcode", "program_id", "program_max_semester") \
    .agg(
        avg("sem_average").alias("overall_average"),
        count("semester").alias("semesters_taken")
    ) \
    .orderBy("srcode")

# Show the results
student_summary.select(
    "srcode",
    "semesters_taken",
    "overall_average",
    "program_id",
    "program_max_semester"
).show(500)

+------+---------------+---------------+----------+--------------------+
|srcode|semesters_taken|overall_average|program_id|program_max_semester|
+------+---------------+---------------+----------+--------------------+
| 20001|              4|   2.1104167500|        21|                  12|
| 20002|              1|   1.6111110000|        36|                  15|
| 20003|              7|   1.3576388571|        20|                  14|
| 20004|              1|   1.8571430000|        19|                   8|
| 20005|              1|   1.8750000000|        62|                  11|
| 20006|              7|   1.7253400000|        32|                  15|
| 20007|              1|   2.0000000000|         5|                   1|
| 20008|              5|   2.3421628000|        65|                  11|
| 20009|              7|   1.5954364286|        20|                  14|
| 20010|              4|   1.8932292500|        38|                  15|
| 20011|              7|   1.5090910000|        63|

In [44]:
# Create new DataFrame with selected columns
student_summary_simplified = student_summary.select(
    "srcode",
    "semesters_taken",
    "overall_average",
    "program_id"
)

# Show the results
student_summary_simplified.show(500)

+------+---------------+---------------+----------+
|srcode|semesters_taken|overall_average|program_id|
+------+---------------+---------------+----------+
| 20001|              4|   2.1104167500|        21|
| 20002|              1|   1.6111110000|        36|
| 20003|              7|   1.3576388571|        20|
| 20004|              1|   1.8571430000|        19|
| 20005|              1|   1.8750000000|        62|
| 20006|              7|   1.7253400000|        32|
| 20007|              1|   2.0000000000|         5|
| 20008|              5|   2.3421628000|        65|
| 20009|              7|   1.5954364286|        20|
| 20010|              4|   1.8932292500|        38|
| 20011|              7|   1.5090910000|        63|
| 20012|              3|   2.0363426667|        13|
| 20013|              5|   1.7130952000|        32|
| 20014|              3|   2.0032406667|        13|
| 20015|              7|   1.9880954286|        32|
| 20016|              4|   2.5781250000|        43|
| 20017|    

In [46]:
# Import required functions
from pyspark.sql.functions import current_timestamp

# Add a timestamp column for Hudi
filtered_df = filtered_df.withColumn("ts", current_timestamp())

# Import required functions
db_name = "finalforecastiing_db"
table_name = "oneforall_dataframe"
path = f"{warehouse_path}/{db_name}/{table_name}"

# Define Hudi write options
hudi_options = {
    "hoodie.table.name": f"{table_name}",
    "hoodie.datasource.write.recordkey.field": "srcode",  
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'ts',
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
}

filtered_df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save(path)

NameError: name 'filtered_df' is not defined

In [56]:
program_dict

{1: 'Bachelor of Arts  in Communication',
 2: 'Bachelor of Arts in English Language Studies',
 3: 'Bachelor of Automotive Engineering Technology',
 4: 'Bachelor of Civil Engineering Technology',
 5: 'Bachelor of Computer Engineering Technology',
 6: 'Bachelor of Drafting Engineering Technology',
 7: 'Bachelor of Early Childhood Education',
 8: 'Bachelor of Electrical Engineering Technology',
 9: 'Bachelor of Electronics Engineering Technology',
 10: 'Bachelor of Elementary Education',
 11: 'Bachelor of Fine Arts and Design',
 12: 'Bachelor of Food Engineering Technology',
 13: 'Bachelor of Industrial Technology',
 14: 'Bachelor of Instrumentation and Control Engineering Technology',
 15: 'Bachelor of Laws',
 16: 'Bachelor of Mechanical Engineering Technology',
 17: 'Bachelor of Mechatronics Engineering Technology',
 18: 'Bachelor of Physical Education',
 19: 'Bachelor of Public Administration',
 20: 'Bachelor of Secondary Education',
 21: 'Bachelor of Technical-Vocational Teacher Educa

In [None]:
final_df_with_max.count()

271335

In [30]:
# If you want a single CSV file instead of multiple parts
final_df_with_max.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/student_semester_data_single.csv")

In [20]:
# Save DataFrame as CSV
final_df_with_max.write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/multiple_files/student_semester_data.csv")

In [16]:
from pyspark.sql import SparkSession

# Create Spark session with Hudi support
spark = SparkSession.builder \
    .appName("Student Data Analysis") \
    .config("spark.jars.packages", 
            "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.12.2") \
    .config("spark.serializer", 
            "org.apache.spark.serializer.KryoSerializer") \
    .getOrCreate()

In [31]:
# Save as multiple JSON files (better for large datasets)
final_df_with_max.write \
    .mode("overwrite") \
    .json("C:/tmp/spark_warehouse/multiple_files/student_semester_data.json")

In [18]:
# Import required functions
from pyspark.sql.functions import current_timestamp

# Add a timestamp column for Hudi
final_df_with_max = final_df_with_max.withColumn("ts", current_timestamp())

# Define Hudi options
hudiOptions = {
    'hoodie.table.name': 'student_semester_data',
    'hoodie.datasource.write.recordkey.field': 'srcode',
    'hoodie.datasource.write.partitionpath.field': 'program_id',
    'hoodie.datasource.write.table.name': 'student_semester_data',
    'hoodie.datasource.write.operation': 'upsert',
    'hoodie.datasource.write.precombine.field': 'ts',
    'hoodie.upsert.shuffle.parallelism': 2,
    'hoodie.insert.shuffle.parallelism': 2
}

# Write to Hudi with modified format name
final_df_with_max.write \
    .format("hudi") \
    .options(**hudiOptions) \
    .mode("overwrite") \
    .save("C:/tmp/spark_warehouse/student_semester_data")

Py4JJavaError: An error occurred while calling o369.save.
: org.apache.hudi.exception.HoodieException: Unable to create org.apache.hudi.storage.hadoop.HoodieHadoopStorage
	at org.apache.hudi.storage.HoodieStorageUtils.getStorage(HoodieStorageUtils.java:44)
	at org.apache.hudi.common.table.HoodieTableMetaClient.createTableLayoutOnStorage(HoodieTableMetaClient.java:614)
	at org.apache.hudi.common.table.HoodieTableMetaClient$TableBuilder.initTable(HoodieTableMetaClient.java:1508)
	at org.apache.hudi.common.table.HoodieTableMetaClient$TableBuilder.initTable(HoodieTableMetaClient.java:1503)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.writeInternal(HoodieSparkSqlWriter.scala:333)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.$anonfun$write$1(HoodieSparkSqlWriter.scala:192)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.adapter.BaseSpark3Adapter.sqlExecutionWithNewExecutionId(BaseSpark3Adapter.scala:105)
	at org.apache.hudi.HoodieSparkSqlWriterInternal.write(HoodieSparkSqlWriter.scala:214)
	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:129)
	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:170)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.hudi.exception.HoodieException: Unable to instantiate class org.apache.hudi.storage.hadoop.HoodieHadoopStorage
	at org.apache.hudi.common.util.ReflectionUtils.loadClass(ReflectionUtils.java:75)
	at org.apache.hudi.storage.HoodieStorageUtils.getStorage(HoodieStorageUtils.java:41)
	... 55 more
Caused by: java.lang.reflect.InvocationTargetException
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
	at java.base/jdk.internal.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
	at java.base/java.lang.reflect.Constructor.newInstance(Constructor.java:490)
	at org.apache.hudi.common.util.ReflectionUtils.loadClass(ReflectionUtils.java:73)
	... 56 more
Caused by: org.apache.hudi.exception.HoodieIOException: Failed to get instance of org.apache.hadoop.fs.FileSystem
	at org.apache.hudi.hadoop.fs.HadoopFSUtils.getFs(HadoopFSUtils.java:128)
	at org.apache.hudi.hadoop.fs.HadoopFSUtils.getFs(HadoopFSUtils.java:119)
	at org.apache.hudi.storage.hadoop.HoodieHadoopStorage.<init>(HoodieHadoopStorage.java:64)
	... 61 more
Caused by: java.io.IOException: No FileSystem for scheme: C
	at org.apache.hadoop.fs.FileSystem.getFileSystemClass(FileSystem.java:2660)
	at org.apache.hadoop.fs.FileSystem.createFileSystem(FileSystem.java:2667)
	at org.apache.hadoop.fs.FileSystem.access$200(FileSystem.java:94)
	at org.apache.hadoop.fs.FileSystem$Cache.getInternal(FileSystem.java:2703)
	at org.apache.hadoop.fs.FileSystem$Cache.get(FileSystem.java:2685)
	at org.apache.hadoop.fs.FileSystem.get(FileSystem.java:373)
	at org.apache.hadoop.fs.Path.getFileSystem(Path.java:295)
	at org.apache.hudi.hadoop.fs.HadoopFSUtils.getFs(HadoopFSUtils.java:126)
	... 63 more


In [None]:
# To read the Hudi table back
hudi_df = spark.read \
    .format("hudi") \
    .load("C:/tmp/spark_warehouse/student_semester_data")