In [1]:
from pyspark.sql import SparkSession

warehouse_path = "file:///C:/tmp/spark_warehouse"

jdbc_url = "jdbc:postgresql://192.168.20.11:5432/demo_db"
properties = {
    "user": "postgres", 
    "password": "postgres",  
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000"
}

#DOWNLOAD FROM ORACLE
postgres_driver_path = "C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """ Extract data from PostgreSQL database using Spark."""
    # Initialize Spark session
    spark = SparkSession.builder \
        .master("local[*]") \
        .appName("Hudi Batch Write") \
        .config("spark.jars", postgres_driver_path) \
        .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0") \
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer") \
        .config("spark.sql.hive.convertMetastoreParquet", "false") \
        .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension") \
        .config("spark.driver.memory", "4g") \
        .config("spark.executor.memory", "4g") \
        .config("spark.executor.memoryOverhead", "1g") \
        .config("spark.driver.memoryOverhead", "1g") \
        .config("spark.sql.warehouse.dir", warehouse_path) \
        .getOrCreate()

    # Extracts data from PostgreSQL database
    df = spark.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=properties,
    )
    
    return df, spark

# Extract data once
raw_df, spark = extract(jdbc_url, "filtered_data_with_id", properties, postgres_driver_path)

# Check the number of rows
row_count = raw_df.count()
print(f'Number of rows: {row_count}')

# Print schema
raw_df.printSchema()

Number of rows: 1999679
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- credits: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- college: string (nullable = true)
 |-- program: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(5,2) (nullable = true)
 |-- grade_classification: string (nullable = true)



In [2]:
from pyspark.sql.functions import col, count, max, row_number
from pyspark.sql.window import Window

def remove_previous_programs(df):
    #Remove records of previous programs for shifters.
    
    # Create a window spec partitioned by srcode ordered by schoolyear desc
    window_spec = Window.partitionBy("srcode").orderBy(col("schoolyear").desc())
    
    # Get the most recent program for each student
    latest_programs = df.withColumn("row_number", row_number().over(window_spec)) \
        .filter(col("row_number") == 1) \
        .select("srcode", "program", "schoolyear") \
        .withColumnRenamed("srcode", "latest_srcode") \
        .withColumnRenamed("program", "latest_program") \
        .withColumnRenamed("schoolyear", "latest_schoolyear")
    
    # Join with original dataframe to keep only records with the most recent program
    filtered_df = df.join(
        latest_programs,
        (df.srcode == latest_programs.latest_srcode) & 
        (df.program == latest_programs.latest_program),
        "inner"
    ).drop("latest_srcode", "latest_program", "latest_schoolyear")
    
    return filtered_df

cleaned_df = remove_previous_programs(raw_df)

In [3]:
from pyspark.sql.functions import split, col, trim, when

# Get distinct schoolyear values
distinct_years_df = raw_df.select("schoolyear").distinct()

# Optionally, show all distinct school years
distinct_years_df.show(truncate=False)

# Extract the starting year from the 'schoolyear' column
# Assumes format like "2021-2022"
distinct_years_df = distinct_years_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))

# Filter for school years starting from 2006 (or later)
#valid_years_df = distinct_years_df.filter(col("start_year") >= 2006)

valid_years_df = distinct_years_df.orderBy("start_year")

# Collect the valid schoolyear values as a list
valid_schoolyears = [row.schoolyear for row in valid_years_df.collect()]
print("Valid schoolyears:", valid_schoolyears)

+----------+
|schoolyear|
+----------+
|2022-2023 |
|2021-2022 |
|2011-2012 |
|2012-2013 |
|2013-2014 |
|2010-2011 |
|2016-2017 |
|2014-2015 |
|2007-2008 |
|2006-2007 |
|2017-2018 |
|2019-2020 |
|2018-2019 |
|2023-2024 |
|2009-2010 |
|2024-2025 |
|2008-2009 |
|2020-2021 |
|2015-2016 |
+----------+

Valid schoolyears: ['2006-2007', '2007-2008', '2008-2009', '2009-2010', '2010-2011', '2011-2012', '2012-2013', '2013-2014', '2014-2015', '2015-2016', '2016-2017', '2017-2018', '2018-2019', '2019-2020', '2020-2021', '2021-2022', '2022-2023', '2023-2024', '2024-2025']


In [4]:
from pyspark.sql.functions import split, col, when, concat_ws

# Define the semesters you're interested in
semesters = ["FIRST", "SECOND", "SUMMER"]

# Filter the DataFrame on the valid schoolyears, semesters, and ensure grade_numeric is not null
filtered_df = cleaned_df.filter(
    (col("schoolyear").isin(valid_schoolyears)) &
    (col("semester").isin(semesters)) &
    (col("grade_numeric").isNotNull())
)

# Extract the starting year from the 'schoolyear' column (assumes format "2021-2022")
filtered_df = filtered_df.withColumn("start_year", split(col("schoolyear"), "-")[0].cast("int"))

# Create a custom order for the semester column
filtered_df = filtered_df.withColumn(
    "sem_order",
    when(col("semester") == "FIRST", 1)
    .when(col("semester") == "SECOND", 2)
    .when(col("semester") == "SUMMER", 3)
)

# Create the 'yearsem' column by concatenating schoolyear and semester (e.g., "2021-2022-FIRST")
filtered_df = filtered_df.withColumn("yearsem", concat_ws("-", col("schoolyear"), col("semester")))

# Order by start_year and then by sem_order to get FIRST, SECOND, SUMMER in that order for each year
output_df = filtered_df.select("srcode", "yearsem", "grade_numeric", "program", "credits").orderBy("srcode","start_year", "sem_order", "description")

# Show a preview of the results
output_df.show(500, truncate=False)

+------+----------------+-------------+------------------------------------------------------------+-------+
|srcode|yearsem         |grade_numeric|program                                                     |credits|
+------+----------------+-------------+------------------------------------------------------------+-------+
|100005|2020-2021-FIRST |1.00         |Master of Arts in Education major in Educational Management |3      |
|100005|2020-2021-FIRST |1.25         |Master of Arts in Education major in Educational Management |3      |
|100005|2020-2021-FIRST |1.00         |Master of Arts in Education major in Educational Management |3      |
|100005|2020-2021-SECOND|1.00         |Master of Arts in Education major in Educational Management |3      |
|100005|2020-2021-SECOND|1.00         |Master of Arts in Education major in Educational Management |3      |
|100005|2020-2021-SECOND|1.25         |Master of Arts in Education major in Educational Management |3      |
|100005|2021-2022-F

In [5]:
# First, read the CSV file with program mappings
program_mapping_df = spark.read.csv("C:\LEONAIDAS\program_with_id.csv", header=True)

# Convert DataFrame to dictionary with program_id as key (convert to int)
program_dict = {int(row['program_id']): row['program'] for row in program_mapping_df.collect()}
program_dict[101] = ["Master of Arts in Education major in Pagtuturo ng Filipino", "Master of Arts in Education major in Filipino"]

# Create a reverse dictionary for lookup, handling multiple program names
reverse_program_dict = {}
for program_id, program_name in program_dict.items():
    if isinstance(program_name, list):
        # If program_name is a list, add each name separately
        for name in program_name:
            reverse_program_dict[name] = program_id
    else:
        # Single program name
        reverse_program_dict[program_name] = program_id

# Add the special case for program_id 101
program_names_101 = [
    "Master of Arts in Education major in Pagtuturo ng Filipino",
    "Master of Arts in Education major in Filipino"
]
for name in program_names_101:
    reverse_program_dict[name] = 101

# Create a mapping function
def get_program_id(program):
    return reverse_program_dict.get(program)

# Register the UDF (User Defined Function)
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

get_program_id_udf = udf(get_program_id, IntegerType())

# Add the new column to your DataFrame
output_df = output_df.withColumn("program_id", get_program_id_udf(col("program")))

# Show the results
output_df.select("srcode", "yearsem", "grade_numeric", "program", "program_id").show(5)

# Optionally verify the mapping
output_df.groupBy("program", "program_id").count().orderBy("program_id").show(500, truncate=False)

+------+----------------+-------------+--------------------+----------+
|srcode|         yearsem|grade_numeric|             program|program_id|
+------+----------------+-------------+--------------------+----------+
|100005| 2020-2021-FIRST|         1.00|Master of Arts in...|        98|
|100005| 2020-2021-FIRST|         1.25|Master of Arts in...|        98|
|100005| 2020-2021-FIRST|         1.00|Master of Arts in...|        98|
|100005|2020-2021-SECOND|         1.00|Master of Arts in...|        98|
|100005|2020-2021-SECOND|         1.00|Master of Arts in...|        98|
+------+----------------+-------------+--------------------+----------+
only showing top 5 rows

+---------------------------------------------------------------------------------+----------+------+
|program                                                                          |program_id|count |
+---------------------------------------------------------------------------------+----------+------+
|Bachelor of Arts  in

In [6]:
program_dict

{1: 'Bachelor of Arts  in Communication',
 2: 'Bachelor of Arts in English Language Studies',
 3: 'Bachelor of Automotive Engineering Technology',
 4: 'Bachelor of Civil Engineering Technology',
 5: 'Bachelor of Computer Engineering Technology',
 6: 'Bachelor of Drafting Engineering Technology',
 7: 'Bachelor of Early Childhood Education',
 8: 'Bachelor of Electrical Engineering Technology',
 9: 'Bachelor of Electronics Engineering Technology',
 10: 'Bachelor of Elementary Education',
 11: 'Bachelor of Fine Arts and Design',
 12: 'Bachelor of Food Engineering Technology',
 13: 'Bachelor of Industrial Technology',
 14: 'Bachelor of Instrumentation and Control Engineering Technology',
 15: 'Bachelor of Laws',
 16: 'Bachelor of Mechanical Engineering Technology',
 17: 'Bachelor of Mechatronics Engineering Technology',
 18: 'Bachelor of Physical Education',
 19: 'Bachelor of Public Administration',
 20: 'Bachelor of Secondary Education',
 21: 'Bachelor of Technical-Vocational Teacher Educa

In [11]:
from pyspark.sql.functions import sum, col, round

# Calculate weighted average per student
weighted_avg_df = output_df.groupBy("srcode", "program_id") \
    .agg(
        sum(col("grade_numeric") * col("credits")).alias("weighted_sum"),
        sum("credits").alias("total_credits"),
        count("yearsem").alias("total_semesters")
    ) \
    .withColumn(
        "final_gwa",
        col("weighted_sum") / col("total_credits")
    ) \
    .select(
        "srcode",
        "total_semesters",
        round(col("final_gwa"), 4).alias("final_gwa"),
        "program_id",
        
    ) \
    .orderBy("srcode")

# Show the results
weighted_avg_df.show(500)

+------+---------------+---------+----------+
|srcode|total_semesters|final_gwa|program_id|
+------+---------------+---------+----------+
|100005|             10|   1.0750|        98|
|100010|             72|   2.7275|        51|
|100016|             50|   2.0903|        65|
|100108|             12|   1.3542|       105|
|100113|             45|   2.3019|        65|
|100143|             13|   1.2692|       101|
|100196|             52|   2.4503|        41|
|100200|             57|   2.0045|        10|
|100207|             51|   1.4032|        30|
|100216|             46|   2.5940|        53|
|100243|             58|   2.2738|        27|
|100252|             56|   1.5235|        66|
|100283|             14|   1.3750|        99|
|100312|             50|   2.6813|        35|
|100322|             62|   1.9334|        62|
|100332|             34|   1.7000|        41|
|100350|             66|   1.8698|        62|
|100399|             16|   3.2353|        56|
|100416|              8|   1.4688|

In [12]:

weighted_avg_df.show()

+------+---------------+---------+----------+
|srcode|total_semesters|final_gwa|program_id|
+------+---------------+---------+----------+
|100005|             10|   1.0750|        98|
|100010|             72|   2.7275|        51|
|100016|             50|   2.0903|        65|
|100108|             12|   1.3542|       105|
|100113|             45|   2.3019|        65|
|100143|             13|   1.2692|       101|
|100196|             52|   2.4503|        41|
|100200|             57|   2.0045|        10|
|100207|             51|   1.4032|        30|
|100216|             46|   2.5940|        53|
|100243|             58|   2.2738|        27|
|100252|             56|   1.5235|        66|
|100283|             14|   1.3750|        99|
|100312|             50|   2.6813|        35|
|100322|             62|   1.9334|        62|
|100332|             34|   1.7000|        41|
|100350|             66|   1.8698|        62|
|100399|             16|   3.2353|        56|
|100416|              8|   1.4688|

SAVE TO HUDI

In [17]:
db_name = "oneforall_db"
table_name = "oneforall_dataframe"

path = f"{warehouse_path}/{db_name}/{table_name}"

# Define Hudi write options
hudi_options = {
    "hoodie.table.name": f"{table_name}",
    "hoodie.datasource.write.recordkey.field": "srcode",  
    "hoodie.datasource.write.operation": "insert",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
}

weighted_avg_df.write.format("hudi") \
    .options(**hudi_options) \
    .mode("overwrite") \
    .save(path)


Py4JJavaError: An error occurred while calling o332.save.
: org.apache.hudi.exception.HoodieUpsertException: Failed to upsert for commit time 20250228144744192
	at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:74)
	at org.apache.hudi.table.action.commit.SparkUpsertCommitActionExecutor.execute(SparkUpsertCommitActionExecutor.java:44)
	at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:114)
	at org.apache.hudi.table.HoodieSparkCopyOnWriteTable.upsert(HoodieSparkCopyOnWriteTable.java:103)
	at org.apache.hudi.client.SparkRDDWriteClient.upsert(SparkRDDWriteClient.java:142)
	at org.apache.hudi.DataSourceUtils.doWriteOperation(DataSourceUtils.java:224)
	at org.apache.hudi.HoodieSparkSqlWriter$.writeInternal(HoodieSparkSqlWriter.scala:431)
	at org.apache.hudi.HoodieSparkSqlWriter$.write(HoodieSparkSqlWriter.scala:132)
	at org.apache.hudi.DefaultSource.createRelation(DefaultSource.scala:150)
	at org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand.run(SaveIntoDataSourceCommand.scala:47)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:75)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:73)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:84)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 282.0 failed 1 times, most recent failure: Lost task 0.0 in stage 282.0 (TID 366) (192.168.20.227 executor driver): org.apache.hudi.exception.HoodieException: ts(Part -ts) field not found in record. Acceptable fields were :[srcode, total_semesters, final_gwa, program_id]
	at org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldVal(HoodieAvroUtils.java:601)
	at org.apache.hudi.HoodieCreateRecordUtils$.$anonfun$createHoodieRecordRdd$5(HoodieCreateRecordUtils.scala:144)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:834)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2293)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1021)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:406)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1020)
	at org.apache.spark.api.java.JavaRDDLike.collect(JavaRDDLike.scala:362)
	at org.apache.spark.api.java.JavaRDDLike.collect$(JavaRDDLike.scala:361)
	at org.apache.spark.api.java.AbstractJavaRDDLike.collect(JavaRDDLike.scala:45)
	at org.apache.hudi.data.HoodieJavaRDD.collectAsList(HoodieJavaRDD.java:177)
	at org.apache.hudi.index.simple.HoodieSimpleIndex.fetchRecordLocationsForAffectedPartitions(HoodieSimpleIndex.java:147)
	at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocationInternal(HoodieSimpleIndex.java:118)
	at org.apache.hudi.index.simple.HoodieSimpleIndex.tagLocation(HoodieSimpleIndex.java:91)
	at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:55)
	at org.apache.hudi.table.action.commit.HoodieWriteHelper.tag(HoodieWriteHelper.java:37)
	at org.apache.hudi.table.action.commit.BaseWriteHelper.write(BaseWriteHelper.java:63)
	... 49 more
Caused by: org.apache.hudi.exception.HoodieException: ts(Part -ts) field not found in record. Acceptable fields were :[srcode, total_semesters, final_gwa, program_id]
	at org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldVal(HoodieAvroUtils.java:601)
	at org.apache.hudi.HoodieCreateRecordUtils$.$anonfun$createHoodieRecordRdd$5(HoodieCreateRecordUtils.scala:144)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at scala.collection.Iterator$$anon$10.next(Iterator.scala:461)
	at org.apache.spark.util.collection.ExternalSorter.insertAll(ExternalSorter.scala:199)
	at org.apache.spark.shuffle.sort.SortShuffleWriter.write(SortShuffleWriter.scala:63)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
	at org.apache.spark.scheduler.Task.run(Task.scala:136)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	... 1 more


CONSUME PARQUET FILE

In [16]:
# Path to the Parquet file
parquet_file_path = "C:/tmp/spark_warehouse/oneforall_db/oneforall_dataframe/2a02a5c8-3e44-4459-9c7b-41f5695618d0-0_0-239-320_20250228144058554.parquet"

# Read Parquet file into DataFrame
df = spark.read.parquet(parquet_file_path)

# Show DataFrame schema
df.printSchema()

# Show first few rows of the DataFrame
df.show(5)

root
 |-- _hoodie_commit_time: string (nullable = true)
 |-- _hoodie_commit_seqno: string (nullable = true)
 |-- _hoodie_record_key: string (nullable = true)
 |-- _hoodie_partition_path: string (nullable = true)
 |-- _hoodie_file_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- total_semesters: long (nullable = true)
 |-- final_gwa: decimal(29,4) (nullable = true)
 |-- program_id: integer (nullable = true)

+-------------------+--------------------+------------------+----------------------+--------------------+------+---------------+---------+----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|srcode|total_semesters|final_gwa|program_id|
+-------------------+--------------------+------------------+----------------------+--------------------+------+---------------+---------+----------+
|  20250228144058554|20250228144058554...|            100005|                      |2a02a5c8-3e44-445...|100005|  

SAVE DATAFRAME AS CSV

In [25]:
# Save DataFrame as a single CSV file
weighted_avg_df.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/oneforall_df")


In [23]:
weighted_avg_df.count()

58334

In [None]:
from pyspark.sql.functions import count, max, desc

def get_common_semesters(df):
    """
    Get the most common maximum semester number taken by students in each program.
    Returns a DataFrame with columns: program_id, program, common_max_semester, student_count
    """
    # First get the max semester per student per program
    student_max_semesters = df.groupBy("srcode", "program_id") \
        .agg(max("semester").alias("max_semester"))
    
    # Then find the most common max semester per program by counting occurrences
    program_semester_stats = student_max_semesters.groupBy("program_id", "max_semester") \
        .agg(count("*").alias("semester_count")) \
        .orderBy(col("semester_count").desc()) \
        .groupBy("program_id") \
        .agg(
            max("max_semester").alias("common_max_semester"),
            max("semester_count").alias("student_count")
        )
    
    # Join with program names for better readability
    program_names_df = output_df.select("program", "program_id").distinct()
    final_stats = program_semester_stats.join(
        program_names_df,
        "program_id",
        "left"
    ).orderBy("program_id")
    
    return final_stats

# Example usage
semester_stats = get_common_semesters(final_df)
semester_stats.show(500, truncate=False)

+----------+-------------------+-------------+---------------------------------------------------------------------------------+
|program_id|common_max_semester|student_count|program                                                                          |
+----------+-------------------+-------------+---------------------------------------------------------------------------------+
|1         |13                 |431          |Bachelor of Arts  in Communication                                               |
|2         |10                 |142          |Bachelor of Arts in English Language Studies                                     |
|3         |1                  |222          |Bachelor of Automotive Engineering Technology                                    |
|4         |1                  |200          |Bachelor of Civil Engineering Technology                                         |
|5         |1                  |403          |Bachelor of Computer Engineering Technology        

In [29]:
# First get the max semester per program
program_max_semesters = final_df.groupBy("program_id") \
    .agg(max("semester").alias("program_max_semester"))

# Add the program_max_semester column to final_df
final_df_with_max = final_df.join(
    program_max_semesters,
    ["program_id"],
    "left"
)

# Show the results with the new column
final_df_with_max.select(
    "srcode", 
    "semester", 
    "sem_average", 
    "program_id", 
    "program_max_semester"
).show(500)

+------+--------+-----------+----------+--------------------+
|srcode|semester|sem_average|program_id|program_max_semester|
+------+--------+-----------+----------+--------------------+
|100005|       1|   1.083333|        98|                   7|
|100005|       2|   1.083333|        98|                   7|
|100005|       3|   1.000000|        98|                   7|
|100005|       4|   1.125000|        98|                   7|
|100010|       1|   1.972222|        51|                  11|
|100010|       2|   2.027778|        51|                  11|
|100010|       3|   2.900000|        51|                  11|
|100010|       4|   5.000000|        51|                  11|
|100010|       5|   2.500000|        51|                  11|
|100010|       6|   2.318182|        51|                  11|
|100010|       7|   2.375000|        51|                  11|
|100010|       8|   2.250000|        51|                  11|
|100010|       9|   1.250000|        51|                  11|
|100010|

In [54]:
from pyspark.sql.functions import avg, count, round

# Group by srcode and calculate averages and semester count
student_summary = final_df_with_max.groupBy("srcode", "program_id", "program_max_semester") \
    .agg(
        avg("sem_average").alias("overall_average"),
        count("semester").alias("semesters_taken")
    ) \
    .orderBy("srcode")

# Show the results
student_summary.select(
    "srcode",
    "semesters_taken",
    "overall_average",
    "program_id",
    "program_max_semester"
).show(500)

+------+---------------+---------------+----------+--------------------+
|srcode|semesters_taken|overall_average|program_id|program_max_semester|
+------+---------------+---------------+----------+--------------------+
|100005|              4|   1.0729165000|        98|                   7|
|100010|             10|   2.4968182000|        51|                  11|
|100016|              9|   1.9401234444|        65|                  11|
|100108|              6|   1.3194445000|       105|                   8|
|100113|              7|   2.2450964286|        65|                  11|
|100143|              5|   1.2666666000|       101|                   8|
|100196|              9|   2.2916667778|        41|                  11|
|100200|              8|   1.9079861250|        10|                   9|
|100207|              8|   1.3489458750|        30|                  11|
|100216|              8|   2.7420758750|        53|                  14|
|100243|             10|   2.4040674000|        27|

In [57]:
# Create new DataFrame with selected columns
student_summary_simplified = student_summary.select(
    "srcode",
    "semesters_taken",
    "overall_average",
    "program_id"
)

# Show the results
student_summary_simplified.show(500)

+------+---------------+---------------+----------+
|srcode|semesters_taken|overall_average|program_id|
+------+---------------+---------------+----------+
|100005|              4|   1.0729165000|        98|
|100010|             10|   2.4968182000|        51|
|100016|              9|   1.9401234444|        65|
|100108|              6|   1.3194445000|       105|
|100113|              7|   2.2450964286|        65|
|100143|              5|   1.2666666000|       101|
|100196|              9|   2.2916667778|        41|
|100200|              8|   1.9079861250|        10|
|100207|              8|   1.3489458750|        30|
|100216|              8|   2.7420758750|        53|
|100243|             10|   2.4040674000|        27|
|100252|              7|   1.5599205714|        66|
|100283|              7|   1.3749998571|        99|
|100312|              9|   2.6613756667|        35|
|100322|              8|   1.9345113750|        62|
|100332|              6|   1.7152778333|        41|
|100350|    

In [None]:
final_df_with_max.count()

271335