In [4]:
from kafka import KafkaConsumer
bootstrap_servers = ['localhost:29092']
# This consumer exists only to look up topic metadata, so its broker
# connections are released as soon as the topic names have been fetched
# (later cells rebind `consumer`, which would otherwise leak this one).
consumer = KafkaConsumer(bootstrap_servers=bootstrap_servers)
try:
    available_topics = consumer.topics()
finally:
    consumer.close()
# Last expression of the cell: displays the set of topic names.
available_topics

{'__confluent.support.metrics',
 '_schemas',
 'connect-status',
 'connect_configs',
 'connect_offsets',
 'source.public.grades_streaming',
 'test'}

In [3]:
from scripts.data_cleaners import BaseDataCleaner, AcademicDataCleaner, GradeDataCleaner
import json
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    from_json, col, when, lit, current_timestamp, 
    concat_ws, split, expr
)
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, LongType, 
    FloatType, TimestampType
)
import sys
sys.path.append('etl/scripts')  # Add scripts directory to path
from scripts.data_cleaners import AcademicDataCleaner, BaseDataCleaner

# Column layout of an incoming grades record, in the order the fields are
# declared. Every field is nullable.
_GRADE_COLUMNS = [
    ("id", LongType()),             # bigint → LongType()
    ("schoolyear", StringType()),   # character varying → StringType()
    ("semester", StringType()),
    ("code", StringType()),
    ("description", StringType()),
    ("units", IntegerType()),       # integer → IntegerType()
    ("instructor_id", StringType()),
    ("instructor_name", StringType()),
    ("srcode", StringType()),
    ("fullname", StringType()),
    ("campus", StringType()),
    ("program", StringType()),
    ("major", StringType()),
    ("yearlevel", StringType()),
    ("curriculum", StringType()),
    ("class_section", StringType()),
    ("grade_final", StringType()),
    ("grade_reexam", StringType()),
    ("status", StringType()),
]

schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in _GRADE_COLUMNS
])

# Configure Spark for Hudi.
# The settings follow the Hudi 0.14 Spark quick start: Kryo serialization
# with Hudi's registrar, the Hudi SQL session extension, and the Hudi catalog.
# NOTE: getOrCreate() returns an already-running session if one exists; in
# that case restart the kernel for these settings to take effect.
spark = (SparkSession.builder
         .appName("KafkaToHudiProcessor")
         .config("spark.jars.packages", "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0")
         .config('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
         .config('spark.kryo.registrator', 'org.apache.spark.HoodieSparkKryoRegistrar')
         # The extension class lives in org.apache.spark.sql.hudi; the previous
         # value pointed at a different package, and the separate 'className'
         # key that repeated it is not a Spark or Hudi setting.
         .config('spark.sql.extensions', 'org.apache.spark.sql.hudi.HoodieSparkSessionExtension')
         .config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.hudi.catalog.HoodieCatalog')
         .getOrCreate())

# Define transform function
def transform(df, spark, program_ids_path="C:/LEONAIDAS/program_with_id.csv"):
    """Cleans and processes the extracted data.

    Runs the cleaning helpers imported from ``scripts.data_cleaners``
    (``BaseDataCleaner``, ``AcademicDataCleaner``, ``GradeDataCleaner``) in a
    fixed order and finally stamps each row with a ``processing_time`` column.
    What each helper does internally is defined in that module, not here.

    Args:
        df: Spark DataFrame of grade records to clean.
        spark: SparkSession, forwarded to ``AcademicDataCleaner.map_program_ids``.
        program_ids_path: Location of the program-id CSV handed to
            ``AcademicDataCleaner.map_program_ids``. Defaults to the path that
            was previously hard-coded, so existing callers are unaffected.

    Returns:
        The DataFrame produced by the last cleaning step, with an added
        ``processing_time`` column set to ``current_timestamp()``.
    """
    grade_cleaner = GradeDataCleaner()
    df = BaseDataCleaner.standardize_case(df, ['grade_final', 'campus', 'semester', 'schoolyear'])
    df = AcademicDataCleaner.clean_semesters(df)
    df = BaseDataCleaner.remove_null_strings(df, 'semester')

    df = BaseDataCleaner.clean_strings(df, [
        'schoolyear', 'semester', 'code', 'description', 'units', 'instructor_id',
        'instructor_name', 'srcode', 'fullname', 'campus', 'program',
        'grade_final', 'grade_reexam', 'status', 'major', 'curriculum', 'class_section'
    ])

    df = AcademicDataCleaner.clean_schoolyear(df)
    df = grade_cleaner.process_grades(df)
    df = BaseDataCleaner.remove_null_strings(df, 'program')

    df = grade_cleaner.allow_numerical_data(df, "grade_reexam")

    # NOTE(review): the notebook's schema declares `id` as LongType (bigint),
    # but it is cast to "int" here — confirm ids always fit in 32 bits.
    # `grade_numeric` is not in the input schema; presumably it is added by
    # process_grades — TODO confirm.
    df = AcademicDataCleaner.cast_columns(df, [("id", "int"), ("units", "int"),
                                               ("grade_numeric", "decimal(5,2)")])

    df = grade_cleaner.filter_incomplete_grades(df)

    df = AcademicDataCleaner.get_valid_schoolyears(df)

    df = AcademicDataCleaner.create_yearsem_order(df)
    df = AcademicDataCleaner.map_program_ids(df, spark, program_ids_path)

    # Add processing_time column (used as the Hudi precombine field by the
    # write options defined in this notebook).
    df = df.withColumn("processing_time", current_timestamp())

    return df

# Hudi write options: upsert into the 'grades' table, keyed by `id`,
# partitioned by `schoolyear`, with `processing_time` as the precombine field.
_HUDI_TABLE_NAME = "grades"

hudi_options = dict([
    ("hoodie.table.name", _HUDI_TABLE_NAME),
    ("hoodie.datasource.write.recordkey.field", "id"),
    ("hoodie.datasource.write.partitionpath.field", "schoolyear"),
    ("hoodie.datasource.write.table.name", _HUDI_TABLE_NAME),
    ("hoodie.datasource.write.operation", "upsert"),
    ("hoodie.datasource.write.precombine.field", "processing_time"),
])

# Kafka consumer setup
topicName = 'source.public.grades_streaming'
# Auto-commit is turned off so that an offset is committed only after its
# record has been written to Hudi. If a write fails, the record is delivered
# again on the next run instead of being silently skipped; the Hudi write is
# an upsert on the record key, so a redelivered record overwrites itself.
consumer = KafkaConsumer(topicName, auto_offset_reset='earliest',
                         bootstrap_servers=bootstrap_servers, group_id='grades-group',
                         enable_auto_commit=False)

# Process each message and write to Hudi
try:
    for msg in consumer:
        # Records with a null value (e.g. tombstones) carry no payload to
        # decode; json.loads(None) would raise, so skip them.
        if msg.value is None:
            print(f"Skipping message with no value at offset {msg.offset}")
            continue

        # Print the message for monitoring
        message_data = json.loads(msg.value)
        print("Received message:", message_data)

        # Convert message to DataFrame
        message_df = spark.createDataFrame([message_data], schema=schema)

        # Apply transformations
        transformed_df = transform(message_df, spark)

        # Write to Hudi
        (transformed_df.write
            .format("hudi")
            .options(**hudi_options)
            .mode("append")
            .save("C:/tmp/spark_warehouse/from_kafka"))

        # The write succeeded: commit the consumed position for this group.
        consumer.commit()

        print(f"Processed and wrote message to Hudi: {message_data['id']}")
finally:
    # Always release the broker connections and leave the consumer group,
    # including when the cell is interrupted or a write raises.
    consumer.close()

Received message: {'id': 122486, 'schoolyear': '2023-2024', 'semester': 'FIRST', 'code': 'NSTP 111RO', 'description': 'NSTP - Reserve Officers Training Corps 1', 'units': 3, 'instructor_id': '-13239', 'instructor_name': 'LN-13239, FN-13239', 'srcode': '23255', 'fullname': 'LN23255, FN23255', 'campus': 'ALANGILAN', 'program': 'BS Mechatronics Engineering', 'major': None, 'yearlevel': 'FIRST', 'curriculum': '2023-2024', 'class_section': 'NSTP111RO-AL-04', 'grade_final': '2.00', 'grade_reexam': '-', 'status': 'PASSED'}
Processed and wrote message to Hudi: 122486
Received message: {'id': 122487, 'schoolyear': '2023-2024', 'semester': 'FIRST', 'code': 'PATHFit 1', 'description': 'Movement Competency Training', 'units': 2, 'instructor_id': '-13659', 'instructor_name': 'LN-13659, FN-13659', 'srcode': '23255', 'fullname': 'LN23255, FN23255', 'campus': 'ALANGILAN', 'program': 'BS Mechatronics Engineering', 'major': None, 'yearlevel': 'FIRST', 'curriculum': '2023-2024', 'class_section': 'MEXE-11

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\denve\AppData\Local\Programs\Python\Python310\lib\site-packages\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
RuntimeError: reentrant call inside <_io.BufferedReader name=1788>

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\denve\AppData\Local\Programs\Python\Python310\lib\site-packages\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "c:\Users\denve\AppData\Local\Programs\Python\Python310\lib\site-packages\py4j\clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "c:\Users\denve\AppData\Local\Programs\Python\Python310\lib\site-packages\py

Py4JError: An error occurred while calling o859.save

In [None]:
import json
# Kafka consumer setup (read-only peek at the topic)
topicName = 'source.public.grades_streaming'
# group_id=None keeps this inspection cell out of 'grades-group': no offsets
# are committed, so peeking here cannot advance the position of the cell that
# writes to Hudi. consumer_timeout_ms ends the iteration after 10 s without
# new messages so the cell terminates instead of blocking forever.
consumer = KafkaConsumer(topicName, auto_offset_reset='earliest',
                         bootstrap_servers=bootstrap_servers, group_id=None,
                         consumer_timeout_ms=10000)

# Print each message for monitoring (nothing is written to Hudi here)
try:
    for msg in consumer:
        # Records with a null value (e.g. tombstones) have nothing to decode.
        if msg.value is None:
            continue
        message_data = json.loads(msg.value)
        print("Received message:", message_data)
finally:
    # Release the broker connections even if the cell is interrupted.
    consumer.close()

Received message: {'id': 122486, 'schoolyear': '2023-2024', 'semester': 'FIRST', 'code': 'NSTP 111RO', 'description': 'NSTP - Reserve Officers Training Corps 1', 'units': 3, 'instructor_id': '-13239', 'instructor_name': 'LN-13239, FN-13239', 'srcode': '23255', 'fullname': 'LN23255, FN23255', 'campus': 'ALANGILAN', 'program': 'BS Mechatronics Engineering', 'major': None, 'yearlevel': 'FIRST', 'curriculum': '2023-2024', 'class_section': 'NSTP111RO-AL-04', 'grade_final': '2.00', 'grade_reexam': '-', 'status': 'PASSED'}
Received message: {'id': 122487, 'schoolyear': '2023-2024', 'semester': 'FIRST', 'code': 'PATHFit 1', 'description': 'Movement Competency Training', 'units': 2, 'instructor_id': '-13659', 'instructor_name': 'LN-13659, FN-13659', 'srcode': '23255', 'fullname': 'LN23255, FN23255', 'campus': 'ALANGILAN', 'program': 'BS Mechatronics Engineering', 'major': None, 'yearlevel': 'FIRST', 'curriculum': '2023-2024', 'class_section': 'MEXE-1101', 'grade_final': '1.50', 'grade_reexam': 