In [17]:
from pyspark.sql import SparkSession

import os

# Spark SQL warehouse directory; the Hudi tables in this notebook are saved beneath it.
warehouse_path = "file:///C:/tmp/spark_warehouse"

# JDBC endpoint of the source PostgreSQL database.
jdbc_url = "jdbc:postgresql://192.168.20.11:5432/caist_db_v2"
#jdbc_url = "jdbc:postgresql://localhost:5432/local_student_grades"

# JDBC connection properties handed to spark.read.jdbc().
# Credentials can be overridden through the environment so they do not have to
# live in the notebook; the fallbacks keep the previous hardcoded values.
properties = {
    "user": os.environ.get("CAIST_DB_USER", "postgres"),
    "password": os.environ.get("CAIST_DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000"
}

# Local path to the PostgreSQL JDBC driver jar; it must already exist at this location.
# NOTE(review): the earlier comment said "download from Oracle" -- confirm the
# intended download source; the jar name indicates the PostgreSQL JDBC driver.
# Raw string: "\p" is not a valid escape sequence in a normal string literal.
postgres_driver_path = r"C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """Read one PostgreSQL table into a Spark DataFrame.

    Args:
        jdbc_url: JDBC connection URL of the database.
        table_name: Name of the table to read.
        properties: JDBC connection properties passed to ``spark.read.jdbc``.
        postgres_driver_path: Path to the JDBC driver jar, set as ``spark.jars``.

    Returns:
        A ``(df, spark)`` tuple: the DataFrame read over JDBC and the
        SparkSession obtained from ``getOrCreate()``.

    Note:
        The warehouse directory is taken from the module-level
        ``warehouse_path`` variable, not from an argument.
    """
    # Session settings: JDBC driver jar, Hudi bundle and extension, memory sizing,
    # and the warehouse directory.
    session_settings = {
        "spark.jars": postgres_driver_path,
        "spark.jars.packages": "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.hive.convertMetastoreParquet": "false",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
        "spark.driver.memory": "4g",
        "spark.executor.memory": "4g",
        "spark.executor.memoryOverhead": "1g",
        "spark.driver.memoryOverhead": "1g",
        "spark.sql.warehouse.dir": warehouse_path,
    }

    builder = SparkSession.builder.master("local[*]").appName("Hudi Batch Write")
    for setting_name, setting_value in session_settings.items():
        builder = builder.config(setting_name, setting_value)
    spark = builder.getOrCreate()

    # Pull the requested table over JDBC
    df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=properties)

    return df, spark

# Run the extraction a single time; the DataFrame and the session are both kept
raw_df, spark = extract(
    jdbc_url=jdbc_url,
    table_name="processed_grades",
    properties=properties,
    postgres_driver_path=postgres_driver_path,
)

# Report how many rows were read
row_count = raw_df.count()
print(f'Number of rows: {row_count}')

# Display column names and types
raw_df.printSchema()

Number of rows: 1998648
root
 |-- id: integer (nullable = true)
 |-- schoolyear: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: integer (nullable = true)
 |-- instructor_id: string (nullable = true)
 |-- instructor_name: string (nullable = true)
 |-- srcode: string (nullable = true)
 |-- fullname: string (nullable = true)
 |-- campus: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- yearlevel: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- class_section: string (nullable = true)
 |-- grade_final: string (nullable = true)
 |-- grade_reexam: string (nullable = true)
 |-- status: string (nullable = true)
 |-- grade_numeric: decimal(10,2) (nullable = true)
 |-- grade_classification: string (nullable = true)
 |-- start_year: integer (nullable = true)
 |-- year_sem: string (nullable = true)
 |-- program_

In [2]:
# Preview the extracted rows in tabular form as a quick sanity check.
raw_df.show()

+------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+------+--------------------+-------+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
|    id|schoolyear|semester|      code|         description|units|instructor_id|   instructor_name|srcode|        fullname|campus|             program|  major|yearlevel|curriculum|  class_section|grade_final|grade_reexam|status|grade_numeric|grade_classification|start_year|        year_sem|program_id|
+------+----------+--------+----------+--------------------+-----+-------------+------------------+------+----------------+------+--------------------+-------+---------+----------+---------------+-----------+------------+------+-------------+--------------------+----------+----------------+----------+
|801712| 2023-2024|  SECOND|  Fili 102|Filipino sa Iba't...|    3|       -11078|LN-11078, F

In [9]:
import json
from pyspark.sql.functions import explode, col

# First, let's read and process the course-career mapping
def get_course_career_mapping(json_str):
    """Parse the course-career JSON text into a ``{course: careers}`` dict.

    Args:
        json_str: JSON document with a top-level ``"courses"`` list whose
            entries each carry a ``"course"`` and a ``"careers"`` key.

    Returns:
        dict mapping each entry's ``"course"`` value to its ``"careers"``
        value. If a course appears more than once, the last entry wins.

    Raises:
        KeyError: if ``"courses"``, ``"course"`` or ``"careers"`` is missing.
    """
    parsed = json.loads(json_str)
    return {entry['course']: entry['careers'] for entry in parsed['courses']}

# Create function to transform the dataframe
def map_courses_to_careers(df, course_career_mapping):
    """Attach career labels to grade rows by matching on course description.

    Args:
        df: Spark DataFrame with at least the columns ``srcode``,
            ``description`` and ``grade_numeric``.
        course_career_mapping: dict of ``{course description: [career, ...]}``.

    Returns:
        DataFrame with columns ``srcode``, ``description``, ``grade_numeric``
        and ``career``; one row per (grade row, career) pair. Rows whose
        description is not in the mapping are dropped (inner join).
    """
    # Flatten the mapping into (course, career) pairs
    mapping_pairs = [
        (course, career)
        for course, careers in course_career_mapping.items()
        for career in careers
    ]

    # Use the session that owns `df` instead of a notebook-global `spark`, so the
    # function works regardless of which cells have been run.
    session = df.sparkSession

    # An explicit schema keeps this working when the mapping is empty; schema
    # inference cannot handle an empty list.
    mapping_df = session.createDataFrame(
        mapping_pairs, "description string, career string"
    )

    # Join with the grade rows and keep only the required columns
    result_df = df.join(mapping_df, "description", "inner") \
                  .select("srcode", "description", "grade_numeric", "career")

    return result_df

# Load the course-career JSON text for the BS Information Technology program
with open("C:/LEONAIDAS/scripts/career-mapping/BS Information Technology.json", "r") as f:
    course_career_json = f.read()

# Parse it into a {course: careers} dict
mapping = get_course_career_mapping(course_career_json)

# Attach careers to the grade rows, sorted by student code and course description
result_df = map_courses_to_careers(raw_df, mapping).orderBy("srcode", "description")

# Show the first 50 rows without truncating cell contents
result_df.show(50, truncate=False)


+------+------------------------------+-------------+-----------------------------------------------+
|srcode|description                   |grade_numeric|career                                         |
+------+------------------------------+-------------+-----------------------------------------------+
|20006 |Business Communication        |1.50         |BPO Specialist/Analyst                         |
|20006 |Business Communication        |1.50         |Technical Writer                               |
|20006 |Business Communication        |1.50         |BPO Team Leader/Manager                        |
|20006 |Business Communication        |1.50         |IT Consultant                                  |
|20006 |Service Culture               |2.75         |BPO Specialist/Analyst                         |
|20006 |Service Culture               |2.75         |BPO Team Leader/Manager                        |
|20006 |Service Culture               |2.75         |Help Desk/Technical Support S

In [14]:
# Import required functions
from pyspark.sql.functions import current_timestamp, monotonically_increasing_id

# Add a timestamp column for Hudi
#persem_df = persem_df.withColumn("ts", current_timestamp())

# Generate a unique identifier
# Despite the column name, the values come from monotonically_increasing_id()
# and are integers, not UUID strings. They are not necessarily consecutive:
# the Hudi read-back output later in this notebook shows keys such as 25769803776.
# This column is used as the Hudi record key when the table is written.
result_df = result_df.withColumn("uuid", monotonically_increasing_id())
result_df.show(50, truncate=False)


+------+------------------------------+-------------+-----------------------------------------------+----+
|srcode|description                   |grade_numeric|career                                         |uuid|
+------+------------------------------+-------------+-----------------------------------------------+----+
|20006 |Business Communication        |1.50         |Technical Writer                               |0   |
|20006 |Business Communication        |1.50         |IT Consultant                                  |1   |
|20006 |Business Communication        |1.50         |BPO Specialist/Analyst                         |2   |
|20006 |Business Communication        |1.50         |BPO Team Leader/Manager                        |3   |
|20006 |Service Culture               |2.75         |Help Desk/Technical Support Specialist         |4   |
|20006 |Service Culture               |2.75         |BPO Specialist/Analyst                         |5   |
|20006 |Service Culture              

In [11]:
# Target location of the Hudi table under the warehouse directory
db_name = "recommender_db"
table_name = "BSIT_recommender"
path = f"{warehouse_path}/{db_name}/{table_name}"

# Hudi write options: non-partitioned table keyed on the "uuid" column,
# written with the bulk_insert operation
hudi_options = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.recordkey.field": "uuid",
    "hoodie.datasource.write.operation": "bulk_insert",
    "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator",
}

# Save mode "overwrite" is used for the write to `path`
(
    result_df.write.format("hudi")
    .options(**hudi_options)
    .mode("overwrite")
    .save(path)
)

In [12]:
# Load the BSIT recommender Hudi table from the warehouse directory
read_df = (
    spark.read
    .format("hudi")
    .load(f"{warehouse_path}/recommender_db/BSIT_recommender")
)

# Preview the stored rows (includes the _hoodie_* metadata columns)
read_df.show()

# Total number of records in the table
print(f"Total records: {read_df.count()}")

# Column names and types of the stored table
read_df.printSchema()

+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------------+-------------+--------------------+-----------+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name|srcode|         description|grade_numeric|              career|       uuid|
+-------------------+--------------------+------------------+----------------------+--------------------+------+--------------------+-------------+--------------------+-----------+
|  20250319112634102|20250319112634102...|       25769803776|                      |30e3814c-0374-463...| 63543|Computer Programm...|         1.25|      Data Scientist|25769803776|
|  20250319112634102|20250319112634102...|       25769803777|                      |30e3814c-0374-463...| 63543|Computer Programm...|         1.25|Mobile App Developer|25769803777|
|  20250319112634102|20250319112634102...|       25769803778|                      |30e3814c-03

In [1]:
import os
from pyspark.sql.functions import current_timestamp, monotonically_increasing_id, lit

def process_program_data(program_json_path, raw_df, warehouse_path):
    """Build the course-to-career table for one program and save it to Hudi.

    Args:
        program_json_path: Path to the program's course-career JSON file. The
            file name without ``.json`` becomes the program name.
        raw_df: Spark DataFrame of grade rows passed to
            ``map_courses_to_careers``.
        warehouse_path: Root location; the table is saved to
            ``{warehouse_path}/recommender_db/{program_name}_recommender``.

    Returns:
        The DataFrame that was written: the output of
        ``map_courses_to_careers`` plus a ``uuid`` column.
    """
    # Read the JSON file
    with open(program_json_path, "r") as f:
        course_career_json = f.read()

    # Get program name from file name
    # NOTE(review): the name is taken verbatim from the file name, so it may
    # contain spaces (e.g. "BS Information Technology") -- confirm that Hudi
    # accepts such table names.
    program_name = os.path.basename(program_json_path).replace('.json', '')

    # Parse the JSON into the {course: careers} mapping. The result of the call
    # must be kept: passing the function object itself on to
    # map_courses_to_careers would fail there.
    mapping = get_course_career_mapping(course_career_json)

    # Create the recommendations dataframe
    program_df = map_courses_to_careers(raw_df, mapping)

    # Add the integer record key used by Hudi
    program_df = program_df.withColumn("uuid", monotonically_increasing_id())

    # Save to Hudi
    db_name = "recommender_db"
    table_name = f"{program_name}_recommender"
    path = f"{warehouse_path}/{db_name}/{table_name}"

    hudi_options = {
        "hoodie.table.name": table_name,
        "hoodie.datasource.write.recordkey.field": "uuid",
        'hoodie.datasource.write.operation': 'bulk_insert',
        "hoodie.datasource.write.keygenerator.class": "org.apache.hudi.keygen.NonpartitionedKeyGenerator"
    }

    # Save mode "overwrite" is used for the write to `path`
    program_df.write.format("hudi") \
        .options(**hudi_options) \
        .mode("overwrite") \
        .save(path)

    return program_df

# Folder holding one course-career JSON file per program
json_directory = "C:/LEONAIDAS/scripts/career-mapping"

# Build and save a recommender table for every JSON file in the folder.
# This cell needs `raw_df` and `warehouse_path` from the extraction cell;
# on a fresh kernel, run that cell first or this loop raises NameError.
for json_file in os.listdir(json_directory):
    if not json_file.endswith('.json'):
        continue
    program_json_path = os.path.join(json_directory, json_file)
    program_df = process_program_data(program_json_path, raw_df, warehouse_path)
    print(f"Processed {json_file}")
    program_df.show(5)

NameError: name 'raw_df' is not defined