In [2]:
import os

from pyspark.sql import SparkSession

# --- Configuration -----------------------------------------------------------
# Spark SQL warehouse directory (local filesystem URI).
warehouse_path = "file:///C:/tmp/spark_warehouse"

# JDBC endpoint of the source database. Set DEMO_DB_JDBC_URL to point the
# notebook at another instance, e.g.
# "jdbc:postgresql://localhost:5432/local_student_grades".
jdbc_url = os.environ.get(
    "DEMO_DB_JDBC_URL", "jdbc:postgresql://192.168.20.11:5432/demo_db"
)

# JDBC connection properties handed to spark.read.jdbc().
# Credentials are taken from the environment when set, so real secrets never
# have to be saved with the notebook; the fallbacks are the original demo
# values, which keeps the default behaviour unchanged.
properties = {
    "user": os.environ.get("DEMO_DB_USER", "postgres"),
    "password": os.environ.get("DEMO_DB_PASSWORD", "postgres"),
    "driver": "org.postgresql.Driver",
    "fetchsize": "10000",
}

# PostgreSQL JDBC driver jar (published by the pgJDBC project at
# https://jdbc.postgresql.org/download/).
# Raw string on purpose: "\p" is not a valid escape sequence, so the plain
# literal only worked by accident and triggers a warning on newer Pythons.
postgres_driver_path = r"C:\postgresql-42.7.5.jar"

def extract(jdbc_url, table_name, properties, postgres_driver_path):
    """Read one PostgreSQL table into a Spark DataFrame over JDBC.

    A local SparkSession is obtained through ``getOrCreate`` with the
    PostgreSQL JDBC jar and the Hudi Spark bundle configured, and is then
    used to load ``table_name``.

    Args:
        jdbc_url: JDBC connection URL of the source database.
        table_name: Name of the table to read.
        properties: JDBC connection properties passed to ``spark.read.jdbc``.
        postgres_driver_path: Path to the PostgreSQL JDBC driver jar; it is
            set as ``spark.jars``.

    Returns:
        A ``(df, spark)`` tuple: the loaded DataFrame and the session used
        to load it.

    Note:
        The warehouse directory comes from the module-level
        ``warehouse_path`` variable, not from an argument.
    """
    # Session settings, applied in this order.
    session_options = {
        "spark.jars": postgres_driver_path,
        "spark.jars.packages": "org.apache.hudi:hudi-spark3.3-bundle_2.12:0.14.0",
        "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
        "spark.sql.hive.convertMetastoreParquet": "false",
        "spark.sql.extensions": "org.apache.spark.sql.hudi.HoodieSparkSessionExtension",
        "spark.driver.memory": "4g",
        "spark.executor.memory": "4g",
        "spark.executor.memoryOverhead": "1g",
        "spark.driver.memoryOverhead": "1g",
        "spark.sql.warehouse.dir": warehouse_path,
    }

    builder = SparkSession.builder.master("local[*]").appName("Hudi Batch Write")
    for option_name, option_value in session_options.items():
        builder = builder.config(option_name, option_value)
    session = builder.getOrCreate()

    # Load the requested table through the JDBC data source.
    table_df = session.read.jdbc(
        url=jdbc_url,
        table=table_name,
        properties=properties,
    )
    return table_df, session

# Load the source table a single time; the cells below reuse `raw_df` and `spark`.
raw_df, spark = extract(
    jdbc_url=jdbc_url,
    table_name="curriculum_courses",
    properties=properties,
    postgres_driver_path=postgres_driver_path,
)

# Report how many rows were read (count() is a Spark action, so this runs the read).
row_count = raw_df.count()
print(f'Number of rows: {row_count}')

# Display the column names and types Spark derived for the table.
raw_df.printSchema()

Number of rows: 14500
root
 |-- id: long (nullable = true)
 |-- code: string (nullable = true)
 |-- description: string (nullable = true)
 |-- units: string (nullable = true)
 |-- type: string (nullable = true)
 |-- subtype: string (nullable = true)
 |-- program: string (nullable = true)
 |-- major: string (nullable = true)
 |-- college: string (nullable = true)
 |-- curriculum: string (nullable = true)
 |-- year_level: string (nullable = true)
 |-- semester: string (nullable = true)
 |-- lecture_hours: string (nullable = true)
 |-- lab_hours: string (nullable = true)
 |-- course_id: string (nullable = true)
 |-- curriculum_id: string (nullable = true)



In [3]:
# Preview the first 50 rows of the raw extract (long values are truncated by show()).
raw_df.show(n=50)

+---+----------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
| id|      code|         description|units|type|subtype|             program|major|             college|curriculum|year_level|semester|lecture_hours|lab_hours| course_id|curriculum_id|
+---+----------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
|  1|  Fili 101|Kontekstwalisadon...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|     FIRST|   FIRST|            3|        0|UJS9IN0HYJ|   2Z8PM78HQN|
|  2|   GEd 101|Understanding the...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|     FIRST|   FIRST|            3|        0|QVSD2N8TU3|   2Z8PM78HQN|
|  3|   GEd 105|Readings in Phili...|    3| NON|   null|Bachelor of Arts ..

In [4]:
# Register the raw extract under the view name used by the SQL queries.
raw_df.createOrReplaceTempView('curriculum_courses')

# Data-quality probe: select rows whose `units` column holds a course title
# instead of a unit count. In the recorded output these rows have a null
# `description` and every later column is displaced one position to the right.
show_data = spark.sql("""SELECT * FROM curriculum_courses
                      WHERE units = 'Total Quality Management' or units = 'Inorganic and Organic Chemistry' """)
# Display up to 22 of the matching rows.
show_data.show(22)

+-----+-------+-----------+--------------------+----+-------+-------+--------------------+-------------------+--------------------+----------+--------+-------------+---------+---------+----------------+
|   id|   code|description|               units|type|subtype|program|               major|            college|          curriculum|year_level|semester|lecture_hours|lab_hours|course_id|   curriculum_id|
+-----+-------+-----------+--------------------+----+-------+-------+--------------------+-------------------+--------------------+----------+--------+-------------+---------+---------+----------------+
| 1375| PM 109|       null|Total Quality Man...|   3|    NON|   null|Bachelor of Indus...|Drafting Technology|College of Engine...| 2018-2019|   THIRD|       SECOND|        3|        0|      GTMF89Q7W9|
| 1434| PM 109|       null|Total Quality Man...|   3|    NON|   null|Bachelor of Indus...|Drafting Technology|College of Engine...| 2023-2024|   THIRD|       SECOND|        3|        0|YBG

In [5]:
# Register the raw extract so it can be queried with Spark SQL.
raw_df.createOrReplaceTempView('curriculum_courses')

# Course titles that were stored one column too far to the right: on those
# rows the title sits in `units` and every later column is displaced as well.
# Single source of truth for both the repair and the verification filter.
# (Titles are interpolated into SQL as quoted literals, so they must not
# contain a single quote.)
MISPLACED_TITLES = ('Total Quality Management', 'Inorganic and Organic Chemistry')

# Columns taking part in the repair, in table order. `id` and `code` come
# before the misalignment starts and are passed through unchanged.
SHIFTABLE_COLUMNS = [
    'description', 'units', 'type', 'subtype', 'program', 'major', 'college',
    'curriculum', 'year_level', 'semester', 'lecture_hours', 'lab_hours',
    'course_id', 'curriculum_id',
]

titles_sql = ", ".join(f"'{title}'" for title in MISPLACED_TITLES)
row_is_shifted = f"units IN ({titles_sql})"

# On a misaligned row each column takes the value of its right-hand
# neighbour; the last column has no neighbour and therefore becomes NULL.
shifted_sources = SHIFTABLE_COLUMNS[1:] + ['NULL']

shift_projection = ",\n        ".join(
    f"CASE WHEN {row_is_shifted} THEN {source} ELSE {column} END AS {column}"
    for column, source in zip(SHIFTABLE_COLUMNS, shifted_sources)
)

# Transform the data by shifting columns to the left for the affected rows;
# all other rows keep their original values.
transformed_df = spark.sql(f"""
    SELECT
        id,
        code,
        {shift_projection}
    FROM curriculum_courses
""")

# Show the transformed rows carrying one of the affected titles. This includes
# the repaired rows as well as rows that already had the title in `description`.
print("Rows with shifted data:")
transformed_df.filter(f"description IN ({titles_sql})").show(500)

Rows with shifted data:
+-----+-------+--------------------+-----+----+-------+--------------------+--------------------+--------------------+----------+----------+--------+-------------+---------+----------------+-------------+
|   id|   code|         description|units|type|subtype|             program|               major|             college|curriculum|year_level|semester|lecture_hours|lab_hours|       course_id|curriculum_id|
+-----+-------+--------------------+-----+----+-------+--------------------+--------------------+--------------------+----------+----------+--------+-------------+---------+----------------+-------------+
| 1018| PM 109|Total Quality Man...|    3| NON|   null|Bachelor of Indus...|Automotive Techno...|College of Engine...| 2018-2019|     THIRD|  SECOND|            3|        0|      M7SDMJ8DYR|   YJSPEF45MR|
| 1082| PM 109|Total Quality Man...|    3| NON|   null|Bachelor of Indus...|Automotive Techno...|College of Engine...| 2023-2024|     THIRD|  SECOND|       

In [6]:
# Spot-check a single title: list the transformed rows whose description is
# 'Inorganic and Organic Chemistry'.
print("Rows with shifted data:")
transformed_df.filter("description ='Inorganic and Organic Chemistry'").show(n=500)

Rows with shifted data:
+-----+-------+--------------------+-----+----+-------+----------------+-----------------+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
|   id|   code|         description|units|type|subtype|         program|            major|             college|curriculum|year_level|semester|lecture_hours|lab_hours| course_id|curriculum_id|
+-----+-------+--------------------+-----+----+-------+----------------+-----------------+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
|11892|CPH 102|Inorganic and Org...|    3| NON|   null|BS Public Health|Disaster Response|College of Nursin...| 2021-2022|     FIRST|   FIRST|            3|        0|ODEV0LEVSD|         null|
|11967|CPH 102|Inorganic and Org...|    3| NON|   null|BS Public Health|Disaster Response|College of Nursin...| 2023-2024|     FIRST|   FIRST|            3|        0|LIB8TMR0HA|         null|
+-----+-------+-

In [7]:
# Re-point the 'curriculum_courses' view at the transformed frame; from here on
# SQL against that name no longer sees the raw extract.
transformed_df.createOrReplaceTempView('curriculum_courses')

# List the rows whose `units` value is 'NON' or 'LAB' after the left-shift.
# NOTE(review): in the recorded output these rows have a null `code` and a
# number in `description`, which looks like a different misalignment - confirm.
print("Rows with shifted data:")
transformed_df.filter("units in ('NON', 'LAB')").show(n=500)

Rows with shifted data:
+-----+------------+-----------+-----+----+--------------------+--------------+--------------------+---------+----------+----------+--------+-------------+----------------+----------+-------------+
|   id|        code|description|units|type|             subtype|       program|               major|  college|curriculum|year_level|semester|lecture_hours|       lab_hours| course_id|curriculum_id|
+-----+------------+-----------+-----+----+--------------------+--------------+--------------------+---------+----------+----------+--------+-------------+----------------+----------+-------------+
|  229|        null|          3|  NON|null|Bachelor of Autom...|          null|College of Engine...|2024-2025|    SECOND|    SECOND|       3|            0|      ZSHI705IN0|K9YZWXUVOT|         null|
| 3855|        null|          2|  NON|null|Bachelor of Secon...|Social Studies|College of Teache...|2023-2024|    SECOND|     FIRST|       2|            0|56RSTIVCD2-GASAV|56RSTIVCD2| 

In [8]:
# Create a temporary view of the transformed data
# (replaces whatever frame the name 'curriculum_courses' pointed at before).
transformed_df.createOrReplaceTempView('curriculum_courses')

# Filter out records where units is 'NON' or 'LAB'.
# NOTE(review): under SQL three-valued logic `units NOT IN (...)` evaluates to
# NULL when `units` is NULL, so rows with a NULL `units` are removed as well -
# confirm that this is intended.
filtered_df = spark.sql("""
    SELECT *
    FROM curriculum_courses
    WHERE units NOT IN ('NON', 'LAB')
""")

# Show the filtered data (up to 500 rows)
print("Records after removing units='NON' or 'LAB':")
filtered_df.show(500)

Records after removing units='NON' or 'LAB':
+----+----------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------------+-------------+
|  id|      code|         description|units|type|subtype|             program|major|             college|curriculum|year_level|semester|lecture_hours|lab_hours|       course_id|curriculum_id|
+----+----------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------------+-------------+
|   1|  Fili 101|Kontekstwalisadon...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|     FIRST|   FIRST|            3|        0|      UJS9IN0HYJ|   2Z8PM78HQN|
|   2|   GEd 101|Understanding the...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|     FIRST|   FIRST|            3|        0|      QVSD2N8TU3|   2Z8PM78

In [9]:
# Re-point the 'curriculum_courses' view at the filtered frame.
filtered_df.createOrReplaceTempView('curriculum_courses')

# Sanity check: after the previous filter no row should have `units` equal to
# 'NON' or 'LAB', so an empty table is the expected result here.
print("Rows with shifted data:")
filtered_df.filter("units in ('NON', 'LAB')").show(n=500)

Rows with shifted data:
+---+----+-----------+-----+----+-------+-------+-----+-------+----------+----------+--------+-------------+---------+---------+-------------+
| id|code|description|units|type|subtype|program|major|college|curriculum|year_level|semester|lecture_hours|lab_hours|course_id|curriculum_id|
+---+----+-----------+-----+----+-------+-------+-----+-------+----------+----------+--------+-------------+---------+---------+-------------+
+---+----+-----------+-----+----+-------+-------+-----+-------+----------+----------+--------+-------------+---------+---------+-------------+



In [10]:
# Make the filtered frame queryable under the view name used in the SQL below.
filtered_df.createOrReplaceTempView('curriculum_courses')

# Build the cleaned course list by excluding rows on four columns:
#   * code        - listed prefixes/codes are dropped; codes containing 'MATH'
#                   are kept only for program 'Bachelor of Secondary Education'
#                   with major 'Mathematics'
#   * description - listed title patterns are dropped; a NULL description is kept
#   * type        - values containing 'OJT' are dropped
#   * college     - listed colleges/programs are dropped
# NOTE(review): only `description` has an explicit IS NULL guard. `NOT LIKE`
# and `!=` evaluate to NULL for a NULL operand, so rows with a NULL `code`,
# `type` or `college` are removed by this query - confirm that this is intended.
# NOTE(review): '%Internship%' already covers '%Teaching Internship%' and
# '%Comprehensive Exam%' already covers '%Comprehensive Examination%'; the
# narrower patterns are redundant but harmless.
clean_data = spark.sql("""
    SELECT * FROM curriculum_courses
    WHERE 
    -- Code column exclusions
    code NOT LIKE '%GEd%'
    AND code NOT LIKE '%OJT%'
    AND code != 'ES 101'
    AND code NOT LIKE '%Litr%'
    AND code NOT LIKE '%Fili%'
    AND code NOT LIKE '%NSTP%'
    AND code NOT LIKE '%PATHFIT%'
    AND code != 'PE 101'
    AND code != 'PE 102'
    AND code != 'PE 103'
    AND code != 'PE 104'
    AND (
        code NOT LIKE '%MATH%' 
        OR (
            program = 'Bachelor of Secondary Education' 
            AND major = 'Mathematics'
        )
    )
    
    -- Description column exclusions
    AND (description IS NULL OR (
        description NOT LIKE '%Supervised Industrial Training%'
        AND description NOT LIKE '%Field Study%'
        AND description NOT LIKE '%Comprehensive Examination%'
        AND description NOT LIKE '%Teaching Internship%'
        AND description NOT LIKE '%Thesis%'
        AND description NOT LIKE '%Capstone%'
        AND description NOT LIKE '%Internship%'
        AND description NOT LIKE '%Comprehensive Exam%'
        AND description NOT LIKE '%Seminar%'
        AND description NOT LIKE '%Review%'
        AND description NOT LIKE '%Colloquium%'
    ))
    
    -- Type column exclusions
    AND type NOT LIKE '%OJT%'
    
    -- College column exclusions
    AND college NOT LIKE '%Integrated School%'
    AND college NOT LIKE '%Graduate School%'
    AND college NOT LIKE '%Expanded Tertiary Education Equivalency and Accreditation Program%'
    AND college NOT LIKE '%College of Law%'
    AND college NOT LIKE '%College of Medicine%'
""")

# Show a sample of the cleaned data
clean_data.show(5)

# Optional: Count the number of rows before and after cleaning.
# "Original" here means `filtered_df` (the input of this cell), not the raw extract.
print("Original row count:", filtered_df.count())
print("Cleaned row count:", clean_data.count())

+---+--------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
| id|    code|         description|units|type|subtype|             program|major|             college|curriculum|year_level|semester|lecture_hours|lab_hours| course_id|curriculum_id|
+---+--------+--------------------+-----+----+-------+--------------------+-----+--------------------+----------+----------+--------+-------------+---------+----------+-------------+
| 12|COMM 101|Introduction to C...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|     FIRST|  SECOND|            3|        0|JOX23KTE78|   2Z8PM78HQN|
| 24|COMM 120|Communication Pla...|    3| NON|   null|Bachelor of Arts ...| null|College of Arts a...| 2018-2019|    FOURTH|   FIRST|            3|        0|UBODIJ8PEN|   2Z8PM78HQN|
| 25|COMM 121|  Digital Publishing|    3| NON|   null|Bachelor of Arts ...| null|Coll

In [None]:
# Export the cleaned data as CSV with a header row.
# coalesce(1) reduces the frame to one partition so only one part file is
# produced; Spark still creates a *directory* at the given path (named
# "...csv") holding that part file, not a single plain file.
# mode("overwrite") clears an existing output directory before writing.
# NOTE(review): the error recorded below this cell is raised while clearing
# that directory, and its path is spelled 'reccomender...', so it comes from
# an earlier version of this cell. Possibly a file in the old output was
# locked or the local Hadoop setup could not delete it - verify on re-run.
clean_data.coalesce(1) \
    .write \
    .option("header", "true") \
    .mode("overwrite") \
    .csv("C:/tmp/spark_warehouse/recommender_cleaned_curriculum_courses.csv")

Py4JJavaError: An error occurred while calling o63.csv.
: java.io.IOException: Unable to clear output directory file:/C:/tmp/spark_warehouse/reccomender_cleaned_curriculum_courses.csv prior to writing to it
	at org.apache.spark.sql.errors.QueryExecutionErrors$.cannotClearOutputDirectoryError(QueryExecutionErrors.scala:678)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.deleteMatchingPartitions(InsertIntoHadoopFsRelationCommand.scala:235)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:128)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:109)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:169)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:95)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:779)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:116)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:860)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:390)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:363)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.csv(DataFrameWriter.scala:851)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)
