In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lag
from pyspark.sql.window import Window

In [35]:
# Spark session; the PostgreSQL JDBC driver is requested through
# spark.jars.packages when the session is built.
spark = (
    SparkSession.builder.appName("Test connect to Postgresql")
    .config('spark.jars.packages', 'org.postgresql:postgresql:42.7.3')
    .getOrCreate()
)

# Connection settings shared by both source databases.
PG_BASE_URL = "jdbc:postgresql://localhost:5434"
# SECURITY: credentials should not live in the notebook. Set POSTGRES_USER /
# POSTGRES_PASSWORD in the environment; the fallbacks below only keep the
# notebook runnable as before and should be removed once the variables are set.
PG_USER = os.environ.get("POSTGRES_USER", "postgres")
PG_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "nhanbui")


def read_pg_table(database, table):
    """Load one PostgreSQL table into a Spark DataFrame over JDBC.

    Args:
        database: Database name appended to ``PG_BASE_URL``.
        table: Value for the JDBC ``dbtable`` option (schema-qualified,
            quoted where the table name is case-sensitive).

    Returns:
        The DataFrame returned by the JDBC reader's ``load()``.
    """
    return (
        spark.read.format("jdbc")
        .option("url", f"{PG_BASE_URL}/{database}")
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", table)
        .option("user", PG_USER)
        .option("password", PG_PASSWORD)
        .load()
    )


povertyfact_df = read_pg_table("LdtbxhStage", 'public."stgPovertyStatusFact"')
member_df = read_pg_table("hongheovna", 'public.family_member_info')

# printSchema() writes the schema itself and returns None, so it must not be
# wrapped in print() (that emitted a stray "None" after each schema).
print("============================")
povertyfact_df.printSchema()
member_df.printSchema()
print("============================")

root
 |-- family_id: string (nullable = true)
 |-- year: short (nullable = true)
 |-- province_name: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- family_code: string (nullable = true)
 |-- owner_name: string (nullable = true)
 |-- hard_reasons: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- get_policies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- need_policies: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- member_num: short (nullable = true)
 |-- a_grade: boolean (nullable = true)
 |-- b1_grade: short (nullable = true)
 |-- b2_grade: short (nullable = true)
 |-- b1_diff: short (nullable = true)
 |-- b2_diff: short (nullable = true)
 |-- final_result: string (nullable = true)

None
root
 |-- member_id: string (nullable = true)
 |-- family_id: string (nullable = true)
 |-- full_name: string (nullable = true)
 |-- owner_relationship: string (nullable = true)
 |-- ye

In [31]:
def grade_difference(fact_df=None):
    """Compute b1/b2 grade changes against the previous row of each family.

    Rows are grouped by ``family_code`` and ordered by ``year`` (ties broken
    by ``b1_grade`` then ``b2_grade``). ``b1_diff`` / ``b2_diff`` are set to
    the current grade minus the grade of the preceding row in that ordering.
    The first row of each family has no predecessor, so ``lag`` yields null
    and both differences are null there.

    Args:
        fact_df: DataFrame with ``family_code``, ``year``, ``b1_grade`` and
            ``b2_grade`` columns. Defaults to the notebook-level
            ``povertyfact_df`` (looked up when the function is called).

    Returns:
        A new DataFrame with ``b1_diff`` and ``b2_diff`` added or replaced.
    """
    if fact_df is None:
        fact_df = povertyfact_df

    # family_code is the partition key and therefore constant inside each
    # window, so it is not repeated in the ordering.
    per_family = Window.partitionBy("family_code").orderBy("year", "b1_grade", "b2_grade")

    # Previous row's grades, cast to integer before subtracting.
    prev_b1 = lag("b1_grade").over(per_family).cast("integer")
    prev_b2 = lag("b2_grade").over(per_family).cast("integer")

    return (
        fact_df
        .withColumn("b1_diff", col("b1_grade") - prev_b1)
        .withColumn("b2_diff", col("b2_grade") - prev_b2)
    )

In [49]:
def count_member(fact_df=None, members_df=None):
    """Fill ``member_num`` with the number of member rows per ``family_id``.

    Args:
        fact_df: DataFrame keyed by ``family_id`` whose ``member_num`` column
            is to be set. Defaults to the notebook-level ``povertyfact_df``.
        members_df: DataFrame with one row per member and a ``family_id``
            column. Defaults to the notebook-level ``member_df``.

    Both defaults are looked up when the function is called.

    Returns:
        A new DataFrame: ``fact_df`` inner-joined to the per-family counts,
        with ``member_num`` set to the count and the helper ``count`` column
        dropped.
    """
    if fact_df is None:
        fact_df = povertyfact_df
    if members_df is None:
        members_df = member_df

    # One row per family_id with a "count" column holding its member rows.
    members_per_family = members_df.groupBy("family_id").count()

    # NOTE(review): the inner join drops fact rows whose family_id has no
    # member rows — confirm that is intended rather than a left join.
    joined = fact_df.join(members_per_family, on="family_id", how="inner")

    # NOTE(review): "count" is a long column, so member_num no longer has the
    # short type shown in the source schema — confirm downstream accepts that.
    return joined.withColumn("member_num", joined["count"]).drop("count")

DataFrame[family_id: string, year: smallint, province_name: string, district_name: string, family_code: string, owner_name: string, hard_reasons: array<string>, get_policies: array<string>, need_policies: array<string>, member_num: smallint, a_grade: boolean, b1_grade: smallint, b2_grade: smallint, b1_diff: smallint, b2_diff: smallint, final_result: string]