In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag
from datetime import date
import psycopg2 
import os

In [5]:
def grade_difference(povertyfact_df):
    """Add ``b1_diff`` and ``b2_diff`` columns to the poverty fact DataFrame.

    Each ``*_diff`` column is the row's grade minus the grade of the preceding
    row in the same ``family_code`` partition. Inside a partition rows are
    ordered by ``family_code``, ``year``, ``b1_grade`` and ``b2_grade``.

    Args:
        povertyfact_df: DataFrame with at least the columns ``family_code``,
            ``year``, ``b1_grade`` and ``b2_grade``.

    Returns:
        A new DataFrame with the two difference columns added. A row that has
        no predecessor in its partition gets a null difference, because
        ``lag`` yields null there.
    """
    per_family = Window.partitionBy("family_code").orderBy(
        ["family_code", "year", "b1_grade", "b2_grade"]
    )

    result = povertyfact_df
    for prefix in ("b1", "b2"):
        grade_name = f"{prefix}_grade"
        # Grade of the preceding row in the window, cast to integer before subtracting.
        previous_grade = lag(grade_name).over(per_family).cast("integer")
        result = result.withColumn(f"{prefix}_diff", col(grade_name) - previous_grade)
    return result

In [28]:
def count_member(povertyfact_df, member_df):
    """Set ``member_num`` on the fact rows to the number of members per family.

    Args:
        povertyfact_df: fact DataFrame carrying a ``family_id`` column.
        member_df: DataFrame with one row per family member and a
            ``family_id`` column.

    Returns:
        ``povertyfact_df`` left-joined with the per-family member count,
        with that count stored in ``member_num`` and the helper ``count``
        column removed. Nulls are replaced by -1.

    NOTE(review): ``na.fill(value=-1)`` is called without ``subset``, so the
    -1 sentinel is applied to every column the fill supports, not only to
    ``member_num`` (e.g. null ``b1_diff``/``b2_diff`` values also become -1).
    """
    # Rows of member_df grouped by family -> one "count" per family_id.
    members_per_family = member_df.groupBy("family_id").count()

    # Left join keeps fact rows whose family has no entry in member_df.
    with_counts = povertyfact_df.join(members_per_family, on="family_id", how="left")

    return (
        with_counts
        .withColumn("member_num", with_counts["count"])
        .na.fill(value=-1)
        .drop("count")
    )

In [4]:
def find_age_member(memberSurveyFact_df):
    """Add an ``age`` column computed as current year minus ``year_of_birth``.

    The current year is taken from ``date.today()`` in the Python process at
    the moment this function is called; only calendar years are compared.

    Args:
        memberSurveyFact_df: DataFrame with a ``year_of_birth`` column.

    Returns:
        A new DataFrame with the ``age`` column added.
    """
    current_year = date.today().year
    age_expr = current_year - col('year_of_birth')
    return memberSurveyFact_df.withColumn('age', age_expr)

In [30]:
import json

# --- Configuration ------------------------------------------------------------
with open("../../config.json", "r") as file:
    config = json.load(file)

# Credentials are read from the environment (libpq-style variable names) so the
# password is not stored in the notebook source.
DB_USER = os.environ.get("PGUSER", "postgres")
DB_PASSWORD = os.environ.get("PGPASSWORD")
if DB_PASSWORD is None:
    raise RuntimeError(
        "Set the PGPASSWORD environment variable before running this cell."
    )

STAGE_DB = "LdtbxhStage"
SOURCE_DB = "hongheovna"

# --- Spark session and source tables -------------------------------------------
spark = SparkSession.builder.appName("Test connect to Postgresql") \
    .config('spark.jars.packages', 'org.postgresql:postgresql:42.7.3') \
    .getOrCreate()


def read_postgres_table(database, table):
    """Load one PostgreSQL table into a Spark DataFrame over JDBC.

    Args:
        database: name of the database, appended to the JDBC URL built from
            ``config['URL_BASE_LOCAL']`` and ``config['PORT']``.
        table: value passed as the JDBC ``dbtable`` option.

    Returns:
        The DataFrame produced by the JDBC reader.
    """
    return (
        spark.read.format("jdbc")
        .option("url", f"{config['URL_BASE_LOCAL']}:{config['PORT']}/{database}")
        .option("driver", "org.postgresql.Driver")
        .option("dbtable", table)
        .option("user", DB_USER)
        .option("password", DB_PASSWORD)
        .load()
    )


povertyfact_df = read_postgres_table(STAGE_DB, 'public."stgPovertyStatusFact"')
member_df = read_postgres_table(SOURCE_DB, 'public.family_member_info')

# --- Transform -------------------------------------------------------------------
temp_df = grade_difference(povertyfact_df)
finalfact_df = count_member(temp_df, member_df)

print("====================")
# show() prints the rows itself and returns None, so it must not be wrapped in print().
finalfact_df.show(5)
print("====================")

# --- Write the derived columns back ------------------------------------------------
# Values are passed as query parameters so psycopg2 handles quoting and maps
# None to NULL, rather than being interpolated into the SQL text.
# NOTE(review): rows are matched on family_id alone; if one family_id can occur
# in several rows (e.g. one per year), all of them receive the values of the
# last statement executed for that id -- confirm family_id is unique per row.
update_sql = """UPDATE public."stgPovertyStatusFact"
                SET member_num = %s,
                    b1_diff = %s,
                    b2_diff = %s
                WHERE family_id = %s"""

# Only the four columns used by the UPDATE are brought back to the driver.
update_params = [
    (row['member_num'], row['b1_diff'], row['b2_diff'], row['family_id'])
    for row in finalfact_df
    .select("family_id", "member_num", "b1_diff", "b2_diff")
    .collect()
]

# NOTE(review): host and port are fixed here while the JDBC URL above is built
# from config.json -- confirm both point at the same server.
conn = psycopg2.connect(
    database=STAGE_DB,
    user=DB_USER,
    password=DB_PASSWORD,
    host="localhost",
    port="5434"
)
try:
    # `with conn` commits on success and rolls back on error; it does not
    # close the connection, hence the explicit close() below.
    with conn:
        with conn.cursor() as cur:
            cur.executemany(update_sql, update_params)
finally:
    conn.close()

+--------------------+----+-------------+-------------+-----------+----------+--------------------+--------------------+--------------------+----------+-------+--------+--------+-------+-------+------------+
|           family_id|year|province_name|district_name|family_code|owner_name|        hard_reasons|        get_policies|       need_policies|member_num|a_grade|b1_grade|b2_grade|b1_diff|b2_diff|final_result|
+--------------------+----+-------------+-------------+-----------+----------+--------------------+--------------------+--------------------+----------+-------+--------+--------+-------+-------+------------+
|966a5c75-d6f5-48c...|2022|     Đắk Nông|       Cư Jút|       7009|      NULL|[không có đất sản...|[hỗ trợ y tế, hỗ ...|[hỗ trợ nước sạch...|        -1|   true|      45|      65|     -1|     -1|    hộ nghèo|
|f4773df9-32bc-417...|2023|     Đắk Nông|       Cư Jút|       7003|  Độ A Đạt|[không có đất sản...|[hỗ trợ y tế, hỗ ...|[hỗ trợ nước sạch...|         1|   true|      30