In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, datediff
from datetime import date
import psycopg2 
import os

In [8]:
def ncc_difference(stgSubsidyReportFact_df):
    """Append the derived ``recieve_days`` and ``spend_diff`` columns.

    ``recieve_days`` is ``datediff(recieve_date, start_subsidize)`` and
    ``spend_diff`` is ``actual_spending - subsidy_money``. The column names
    (including the ``recieve`` spelling) are kept exactly as the table uses
    them.

    Args:
        stgSubsidyReportFact_df: DataFrame exposing ``recieve_date``,
            ``start_subsidize``, ``actual_spending`` and ``subsidy_money``.

    Returns:
        A new DataFrame with both derived columns added.
    """
    days_until_received = datediff(col("recieve_date"), col("start_subsidize"))
    spending_gap = col("actual_spending") - col("subsidy_money")
    return (
        stgSubsidyReportFact_df
        .withColumn("recieve_days", days_until_received)
        .withColumn("spend_diff", spending_gap)
    )

In [5]:
def grade_difference(povertyfact_df):
    """Add ``b1_diff`` / ``b2_diff``: change in each grade versus the previous row.

    Rows are partitioned by ``family_code`` and ordered by ``family_code``,
    ``year``, ``b1_grade``, ``b2_grade``; the previous row's grade is fetched
    with ``lag`` and cast to integer before being subtracted from the current
    grade. The first row of every family has no predecessor, so its diff
    columns are null.

    Args:
        povertyfact_df: DataFrame exposing ``family_code``, ``year``,
            ``b1_grade`` and ``b2_grade``.

    Returns:
        A new DataFrame with the ``b1_diff`` and ``b2_diff`` columns added.
    """
    # One window shared by both grade columns.
    per_family = Window.partitionBy("family_code").orderBy(
        ["family_code", "year", "b1_grade", "b2_grade"]
    )

    result = povertyfact_df
    for grade_col, diff_col in (("b1_grade", "b1_diff"), ("b2_grade", "b2_diff")):
        # Previous row's grade inside the family window, as an integer.
        previous_grade = lag(grade_col).over(per_family).cast("integer")
        result = result.withColumn(diff_col, col(grade_col) - previous_grade)
    return result

In [28]:
def count_member(povertyfact_df, member_df):
    """Set ``member_num`` on each poverty-fact row to its family's member count.

    Members are counted per ``family_id`` and left-joined onto the fact rows.
    Families with no rows in ``member_df`` get ``member_num = -1``.

    Only ``member_num`` is null-filled. An unrestricted ``na.fill(-1)`` would
    also overwrite nulls in every other numeric column of the fact table
    (for example the null ``b1_diff`` / ``b2_diff`` that ``grade_difference``
    yields for a family's first row), turning "missing" into a bogus ``-1``.

    Args:
        povertyfact_df: Fact DataFrame containing a ``family_id`` column.
        member_df: Member DataFrame containing a ``family_id`` column.

    Returns:
        ``povertyfact_df`` with ``member_num`` added (or replaced) and the
        temporary ``count`` column removed.
    """
    # Number of member rows per family.
    member_counts = member_df.groupBy("family_id").count()

    # Left join keeps fact rows whose family has no members (count is null).
    joined_df = povertyfact_df.join(member_counts, on="family_id", how="left")

    with_member_num = joined_df.withColumn("member_num", joined_df["count"]).drop("count")

    # Restrict the sentinel to member_num so unrelated nulls stay null.
    return with_member_num.na.fill(value=-1, subset=["member_num"])

In [4]:
def find_age_member(memberSurveyFact_df, reference_year=None):
    """Add an ``age`` column computed as ``reference_year - year_of_birth``.

    Args:
        memberSurveyFact_df: DataFrame containing a ``year_of_birth`` column.
        reference_year: Year to measure age against. Defaults to the current
            calendar year (the original behaviour). Pass an explicit year to
            get results that do not change with the date the notebook is run.

    Returns:
        A new DataFrame with the ``age`` column added.
    """
    if reference_year is None:
        reference_year = date.today().year
    return memberSurveyFact_df.withColumn('age', reference_year - col('year_of_birth'))

In [10]:
import json

# Connection settings (URL_BASE_LOCAL, PORT, DRIVER, USER, PASSWORD) live in
# config.json. JSON is UTF-8, so do not depend on the platform default encoding.
with open("../../config.json", "r", encoding="utf-8") as file:
    config = json.load(file)

# spark.jars.packages makes Spark fetch the PostgreSQL JDBC driver at start-up.
spark = (
    SparkSession.builder.appName("Test connect to Postgresql")
    .config('spark.jars.packages', 'org.postgresql:postgresql:42.7.3')
    .getOrCreate()
)

# Read the staging fact table over JDBC.
stgSubsidyReportFact_df = (
    spark.read.format("jdbc")
    .option("url", f"{config['URL_BASE_LOCAL']}:{config['PORT']}/LdtbxhStage")
    .option("driver", f"{config['DRIVER']}")
    .option("dbtable", 'public."stgSubsidyReportFact"')
    .option("user", f"{config['USER']}")
    .option("password", f"{config['PASSWORD']}")
    .load()
)

finalfact_df = ncc_difference(stgSubsidyReportFact_df)

print("====================")
# show() prints the table itself and returns None; wrapping it in print()
# emitted a stray "None" line after the table.
finalfact_df.show(5)
print("====================")
        

# Write the derived columns back to the staging table.
#
# Values are sent as bound parameters rather than interpolated into the SQL
# text: a null recieve_days / spend_diff is stored as NULL (an f-string would
# render it as the identifier `None` and fail), and quotes inside a code can
# no longer break or alter the statement.
update_sql = """UPDATE public."stgSubsidyReportFact"
                SET recieve_days = %s,
                    spend_diff = %s
                WHERE profile_code = %s
                  AND subsidy_code = %s
                  AND year = %s"""

# collect() brings every row to the driver, as the original loop did.
update_params = [
    (
        row["recieve_days"],
        row["spend_diff"],
        row["profile_code"],
        row["subsidy_code"],
        row["year"],
    )
    for row in finalfact_df.collect()
]

# Credentials and port come from the same config the JDBC read uses, so no
# password is kept in the notebook.
# NOTE(review): assumes config USER/PASSWORD/PORT point at the same server the
# old hard-coded localhost:5434 values did -- confirm against config.json.
conn = psycopg2.connect(
    database="LdtbxhStage",
    user=config["USER"],
    password=config["PASSWORD"],
    host="localhost",
    port=config["PORT"],
)
try:
    # `with conn` commits on success / rolls back on error but does NOT close
    # the connection, hence the explicit close() below.
    with conn:
        with conn.cursor() as cur:
            cur.executemany(update_sql, update_params)
finally:
    conn.close()

+-------------+-------------+------------+----------+------------+------+------------+----+----------+--------------------+-------------+--------+------------+------------+---------------+----------+---------------+
|province_code|district_code|profile_code|  ncc_code|   full_name|ethnic|subsidy_code|year|spend_type|        subsidy_name|subsidy_money|submoney|recieve_days|recieve_date|start_subsidize|spend_diff|actual_spending|
+-------------+-------------+------------+----------+------------+------+------------+----+----------+--------------------+-------------+--------+------------+------------+---------------+----------+---------------+
|        67   |        664  |  LS001     |LS        |Nguyễn Văn A|  Bana|  LS0       |2022|   một lần|Trợ cấp một lần k...|       4.11E7|    NULL|          16|  2022-08-20|     2022-08-04|       0.0|         4.11E7|
|        67   |        664  |  LS001     |LS        |Nguyễn Văn A|  Bana|  LS1       |2022|   một lần|Hỗ trợ chi phí bá...|    2055000.0