# In [None]:
import sys

# Seed sys.argv with the job parameters as flag/value pairs so that the
# `--<key>` lookups against sys.argv later in this file can find them.
sys.argv.extend([
    '--source_catalog_table', 'housing_repairs_stannah',
    '--source_catalog_database', 'dataplatform-stg-landing-zone-database',
    '--s3_bucket_target', 's3://dataplatform-stg-raw-zone/housing/repairs-stannah',
])

# In [None]:
import sys
from awsglue.transforms import *
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.utils import getResolvedOptions
from pyspark.sql.functions import rank, col, trim, when, max
import pyspark.sql.functions as F

def get_glue_env_var(key, default="none"):
    """Return the value of the ``--<key>`` job argument from ``sys.argv``.

    Args:
        key: Argument name without the leading ``--``.
        default: Value returned when ``--<key>`` does not appear in
            ``sys.argv``.

    Returns:
        The value resolved by ``getResolvedOptions`` when the flag is
        present, otherwise ``default``.
    """
    flag = f'--{key}'
    if flag not in sys.argv:
        return default
    resolved = getResolvedOptions(sys.argv, [key])
    return resolved[key]
    
def getLatestPartitions(dfa):
    """Keep only the rows belonging to the most recent import partition.

    The frame is narrowed one partition column at a time: first to the
    maximum ``import_year``, then to the maximum ``import_month`` among the
    remaining rows, then to the maximum ``import_day`` among those.

    Args:
        dfa: DataFrame with ``import_year``, ``import_month`` and
            ``import_day`` columns.

    Returns:
        The filtered DataFrame.
    """
    # Order matters: each maximum is computed on the already-narrowed frame.
    for partition_column in ('import_year', 'import_month', 'import_day'):
        latest_value = dfa.select(max(partition_column)).first()[0]
        dfa = dfa.where(col(partition_column) == latest_value)
    return dfa

# Glue/Spark runtime handles: reuse the active SparkContext if one exists.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)
job = Job(glueContext)

# Job parameters, read from sys.argv; each falls back to an empty string
# when its flag is absent.
source_catalog_database = get_glue_env_var('source_catalog_database', '')
source_catalog_table = get_glue_env_var('source_catalog_table', '')
target_bucket = get_glue_env_var('s3_bucket_target', '')

# In [None]:
# Load the source table from the catalog as a DynamicFrame.
data_source = glueContext.create_dynamic_frame.from_catalog(
    name_space=source_catalog_database,
    table_name=source_catalog_table,
)

# Work on a Spark DataFrame, restricted to the latest import partition.
df = getLatestPartitions(data_source.toDF())

# Keep the incoming import_date under the name import_datetime, then rebuild
# import_date by concatenating the year, month and day partition columns.
df = (
    df.withColumnRenamed("import_date", "import_datetime")
    .withColumn("import_date", F.concat('import_year', 'import_month', 'import_day'))
)
   
# Preview the day partition alongside the renamed import_datetime column.
tmp = df.select('import_day', 'import_datetime')
tmp.show()

# Convert back to a DynamicFrame for the Glue writer.
data_with_renamed_headers = DynamicFrame.fromDF(
    df,
    glueContext,
    "data_with_renamed_headers",
)

# Write the frame as Parquet to the target path, partitioned by the
# import year/month/day columns.
parquet_data = glueContext.write_dynamic_frame.from_options(
    frame=data_with_renamed_headers,
    connection_type="s3",
    connection_options={
        "path": target_bucket,
        "partitionKeys": ["import_year", "import_month", "import_day"],
    },
    format="parquet",
    transformation_ctx="data_with_renamed_headers",
)

job.commit()