In [None]:
import sys
# Inject the job arguments into sys.argv as flag/value pairs; later cells read
# them back with getResolvedOptions / get_glue_env_var.
sys.argv.extend([
    '--JOB_NAME', 'cleaning-repairs-herts-heritage',
    '--source_catalog_database', 'housing-repairs-raw-zone',
    '--source_catalog_table', 'housing_repairs_repairs_herts_heritage',
    '--cleaned_repairs_s3_bucket_target',
    's3://dataplatform-stg-refined-zone/housing-repairs/repairs-herts-heritage/cleaned',
])

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql.window import Window
from pyspark.sql.functions import col, max
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from awsglue.dynamicframe import DynamicFrame
#from helpers import get_glue_env_var, PARTITION_KEYS
#from repairs_cleaning_helpers import udf_map_repair_priority, clean_column_names

def getLatestPartitions(dfa):
    """Keep only the rows belonging to the most recent import partition.

    The frame is narrowed one partition column at a time: first to the largest
    ``import_year``, then to the largest ``import_month`` among the remaining
    rows, then to the largest ``import_day`` among what is left.

    ``max`` and ``col`` are the ``pyspark.sql.functions`` versions imported at
    the top of this cell, not the Python builtins.

    Args:
        dfa: DataFrame with ``import_year``, ``import_month`` and ``import_day``
            columns.

    Returns:
        The filtered DataFrame.
    """
    for partition_column in ('import_year', 'import_month', 'import_day'):
        # Each lookup runs against the already-narrowed frame, so the month and
        # day maxima are taken within the latest year / month respectively.
        latest_value = dfa.select(max(partition_column)).first()[0]
        dfa = dfa.where(col(partition_column) == latest_value)
    return dfa

In [None]:
import sys
from awsglue.utils import getResolvedOptions
import datetime
import boto3
from pyspark.sql import functions as f

# Column names passed as "partitionKeys" when the cleaned data is written out.
PARTITION_KEYS = [
    'import_year',
    'import_month',
    'import_day',
    'import_date',
]


def get_glue_env_var(key, default="none"):
    """Return the value of an optional job argument, or ``default`` if absent.

    Args:
        key: Argument name without the leading ``--``.
        default: Value returned when ``--<key>`` does not appear in ``sys.argv``.

    Returns:
        The value resolved by ``getResolvedOptions`` when the flag is present,
        otherwise ``default``.
    """
    flag = f'--{key}'
    # Only hand the key to getResolvedOptions when the flag was actually passed.
    if flag not in sys.argv:
        return default
    resolved = getResolvedOptions(sys.argv, [key])
    return resolved[key]


def get_secret(logger, secret_name, region_name):
    """Fetch a secret value from AWS Secrets Manager.

    Args:
        logger: Not used by this function.
        secret_name: Identifier passed as ``SecretId``.
        region_name: AWS region used to create the Secrets Manager client.

    Returns:
        The ``SecretString`` field of the response when present; otherwise the
        ``SecretBinary`` field decoded as ASCII.
    """
    secrets_client = boto3.session.Session().client(
        service_name='secretsmanager',
        region_name=region_name
    )
    response = secrets_client.get_secret_value(SecretId=secret_name)

    if 'SecretString' not in response:
        return response['SecretBinary'].decode('ascii')
    return response['SecretString']

def add_timestamp_column(data_frame):
    """Add an ``import_timestamp`` column holding the current time.

    The value is ``datetime.now().timestamp()`` rendered as a string and is the
    same literal for every row.

    Args:
        data_frame: DataFrame to extend.

    Returns:
        A DataFrame with the extra ``import_timestamp`` column.
    """
    epoch_seconds = str(datetime.datetime.now().timestamp())
    return data_frame.withColumn('import_timestamp', f.lit(epoch_seconds))

def add_import_time_columns(data_frame):
    """Stamp a DataFrame with the columns describing when it was imported.

    Columns added, in this order:
        import_datetime  - ``current_timestamp()`` as evaluated by Spark
        import_timestamp - ``datetime.now().timestamp()`` as a string
        import_year      - year as a string
        import_month     - month as a two-character zero-padded string
        import_day       - day as a two-character zero-padded string
        import_date      - year, month and day strings concatenated

    Args:
        data_frame: DataFrame to extend.

    Returns:
        The DataFrame with the six import columns added.
    """
    now = datetime.datetime.now()
    year_text = str(now.year)
    month_text = f"{now.month:02d}"
    day_text = f"{now.day:02d}"

    import_columns = (
        ('import_datetime', f.current_timestamp()),
        ('import_timestamp', f.lit(str(now.timestamp()))),
        ('import_year', f.lit(year_text)),
        ('import_month', f.lit(month_text)),
        ('import_day', f.lit(day_text)),
        ('import_date', f.lit(year_text + month_text + day_text)),
    )
    for column_name, column_value in import_columns:
        data_frame = data_frame.withColumn(column_name, column_value)
    return data_frame

def convert_pandas_df_to_spark_dynamic_df(sql_context, panadas_df):
    """Turn a pandas DataFrame into a single-partition Spark DataFrame with
    import-time columns.

    Despite the name, no DynamicFrame conversion happens here: the result is
    whatever ``add_import_time_columns`` returns for the Spark DataFrame.

    Args:
        sql_context: Object providing ``createDataFrame``.
        panadas_df: The pandas DataFrame to convert.

    Returns:
        The Spark DataFrame, coalesced to one partition, with the import-time
        columns added.
    """
    single_partition_df = sql_context.createDataFrame(panadas_df).coalesce(1)
    return add_import_time_columns(single_partition_df)

def get_s3_subfolders(s3_client, bucket_name, prefix):
    """Collect the common prefixes ("subfolders") found under an S3 prefix.

    Calls ``list_objects_v2`` with ``Delimiter='/'`` repeatedly, following the
    continuation token, until a response reports ``IsTruncated`` as false.

    Args:
        s3_client: Client exposing ``list_objects_v2``.
        bucket_name: Bucket to list.
        prefix: Key prefix to list under.

    Returns:
        A set of the ``Prefix`` values of every ``CommonPrefixes`` entry seen.
    """
    subfolder_prefixes = set()
    paging_kwargs = {}
    while True:
        page = s3_client.list_objects_v2(
            Bucket=bucket_name,
            Delimiter='/',
            Prefix=prefix,
            **paging_kwargs
        )
        for common_prefix in page.get('CommonPrefixes', []):
            subfolder_prefixes.add(common_prefix['Prefix'])

        if not page['IsTruncated']:
            break
        # More results remain: resume from where this page stopped.
        paging_kwargs = {'ContinuationToken': page.get('NextContinuationToken')}

    return subfolder_prefixes


In [None]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
import re

def map_repair_priority(code):
    """Translate a priority description into its numeric rank.

    Args:
        code: Priority description; matching is exact and case-sensitive.

    Returns:
        1 for 'Immediate', 2 for 'Emergency', 3 for 'Urgent', 4 for 'Normal',
        and None for anything else.
    """
    ranked_descriptions = (
        ('Immediate', 1),
        ('Emergency', 2),
        ('Urgent', 3),
        ('Normal', 4),
    )
    # Compared with == rather than looked up in a dict so that any input,
    # hashable or not, simply falls through to None.
    for description, rank in ranked_descriptions:
        if code == description:
            return rank
    return None


# Wrap map_repair_priority as a Spark UDF. The declared return type is
# StringType even though the Python function returns ints (or None).
# NOTE(review): confirm that a string-typed priority column is what is wanted.
udf_map_repair_priority = F.udf(map_repair_priority, returnType=StringType())

def _clean_column_name(name):
    """Return ``name`` lowercased, with each run of characters other than ASCII
    letters, digits and ``$`` collapsed to one ``_``, and no trailing ``_``."""
    normalised = re.sub("[^0-9a-zA-Z$]+", "_", name.lower())
    # Strip the trailing underscore *after* normalising: a name ending in
    # punctuation (e.g. "(time)") only gains its trailing "_" in the step above.
    # Runs were collapsed, so at most one trailing "_" can be present here.
    return normalised.rstrip("_")


def clean_column_names(df):
    """Return ``df`` with every column renamed to a normalised name.

    Each name is lowercased, every run of characters other than ASCII letters,
    digits and ``$`` (underscores included) becomes a single ``_``, and a
    trailing ``_`` is removed.

    Columns are renamed by position with ``toDF`` rather than selected by name,
    so original names that contain dots or other characters with a meaning in
    column expressions are handled too.

    Args:
        df: DataFrame whose columns should be renamed.

    Returns:
        A DataFrame with the same columns, in the same order, under the
        normalised names.
    """
    return df.toDF(*[_clean_column_name(name) for name in df.columns])

In [None]:
# JOB_NAME is resolved directly; it is passed to job.init in a later cell.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# The remaining arguments are optional and fall back to an empty string.
source_catalog_database, source_catalog_table, cleaned_repairs_s3_bucket_target = (
    get_glue_env_var(option_name, '')
    for option_name in (
        'source_catalog_database',
        'source_catalog_table',
        'cleaned_repairs_s3_bucket_target',
    )
)

In [None]:
# Set up the Spark and Glue handles used by the rest of the notebook.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
# Initialise the Glue job under the JOB_NAME resolved earlier; the matching
# job.commit() is called after the cleaned data has been written.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

In [None]:
# Read the source table from the Glue catalog as a DynamicFrame.
source_data = glueContext.create_dynamic_frame.from_catalog(
    name_space=source_catalog_database,
    table_name=source_catalog_table,
)
# Work on a Spark DataFrame restricted to the most recent import partition.
df = getLatestPartitions(source_data.toDF())

In [None]:
# Inspect the raw column names and types before they are cleaned and renamed.
df.printSchema()

In [None]:
df2 = clean_column_names(df)

# Parse the day-first timestamp strings into a timestamp column and tag every
# row with its data source.
df2 = df2.withColumn('time_stamp', F.to_timestamp("time_stamp", "dd/MM/yyyy HH:mm:ss"))
df2 = df2.withColumn('data_source', F.lit('Heritage'))

# Rename from the *cleaned* column names. clean_column_names lowercases every
# name and collapses each run of non-alphanumeric characters (underscores
# included) into a single underscore, so keys such as 'STATUS' or
# 'temporary_order_number__time_' cannot exist at this point, and
# withColumnRenamed silently does nothing when the source column is missing.
column_renames = [
    ('time_stamp', 'timestamp'),
    ('notes_and_information', 'notes'),
    ('contact_information_for_access', 'phone_1'),
    ('priority_code', 'work_priority_description'),
    ('email_address', 'email_staff'),
    ('temporary_order_date', 'temp_order_number_date'),
    # The order-time column cleans to one of these two spellings depending on
    # whether the raw header ends in '_' or in other punctuation such as ')'.
    # NOTE(review): confirm against the printSchema output and drop the unused one.
    ('temporary_order_number_time', 'temp_order_number_time'),
    ('temporary_order_number_time_', 'temp_order_number_time'),
    ('status', 'order_status'),
    ('status_notes', 'order_status_notes'),
]
for cleaned_name, target_name in column_renames:
    df2 = df2.withColumnRenamed(cleaned_name, target_name)

In [None]:
# Preview the cleaned rows.
# NOTE(review): the frame includes phone_1 and email_staff columns, so clear this
# cell's output before sharing or committing the notebook.
df2.show()

In [None]:
# Convert back to a DynamicFrame so the Glue writer can be used.
cleanedDataframe = DynamicFrame.fromDF(df2, glueContext, "cleanedDataframe")

# Write parquet to the target path, partitioned by the import columns.
s3_sink_options = {
    "path": cleaned_repairs_s3_bucket_target,
    "partitionKeys": PARTITION_KEYS,
}
parquetData = glueContext.write_dynamic_frame.from_options(
    frame=cleanedDataframe,
    connection_type="s3",
    connection_options=s3_sink_options,
    format="parquet",
    transformation_ctx="parquetData",
)
job.commit()