In [None]:
import sys

# Interactive-run shim: append the job parameters to sys.argv as
# "--name", "value" pairs so the later cells can read them back through
# getResolvedOptions / get_glue_env_var.
sys.argv.extend([
    '--JOB_NAME', 'address-cleaning',
    '--source_catalog_database', 'housing-repairs-raw-zone',
    '--source_catalog_table', 'housing_repairs_fire_alarmaov',
    '--cleaned_repairs_s3_bucket_target',
    's3://dataplatform-stg-refined-zone/housing-repairs/repairs-electrical-mechanical-fire/fire-alarmaov/cleaned',
])

In [None]:
import re
import sys

from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
from pyspark.sql import types as t
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, trim, when, max, trim
import pyspark.sql.functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import StringType, TimestampType


def get_glue_env_var(key, default="none"):
    """Look up an optional job argument on the command line.

    Args:
        key: Argument name without the leading ``--``.
        default: Value returned when ``--<key>`` is not present in
            ``sys.argv``.

    Returns:
        The value resolved by ``getResolvedOptions`` for ``key`` when the
        flag is present, otherwise ``default``.
    """
    # Guard first: only ask getResolvedOptions for a key that was actually passed.
    if f'--{key}' not in sys.argv:
        return default
    return getResolvedOptions(sys.argv, [key])[key]

def getLatestPartitions(dfa):
    """Keep only the rows that belong to the most recent import partition.

    The frame is narrowed one partition column at a time: first to the
    maximum ``import_year``, then to the maximum ``import_month`` among the
    remaining rows, then to the maximum ``import_day`` among those.

    Args:
        dfa: Spark DataFrame with ``import_year``, ``import_month`` and
            ``import_day`` columns.

    Returns:
        The filtered DataFrame.
    """
    # NOTE(review): `max` here is the Spark aggregate imported at the top of
    # the file, not the builtin. If the partition columns are strings the
    # ordering is lexicographic - presumably values are zero-padded; confirm.
    for partition_column in ('import_year', 'import_month', 'import_day'):
        # Each maximum is computed on the already-narrowed frame.
        latest_value = dfa.select(max(partition_column)).first()[0]
        dfa = dfa.where(col(partition_column) == latest_value)
    return dfa

In [None]:
# --- Job parameters -------------------------------------------------------
# JOB_NAME is required; the remaining parameters fall back to '' when absent.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
source_catalog_database = get_glue_env_var('source_catalog_database', '')
source_catalog_table = get_glue_env_var('source_catalog_table', '')
cleaned_repairs_s3_bucket_target = get_glue_env_var('cleaned_repairs_s3_bucket_target', '')

# --- Spark / Glue session -------------------------------------------------
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()
job = Job(glueContext)
job.init(args['JOB_NAME'], args)

# --- Source data ----------------------------------------------------------
logger.info('Fetch Source Data')
source_data = glueContext.create_dynamic_frame.from_catalog(
    name_space=source_catalog_database,
    table_name=source_catalog_table,
)

# Convert to a Spark DataFrame and keep only the latest import partition.
df = getLatestPartitions(source_data.toDF())

In [None]:
# Drop rows whose address is the literal string 'nan'.
# NOTE(review): presumably 'nan' is how empty rows arrive from the upstream
# ingestion - confirm. Rows with a null address are removed as well, because
# the comparison evaluates to null rather than true.
df2 = df.where(df['address'] != 'nan')

In [None]:
# Preview the filtered rows in vertical layout (one field per line) as a
# quick visual check that the 'nan' rows are gone.
df2.show(vertical=True)

In [None]:
logger.info('convert date columns to datetime / date field types')

# Parse the raw 'date' column in place using the day.month.two-digit-year pattern.
df2 = df2.withColumn('date', F.to_timestamp(F.col('date'), format='dd.MM.yy'))

# Preview the result to eyeball the parsed timestamps.
df2.show(vertical=True)

In [None]:
# Project the frame down to the columns carried forward into the cleaned
# dataset: the repair fields first, then the import/partition metadata.
df2 = df2.select(
    'address',
    'description',
    'date',
    'temp_order_number',
    'priority_code',
    'sor',
    'cost',
    'subjective',
    'contractor_s_own_ref_no',
    'contractor_job_status_complete_or_in_progress',
    'date_completed',
    'new_uhw_number',
    'requested_by',
    'import_datetime',
    'import_timestamp',
    'import_year',
    'import_month',
    'import_day',
    'import_date',
)

In [None]:
# Tag every row with the repairs sheet it came from, then rename the source
# columns to their cleaned names. 'sor', 'new_uhw_number' and the import_*
# columns keep their original names.
df2 = (
    df2
    .withColumn('data_source', F.lit('ElecMechFire - Fire Alarm AOV'))
    .withColumnRenamed('address', 'property_address')
    .withColumnRenamed('description', 'description_of_work')
    .withColumnRenamed('date', 'datetime_raised')
    .withColumnRenamed('temp_order_number', 'temp_order_number_full')
    .withColumnRenamed('priority_code', 'work_priority_description')
    .withColumnRenamed('cost', 'order_value')
    .withColumnRenamed('subjective', 'budget_code')
    .withColumnRenamed('contractor_s_own_ref_no', 'contractor_ref')
    .withColumnRenamed('contractor_job_status_complete_or_in_progress', 'order_status')
    .withColumnRenamed('date_completed', 'completed_date')
    .withColumnRenamed('requested_by', 'operative')
)

In [None]:
# Strip leading/trailing whitespace from 'work_priority_description' so the
# exact-match mapping below can match the known priority labels.
df2 = df2.withColumn(
    'work_priority_description',
    F.trim(F.col('work_priority_description')),
)

# Derive a numeric priority code from the description; any description that
# is not one of the four known labels gets a null code.
df2 = df2.withColumn(
    "work_priority_priority_code",
    F.when(F.col('work_priority_description') == "Immediate", 1)
    .when(F.col('work_priority_description') == "Emergency", 2)
    .when(F.col('work_priority_description') == "Urgent", 3)
    .when(F.col('work_priority_description') == "Normal", 4)
    .otherwise(None),
)

In [None]:
# Preview the renamed and mapped rows in vertical layout (one field per line).
df2.show(vertical=True)

In [None]:
# Print the schema of the cleaned frame so column names and types can be
# checked before the frame is written out.
df2.printSchema()

In [None]:
# Wrap the cleaned Spark DataFrame in a Glue DynamicFrame for the Glue writer.
cleanedDataframe = DynamicFrame.fromDF(df2, glueContext, "cleanedDataframe")

# Write parquet to the target S3 path, partitioned by the import date columns.
parquetData = glueContext.write_dynamic_frame.from_options(
    frame=cleanedDataframe,
    connection_type="s3",
    connection_options={
        "path": cleaned_repairs_s3_bucket_target,
        "partitionKeys": ["import_year", "import_month", "import_day", "import_date"],
    },
    format="parquet",
    transformation_ctx="parquetData",
)

# Signal to Glue that the job run has finished.
job.commit()