In [1]:
import sys
# Append the job arguments to sys.argv as flag/value pairs so that later cells
# can resolve them from sys.argv. Note that re-running this cell appends the
# same pairs again.
sys.argv.extend([
    '--JOB_NAME', 'address-cleaning',
    '--source_catalog_database', 'housing-repairs-raw-zone',
    '--source_catalog_table', 'housing_repairs_electrical_supplies',
    '--cleaned_repairs_s3_bucket_target',
    's3://dataplatform-stg-refined-zone/housing-repairs/repairs_electrical_mechanical_fire/electrical_supplies/cleaned',
])

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
1,,pyspark,idle,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.ml import Pipeline
from pyspark.ml.feature import RegexTokenizer, NGram, HashingTF, MinHashLSH
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, trim, when, max, trim
import pyspark.sql.functions as F
from pyspark.sql.types import StringType
from awsglue.dynamicframe import DynamicFrame

def get_glue_env_var(key, default="none"):
    """Return the value of the job argument ``key``, or ``default`` if absent.

    The argument is treated as present only when the literal flag
    ``--<key>`` appears in ``sys.argv``; in that case its value is resolved
    through ``getResolvedOptions``.

    :param key: argument name without the leading ``--``.
    :param default: value returned when the flag is not in ``sys.argv``.
    :return: the resolved argument value, or ``default``.
    """
    flag = f'--{key}'
    if flag not in sys.argv:
        return default
    return getResolvedOptions(sys.argv, [key])[key]

def getLatestPartitions(dfa):
    """Keep only the rows belonging to the most recent import partition.

    The frame is narrowed step by step: first to the maximum ``import_year``,
    then, within those rows, to the maximum ``import_month``, and finally to
    the maximum ``import_day``.

    ``max`` and ``col`` here are the ``pyspark.sql.functions`` versions
    imported at the top of the notebook, not the Python builtins.

    :param dfa: DataFrame carrying ``import_year``, ``import_month`` and
        ``import_day`` columns.
    :return: the filtered DataFrame.
    """
    for partition_column in ('import_year', 'import_month', 'import_day'):
        # The maximum is computed on the already-narrowed frame, so each level
        # is the latest value *within* the previously selected level.
        latest_value = dfa.select(max(partition_column)).first()[0]
        dfa = dfa.where(col(partition_column) == latest_value)
    return dfa

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Resolve JOB_NAME from the command-line arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# The remaining arguments fall back to an empty string when their flag is not
# present in sys.argv.
source_catalog_database, source_catalog_table, cleaned_repairs_s3_bucket_target = (
    get_glue_env_var(argument_name, '')
    for argument_name in (
        'source_catalog_database',
        'source_catalog_table',
        'cleaned_repairs_s3_bucket_target',
    )
)

# Spark / Glue handles shared by the rest of the notebook.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
logger = glueContext.get_logger()

# Initialise the Glue job with the resolved arguments.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Read the source table from the catalog as a DynamicFrame.
source_data = glueContext.create_dynamic_frame.from_catalog(
    name_space=source_catalog_database, table_name=source_catalog_table
)

# Convert to a Spark DataFrame and keep only the latest import partition.
df = getLatestPartitions(source_data.toDF())

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
# Print the schema of the latest-partition DataFrame as loaded, before any cleaning.
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- Address: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- temp_order_number: string (nullable = true)
 |-- priority_code: string (nullable = true)
 |-- raised_value: string (nullable = true)
 |-- total_invoiced: string (nullable = true)
 |-- cost_code: string (nullable = true)
 |-- contractor's_own_ref._no: string (nullable = true)
 |-- new_uhw_number: string (nullable = true)
 |-- requested_by: string (nullable = true)
 |-- works_status/comments: string (nullable = true)
 |-- unnamed:_12: string (nullable = true)
 |-- unnamed:_13: string (nullable = true)
 |-- unnamed:_14: string (nullable = true)
 |-- unnamed:_15: string (nullable = true)
 |-- unnamed:_16: string (nullable = true)
 |-- unnamed:_17: string (nullable = true)
 |-- unnamed:_18: string (nullable = true)
 |-- unnamed:_19: string (nullable = true)
 |-- unnamed:_20: string (nullable = true)
 |-- unnamed:_21: string (nullable = true)
 |-- unnamed:_22: strin

In [57]:
import re
# Log the start of the column-name clean-up step.
logger.info('clean up column names')
def clean_column_names(df):
    """Return ``df`` with every column renamed to a normalised name.

    Each column name is transformed, in this order:

    1. full stops are removed;
    2. a single trailing underscore is removed;
    3. the name is lower-cased and every run of characters other than
       ``0-9``, ``a-z``, ``A-Z`` and ``$`` is replaced by one underscore.

    The rename is applied positionally through a single ``toDF`` call, so
    column order and data are untouched and the existing names never have to
    be quoted or re-resolved, whatever characters they contain.

    :param df: DataFrame exposing ``columns`` and ``toDF(*names)``.
    :return: the DataFrame returned by ``df.toDF`` carrying the cleaned names.
    """
    def _clean(name):
        # Remove full stops from the column name.
        without_dots = name.replace('.', '')
        # Remove a trailing underscore.
        without_trailing_underscore = re.sub("_$", "", without_dots)
        # NOTE(review): the trailing underscore is stripped *before* the
        # substitution below, so a name ending in a special character (for
        # example a space) still ends with an underscore. Kept as-is because
        # changing it would alter output column names - confirm the intent.
        # Lower-case and collapse runs of other characters (including
        # repeated underscores) into a single underscore.
        return re.sub("[^0-9a-zA-Z$]+", "_", without_trailing_underscore.lower())

    return df.toDF(*[_clean(name) for name in df.columns])

# Normalise the column names of the latest-partition frame.
df2 = clean_column_names(df)

# Drop the unnamed (empty) columns, i.e. every column whose cleaned name
# starts with "unnamed".
all_columns = df2.columns
columns_to_drop = [
    column_name for column_name in all_columns
    if column_name.startswith('unnamed')
]
df2 = df2.drop(*columns_to_drop)

df2.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- address: string (nullable = true)
 |-- description: string (nullable = true)
 |-- date: string (nullable = true)
 |-- temp_order_number: string (nullable = true)
 |-- priority_code: string (nullable = true)
 |-- raised_value: string (nullable = true)
 |-- total_invoiced: string (nullable = true)
 |-- cost_code: string (nullable = true)
 |-- contractor_s_own_ref_no: string (nullable = true)
 |-- new_uhw_number: string (nullable = true)
 |-- requested_by: string (nullable = true)
 |-- works_status_comments: string (nullable = true)
 |-- import_datetime: timestamp (nullable = true)
 |-- import_timestamp: string (nullable = true)
 |-- import_year: string (nullable = true)
 |-- import_month: string (nullable = true)
 |-- import_day: string (nullable = true)
 |-- import_date: string (nullable = true)

In [58]:
logger.info('convert timestamp and date columns to datetime / date field types')
# Replace the textual `date` column with a date-typed column parsed using the
# day/month/year pattern below.
# NOTE(review): the sample record displayed later in this notebook has a null
# datetime_raised, which may mean the source values do not match this
# pattern - confirm against the raw data.
df2 = df2.withColumn('date', F.to_date(F.col('date'), "dd/MM/yyyy"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [59]:
# Print the schema again to check the type of the `date` column after conversion.
df2.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- address: string (nullable = true)
 |-- description: string (nullable = true)
 |-- date: date (nullable = true)
 |-- temp_order_number: string (nullable = true)
 |-- priority_code: string (nullable = true)
 |-- raised_value: string (nullable = true)
 |-- total_invoiced: string (nullable = true)
 |-- cost_code: string (nullable = true)
 |-- contractor_s_own_ref_no: string (nullable = true)
 |-- new_uhw_number: string (nullable = true)
 |-- requested_by: string (nullable = true)
 |-- works_status_comments: string (nullable = true)
 |-- import_datetime: timestamp (nullable = true)
 |-- import_timestamp: string (nullable = true)
 |-- import_year: string (nullable = true)
 |-- import_month: string (nullable = true)
 |-- import_day: string (nullable = true)
 |-- import_date: string (nullable = true)

In [60]:
# Tag every row with the repairs sheet the repair came from.
df2 = df2.withColumn('data_source', F.lit('ElecMechFire - Electrical Supplies'))

# (current column name, new column name) pairs, applied one after another.
column_renames = (
    ('date', 'datetime_raised'),
    ('requested_by', 'operative'),
    ('address', 'property_address'),
    ('description', 'description_of_work'),
    ('priority_code', 'work_priority_description'),
    ('temp_order_number', 'temp_order_number_full'),
    ('cost_code', 'budget_code'),
    ('total_invoiced', 'order_value'),
    ('works_status_comments', 'order_status'),
    ('contractor_s_own_ref_no', 'contractor_ref'),
)
for current_name, new_name in column_renames:
    df2 = df2.withColumnRenamed(current_name, new_name)

df2.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- property_address: string (nullable = true)
 |-- description_of_work: string (nullable = true)
 |-- datetime_raised: date (nullable = true)
 |-- temp_order_number_full: string (nullable = true)
 |-- work_priority_description: string (nullable = true)
 |-- raised_value: string (nullable = true)
 |-- order_value: string (nullable = true)
 |-- budget_code: string (nullable = true)
 |-- contractor_ref: string (nullable = true)
 |-- new_uhw_number: string (nullable = true)
 |-- operative: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- import_datetime: timestamp (nullable = true)
 |-- import_timestamp: string (nullable = true)
 |-- import_year: string (nullable = true)
 |-- import_month: string (nullable = true)
 |-- import_day: string (nullable = true)
 |-- import_date: string (nullable = true)
 |-- data_source: string (nullable = false)

In [82]:
def map_repair_priority(code):
    """Translate a priority description into its numeric priority code.

    :param code: priority description, e.g. ``'Immediate'`` or ``'Urgent'``.
    :return: ``1`` for Immediate, ``2`` for Emergency, ``3`` for Urgent,
        ``4`` for Normal, and ``None`` for any other value (including
        ``None``).
    """
    priority_by_description = {
        'Immediate': 1,
        'Emergency': 2,
        'Urgent': 3,
        'Normal': 4,
    }
    # Descriptions missing from the table have no code.
    return priority_by_description.get(code)
    

# Wrap the mapping function as a Spark UDF, passing the function and the
# declared return type of the UDF (StringType).
# NOTE(review): map_repair_priority returns Python ints (or None) while the
# UDF is declared as StringType - confirm the resulting column holds the
# intended values and type.
udf_map_repair_priority = F.udf(f=map_repair_priority, returnType=StringType())

# Derive the priority code column from the priority description column.
df2 = df2.withColumn(
    'work_priority_priority_code',
    udf_map_repair_priority(F.col('work_priority_description')),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [83]:
# Display rows one field per line to eyeball the cleaned values.
# NOTE(review): the captured output contains raw record values (property
# address, operative name) - clear the output before sharing the notebook.
df2.show(vertical=True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

-RECORD 0-------------------------------------------
 property_address            | Langton Lodge        
 description_of_work         | Phase 3 - Visual ... 
 datetime_raised             | null                 
 temp_order_number_full      | RM/210121/253        
 work_priority_description   | Planned              
 raised_value                | 32.9                 
 order_value                 | 32.9                 
 budget_code                 | X7054                
 contractor_ref              | nan                  
 new_uhw_number              | Pending              
 operative                   | R McKenna            
 order_status                | Works complete       
 import_datetime             | 2021-07-13 09:33:... 
 import_timestamp            | 1626168781.503371    
 import_year                 | 2021                 
 import_month                | 07                   
 import_day                  | 13                   
 import_date                 | 20210713       

In [84]:
# Columns used as partition keys for the written dataset.
partition_keys = ["import_year", "import_month", "import_day", "import_date"]

# Convert the cleaned Spark DataFrame back into a DynamicFrame for writing.
cleanedDataframe = DynamicFrame.fromDF(df2, glueContext, "cleanedDataframe")

# Write the cleaned data as parquet to the target S3 path.
parquetData = glueContext.write_dynamic_frame.from_options(
    frame=cleanedDataframe,
    connection_type="s3",
    format="parquet",
    connection_options={
        "path": cleaned_repairs_s3_bucket_target,
        "partitionKeys": partition_keys,
    },
    transformation_ctx="parquetData",
)

job.commit()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…