In [1]:
import datetime
import html
import io
import json
import os
import re
import sys
import time
import unicodedata
import zipfile

import pandas as pd
import requests
import boto3
from pyspark.sql import functions as f
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from pyspark.sql import SQLContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
2,,pyspark,idle,,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [None]:
def download_file_to_df(file_id, api_key, filename):
    """Download an exported Alloy zip archive and load one CSV member into pandas.

    The archive is read entirely in memory, so nothing is extracted to the
    working directory and the zip handle is always closed.

    :param file_id: id of the file item to download
    :param api_key: Alloy API token, sent as the ``token`` query parameter
    :param filename: path of the CSV member inside the zip archive
    :return: pandas DataFrame with the CSV contents (read with ``index_col=False``)
    :raises requests.HTTPError: if the download returns an error status
    :raises KeyError: if ``filename`` is not a member of the archive
    """
    url_download = f'https://api.uk.alloyapp.io/api/file/{file_id}?token={api_key}'
    # `headers` is the module-level request header dict set up in the request cell.
    r = requests.get(url_download, headers=headers)
    # Fail loudly here rather than with an opaque BadZipFile on an error body.
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        with z.open(filename) as member:
            return pd.read_csv(member, index_col=False)

def get_last_import_date_time(glue_context, database, resource):
    """Return the most recent ``import_datetime`` stored for a catalog table.

    Falls back to 1970-01-01 when the table is not in the catalog, and also
    when the table exists but ``max(import_datetime)`` is NULL (no rows), so
    callers always receive a datetime they can format.

    :param glue_context: context exposing ``tables(database)`` and ``sql(query)``
    :param database: catalog database name
    :param resource: table name within ``database``
    :return: latest import datetime, or ``datetime.datetime(1970, 1, 1)``
    """
    epoch = datetime.datetime(1970, 1, 1)
    if not table_exists_in_catalog(glue_context, resource, database):
        print(f"Couldn't find table {resource} in database {database}.")
        return epoch
    print(f"found table for {resource} in {database}")
    first_row = glue_context.sql(f"SELECT max(import_datetime) as max_import_date_time FROM `{database}`.{resource}").take(1)[0]
    latest = first_row.max_import_date_time
    # An aggregate over zero rows yields a single row holding NULL.
    return epoch if latest is None else latest

def format_time(date_time):
    """Render ``date_time`` as ``YYYY-MM-DDTHH:MM:SS.mmmZ``.

    The value is formatted with microsecond precision and the last three
    digits are cut off, leaving milliseconds. A literal ``Z`` is appended;
    no timezone conversion is performed.

    :param date_time: object providing ``strftime``
    :return: formatted timestamp string
    """
    microsecond_stamp = date_time.strftime('%Y-%m-%dT%H:%M:%S.%f')
    millisecond_stamp = microsecond_stamp[:-3]
    return f"{millisecond_stamp}Z"
  
def update_aqs(alloy_query, last_import_date_time):
    """Set the query's children to a single "lastEditDate greater than" filter.

    ``alloy_query`` is modified in place: ``alloy_query['aqs']['children']`` is
    overwritten with one ``GreaterThan`` node comparing the ``lastEditDate``
    item property to ``last_import_date_time``.

    :param alloy_query: dict holding an ``'aqs'`` dict
    :param last_import_date_time: value placed in the DateTime node
    :return: the same ``alloy_query`` object
    """
    edited_after_filter = {
        "type": "GreaterThan",
        "children": [
            {"type": "ItemProperty", "properties": {"itemPropertyName": "lastEditDate"}},
            {"type": "DateTime", "properties": {"value": [last_import_date_time]}},
        ],
    }
    alloy_query['aqs']['children'] = [edited_after_filter]
    return alloy_query

def get_task_id(response):
    """Extract ``backgroundTaskId`` from an export response.

    :param response: object with ``status_code`` and a JSON ``text`` body
    :return: the task id, or ``None`` (after printing a message) when the
        status code is not 200
    """
    if response.status_code == 200:
        payload = json.loads(response.text)
        return payload["backgroundTaskId"]

    print(f"Request unsuccessful while getting task id with status code: {response.status_code}")
    return None

def get_task_status(response):
    """Extract ``task.status`` from a task response.

    :param response: object with ``status_code`` and a JSON ``text`` body
    :return: the status value, or ``None`` (after printing a message) when the
        status code is not 200
    """
    if response.status_code == 200:
        payload = json.loads(response.text)
        return payload["task"]["status"]

    print(f"Request unsuccessful while getting task status with status code: {response.status_code}")
    return None

def get_file_item_id(response):
    """Extract ``fileItemId`` from an export-file response.

    :param response: object with ``status_code`` and a JSON ``text`` body
    :return: the file item id, or ``None`` (after printing a message) when the
        status code is not 200
    """
    if response.status_code == 200:
        payload = json.loads(response.text)
        return payload["fileItemId"]

    print(f"Request unsuccessful while getting file item id with status code: {response.status_code}")
    return None

In [None]:
#Helper functions
def table_exists_in_catalog(glue_context, table, database):
    """Report whether exactly one table named ``table`` is listed for ``database``.

    :param glue_context: context whose ``tables(database)`` returns a frame
        with a ``tableName`` column
    :param table: table name to look for
    :param database: database whose tables are listed
    :return: ``True`` when the filtered listing contains exactly one row
    """
    catalog_tables = glue_context.tables(database)
    matching_tables = catalog_tables.filter(catalog_tables.tableName == table)
    match_count = matching_tables.count()
    return match_count == 1

def normalize_column_name(column: str) -> str:
    """
    Normalize a column name: strip accents, replace every run of
    non-alphanumeric characters with a single underscore, drop one trailing
    underscore and lowercase the result.

    Accents are removed BEFORE the non-alphanumeric replacement; otherwise an
    accented letter would itself be turned into an underscore (or dropped at
    the end of the name) instead of being mapped to its base letter.

    :param column: column name
    :return: normalized column name
    Example of applying: df.columns = map(normalize_column_name, pandas_df.columns)
    """
    # NFKD splits an accented letter into base letter + combining mark(s);
    # discarding only the combining marks keeps the base letter. Other
    # non-ASCII characters are left for format_name to turn into underscores.
    decomposed = unicodedata.normalize('NFKD', column)
    without_accents = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return format_name(without_accents)

def format_name(col_name):
    """
    Replace each run of characters outside ``[a-zA-Z0-9]`` with one
    underscore, remove a single trailing underscore and lowercase the result.

    :param col_name: raw column name
    :return: formatted column name
    """
    non_alpha_num_chars_stripped = re.sub('[^a-zA-Z0-9]+', "_", col_name)
    no_trailing_underscores = re.sub("_$", "", non_alpha_num_chars_stripped)
    return no_trailing_underscores.lower()

def convert_pandas_df_to_spark_dynamic_df(sql_context, panadas_df):
    """Build a Spark DataFrame from a pandas DataFrame and coalesce it to one partition.

    :param sql_context: context providing ``createDataFrame``
    :param panadas_df: pandas DataFrame to convert
    :return: the result of ``createDataFrame(...).coalesce(1)``
    """
    return sql_context.createDataFrame(panadas_df).coalesce(1)

def add_import_time_columns(data_frame):
    """Append the import bookkeeping columns to a Spark DataFrame.

    Adds, in this order: ``import_datetime`` (Spark ``current_timestamp()``),
    then string literals derived from ``datetime.datetime.now()``:
    ``import_timestamp``, ``import_year``, ``import_month`` (zero-padded),
    ``import_day`` (zero-padded) and ``import_date`` (year+month+day).

    :param data_frame: Spark DataFrame to extend
    :return: DataFrame with the six import columns added
    """
    now = datetime.datetime.now()
    year_part = str(now.year)
    month_part = f"{now.month:02d}"
    day_part = f"{now.day:02d}"

    literal_columns = [
        ('import_timestamp', str(now.timestamp())),
        ('import_year', year_part),
        ('import_month', month_part),
        ('import_day', day_part),
        ('import_date', year_part + month_part + day_part),
    ]

    data_frame = data_frame.withColumn('import_datetime', f.current_timestamp())
    for column_name, literal_value in literal_columns:
        data_frame = data_frame.withColumn(column_name, f.lit(literal_value))
    return data_frame

# Column names passed as "partitionKeys" when writing parquet to S3; these are
# among the columns added by add_import_time_columns.
PARTITION_KEYS = ['import_year', 'import_month', 'import_day', 'import_date']

In [None]:
# Spark / Glue session setup for interactive use.
sc = SparkContext.getOrCreate()
glue_context = GlueContext(sc)
job = Job(glue_context)
# Kept for reference: job-argument handling that is disabled in this notebook.
#   args = getResolvedOptions(sys.argv, ['JOB_NAME'])
#   logger = glue_context.get_logger()
#   job.init(args['JOB_NAME'], args)

# camelCase names used by the later cells. They alias the objects created
# above instead of fetching the context and constructing a second GlueContext.
sparkContext = sc
glueContext = glue_context
sqlContext = SQLContext(sparkContext)

In [None]:
# Export configuration for the Alloy "DW Education & Compliance Inspection" design.
resource = "dw_education_and_compliance_inspection"
bucket_target = "dataplatform-stg-raw-zone"
# SECURITY: the Alloy API token must not live in the notebook source. It is read
# from the environment instead; the token that was previously committed here
# should be treated as compromised and rotated.
api_key = os.environ.get("ALLOY_API_KEY")
if not api_key:
    raise RuntimeError("Set the ALLOY_API_KEY environment variable to the Alloy API token before running this notebook.")
database = "dataplatform-stg-env-services-raw-zone"
prefix = "env-services/alloy/api-responses/"
# Request body for the export endpoint: the query ("aqs") plus export options.
# update_aqs() later sets aqs["aqs"]["children"] to the lastEditDate filter.
aqs = {"aqs":{
  "type": "Join",
  "properties": {
    "attributes": [
      "attributes_itemsTitle",
      "attributes_itemsSubtitle",
      "attributes_itemsGeometry",
      "attributes_inspectionsInspectionNumber",
      "attributes_tasksStatus",
      "attributes_tasksTeam",
      "attributes_tasksRaisedTime",
      "attributes_tasksStartTime",
      "attributes_tasksCompletionTime",
      "attributes_wasteEducationInspectionServiceOutcome_6032eba956a338006661f6f8",
      "attributes_wasteEducationInspectionResidentAvailable_6034de1cca290e006b10eaa5",
      "attributes_wasteEducationInspectionBarriersToRRW_6034e4f16668f2006c62013b",
      "attributes_wasteEducationInspectionEnforcementOutcome_6036a88b267b37006a951ac8",
      "attributes_wasteEducationInspectionEnforcementIssue_603c11fa306e42000a19ee8e",
      "attributes_tasksDescription"
    ],
    "collectionCode": [
      "Live"
    ],
    "dodiCode": "designs_wasteEducationInspection_6032eb1356a338006661f6e4",
    "joinAttributes": [
      "root.attributes_tasksStatus.attributes_taskStatusesStatus",
      "root.attributes_tasksTeam.attributes_teamsTeamName",
      "root.attributes_wasteEducationInspectionServiceOutcome_6032eba956a338006661f6f8.attributes_serviceOutcomeServiceOutcome_602eaaad3cf282006c40f4f0",
      "root.attributes_wasteEducationInspectionBarriersToRRW_6034e4f16668f2006c62013b.attributes_barriersToRRWBarriersToRRW_6034e1c96668f2006c61f949",
      "root.attributes_wasteEducationInspectionEnforcementOutcome_6036a88b267b37006a951ac8.attributes_enforcementOutcomeEnforcementOutcome_602ea67d3cf282006c40f3f3",
      "root.attributes_wasteEducationInspectionEnforcementIssue_603c11fa306e42000a19ee8e.attributes_enforcementIssueEnforcementIssue_603c0f135b27c7000aed8475"
    ],
    "sortInfo": {
      "attributeCode": "attributes_inspectionsInspectionNumber",
      "sortOrder": "Ascending"
    }
  }
},"fileName":"DW Education&Compliance Inspection.csv","exportHeaderType":"Name"}
# Path of the CSV member inside the downloaded zip archive.
filename = "DW Education&Compliance Inspection/DW Education&Compliance Inspection.csv"
# Fixed lower bound for the lastEditDate filter.
# NOTE(review): get_last_import_date_time() exists but is not used here - confirm
# whether the bound should come from the last import instead.
last_import_date_time = format_time(datetime.datetime(2020, 5, 17))

In [None]:
# Destination S3 prefix for the parquet output of this resource.
s3_target_url = f"s3://{bucket_target}/{prefix}{resource}/"

In [None]:
# Start the export on the Alloy API and build the URL used to poll its task.
headers = {'Accept': 'application/json', 'Content-Type': 'application/json'}
region = 'uk'
post_url = f'https://api.{region}.alloyapp.io/api/export/?token={api_key}'
aqs = update_aqs(aqs, last_import_date_time)
# A timeout stops the notebook hanging forever if the API never answers.
response = requests.post(post_url, data=json.dumps(aqs), headers=headers, timeout=60)

task_id = get_task_id(response)
if task_id is None:
    # Without this guard the polling URL below would silently become ".../task/None".
    raise RuntimeError(f"Alloy export request failed with status code {response.status_code}; no task id returned.")
url = f'https://api.{region}.alloyapp.io/api/task/{task_id}?token={api_key}'
# Print the task URL without the token so the secret does not end up in cell output.
print(f'https://api.{region}.alloyapp.io/api/task/{task_id}')

In [None]:
# Debug aid: show the export request payload as the JSON string sent to the API.
print(json.dumps(aqs))

In [None]:
# Poll the export task until it completes, then download the exported CSV.
# Processing and the S3 write are left to the next cell, which performs exactly
# those steps; doing them here as well wrote the dataset twice on "Run All".
POLL_INTERVAL_SECONDS = 60
MAX_POLL_ATTEMPTS = 60  # give up after about an hour instead of looping forever

task_status = ''
file_id = ''
for _attempt in range(MAX_POLL_ATTEMPTS):
    time.sleep(POLL_INTERVAL_SECONDS)
    response = requests.get(url)
    # get_task_status returns None on a non-200 response; keep polling in that case.
    task_status = get_task_status(response)
    print(f"task status: {task_status}")
    if task_status == 'Complete':
        break
else:
    raise TimeoutError(
        f"Alloy export task {task_id} did not complete after {MAX_POLL_ATTEMPTS} checks (last status: {task_status})"
    )

# Use a separate name so `url` keeps pointing at the task endpoint and this
# cell can be re-run safely.
file_url = f'https://api.{region}.alloyapp.io/api/export/{task_id}/file?token={api_key}'
response = requests.get(file_url)
file_id = get_file_item_id(response)
if file_id is None:
    raise RuntimeError(f"Could not get the exported file id (status code {response.status_code}).")

print("downloading file to df")
pandasDataFrame = download_file_to_df(file_id, api_key, filename)

In [None]:
# Clean the downloaded export and write it to S3 as partitioned parquet.
print("processing DF")

# Cast every column to string (missing values become the text 'nan' / 'NaT').
column_names = list(pandasDataFrame)
pandasDataFrame[column_names] = pandasDataFrame[column_names].astype(str)

# Replace missing column names with valid names
stripped_names = [name.strip() for name in pandasDataFrame.columns]
pandasDataFrame.columns = [
    "column" + str(position) if name == "" else name
    for position, name in enumerate(stripped_names)
]
pandasDataFrame.columns = [normalize_column_name(name) for name in pandasDataFrame.columns]

# Convert to Spark, turn the 'nan' / 'NaT' strings back into nulls, then
# drop all rows where all values are null.
# NOTE: must be done before add_import_time_columns
sparkDynamicDataFrame = (
    convert_pandas_df_to_spark_dynamic_df(sqlContext, pandasDataFrame)
    .replace('nan', None)
    .replace('NaT', None)
    .na.drop('all')
)
sparkDynamicDataFrame = add_import_time_columns(sparkDynamicDataFrame)

dataframe = DynamicFrame.fromDF(sparkDynamicDataFrame, glueContext, "alloyDWeducation")
sink_options = {"path": s3_target_url, "partitionKeys": PARTITION_KEYS}
parquetData = glueContext.write_dynamic_frame.from_options(
    frame=dataframe,
    connection_type="s3",
    connection_options=sink_options,
    format="parquet",
    transformation_ctx=f"alloy_{resource}_sink",
)
print("all done")