In [6]:
import configparser
from datetime import datetime, timedelta
import os
from signal import signal, SIGPIPE, SIG_DFL
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, from_unixtime, monotonically_increasing_id, to_date
from pyspark.sql.types import *
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.functions import count
from s3path import S3Path

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [2]:
def create_spark_session():
    """
    Purpose:
        Return the Spark session used for the Data Lake ETL work, creating it
        if none exists yet. The hadoop-aws package is requested through
        ``spark.jars.packages``.
    :return: spark session
    """
    # NOTE(review): explicit S3A settings (fs.s3a.impl plus access/secret keys
    # read from os.environ) were sketched here earlier but are disabled;
    # presumably the cluster supplies S3 access itself -- TODO confirm.
    session_builder = SparkSession.builder.config(
        "spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0"
    )
    return session_builder.getOrCreate()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [3]:
# Module-level session; the data quality check functions below read `spark` as a global.
spark = create_spark_session()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
# Echo the session object to confirm it was created.
spark

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f2fdb88e810>

In [7]:
# Print the Python packages visible to the Spark context.
# NOTE(review): `sc` is not defined anywhere in this notebook -- presumably it is
# injected by the notebook kernel; TODO confirm before running elsewhere.
sc.list_packages()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

aws-cfn-bootstrap (2.0)
beautifulsoup4 (4.9.3)
boto (2.49.0)
boto3 (1.24.17)
botocore (1.27.17)
click (8.1.1)
docutils (0.14)
jmespath (1.0.0)
joblib (1.1.0)
lockfile (0.11.0)
lxml (4.8.0)
mysqlclient (1.4.2)
nltk (3.7)
nose (1.3.4)
numpy (1.20.0)
pip (9.0.1)
py-dateutil (2.2)
pystache (0.5.4)
python-daemon (2.2.3)
python-dateutil (2.8.2)
python37-sagemaker-pyspark (1.4.1)
pytz (2022.1)
PyYAML (5.4.1)
regex (2021.11.10)
s3path (0.3.4)
s3transfer (0.6.0)
setuptools (28.8.0)
simplejson (3.2.0)
six (1.13.0)
smart-open (6.0.0)
tqdm (4.63.1)
urllib3 (1.26.9)
wheel (0.29.0)
windmill (1.6)

You are using pip version 9.0.1, however version 22.1.2 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.

# Check data quality:
1. Check each dimension table for duplicate IDs.
2. Confirm there is no empty data.
3. Source/count checks to ensure completeness (row counts must exactly match the ETL-processed data).

In [23]:
# Expected number of duplicated IDs per table (name keeps its original spelling
# because other cells reference it).
EXPECTION_DUPLICATED_IDS_COUNT = 0

# Destination AWS S3 bucket and the prefix under which the dimension tables live
dest_aws_s3_bucket = 'destetlbucket'
dim_bucket_path = S3Path(f"/{dest_aws_s3_bucket}/dimension_table")

# Expected source row counts after the ETL filters were applied
df_news_source_count = 45_805
df_us_cities_demographics_source_count = 2_891
df_immigration_personal_source_count = 3_096_313
df_immigration_main_information_source_count = 3_096_313

# Expected source row counts for the imm_* tables
df_imm_visa_source_count = 3
df_imm_travel_code_source_count = 4
df_imm_city_res_label_source_count = 5
df_imm_address_source_count = 55
df_imm_destination_city_source_count = 582

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Maps each table name to [unique ID column, expected duplicated-ID count,
# expected source row count]. The check functions read these by position
# (index 0, 1 and 2), so the list layout must not change.
dict_dimension_table_IDs = {
    table_name: [id_column, EXPECTION_DUPLICATED_IDS_COUNT, expected_row_count]
    for table_name, id_column, expected_row_count in (
        ('df_immigration_personal', 'imm_per_cic_id', df_immigration_personal_source_count),
        ('immigration_main_information', 'imm_main_cic_id', df_immigration_main_information_source_count),
        ('news_article_data', 'news_cord_uid', df_news_source_count),
        ('us_cities_demographics_data', 'cidemo_id', df_us_cities_demographics_source_count),
        # ***** imm_cit_res *****
        ('imm_city_res_label', 'col_of_imm_cntyl', df_imm_city_res_label_source_count),
        # ***** imm_port *****
        ('imm_destination_city', 'code_of_imm_destination_city', df_imm_destination_city_source_count),
        # ***** imm_mod *****
        ('imm_travel_code', 'code_of_imm_travel_code', df_imm_travel_code_source_count),
        # ***** imm_addr *****
        ('imm_address', 'code_of_imm_address', df_imm_address_source_count),
        # ***** imm_visa *****
        ('imm_visa', 'code_of_imm_visa', df_imm_visa_source_count),
    )
}

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# ***** Make sure the ETL-processed dimension tables contain no duplicated IDs. *****
def data_quality_check_duplicated_IDs(dim_bucket_path: "S3Path", dict_dimension_table_IDs: dict):
    """
    Purpose:
        Make sure the ETL-processed dimension tables contain no duplicated IDs.

    Args:
        dim_bucket_path (s3path object): AWS S3 path of the destination bucket's
            dimension tables; every directory directly under it is a candidate table.
        dict_dimension_table_IDs (dict): table name -> [ID column name,
            expected duplicated-ID count, expected source row count].
            Only the first two list items are used here.

    Raises:
        Exception: If a table's duplicated-ID count differs from the expected value,
            or if a configured table has no matching directory under
            ``dim_bucket_path`` (previously such tables were skipped silently,
            so the check could pass without looking at them).
    """
    checked_tables = set()

    for path in dim_bucket_path.iterdir():
        if not path.is_dir():
            continue

        table_path = str(path)
        # The last path component is the directory (table) name; it does not
        # depend on the dictionary entry, so compute it once per directory.
        dimension_table_name = table_path.split('/')[-1]

        for dim_table, key_duplicated_source_etl in dict_dimension_table_IDs.items():
            # Substring match (not equality) is kept from the original so a
            # directory name may be a fragment of the configured table name.
            if dimension_table_name not in dim_table:
                continue

            id_column = key_duplicated_source_etl[0]
            expected_duplicated_count = key_duplicated_source_etl[1]
            checked_tables.add(dim_table)

            # `table_path` starts with "/", so "s3:/" + table_path yields "s3://<bucket>/...".
            df = spark.read.parquet(f"s3:/{table_path}")

            # Number of distinct IDs that occur more than once.
            # NOTE: count(<column>) only counts non-null values, so rows whose
            # ID is NULL are never reported as duplicates by this check.
            check_dataframe_duplicated_IDs_count = (
                df.groupBy(id_column)
                .agg(count(id_column).alias('check_duplicated_IDs'))
                .filter(col('check_duplicated_IDs') > 1)
                .count()
            )

            if check_dataframe_duplicated_IDs_count != expected_duplicated_count:
                raise Exception(f"Check table {dim_table} has duplicated IDs, not expected {expected_duplicated_count}!!")
            print(f"The table {dim_table} is expectations of value {expected_duplicated_count}.")

    # A quality check must not pass for tables it never read.
    missing_tables = sorted(set(dict_dimension_table_IDs) - checked_tables)
    if missing_tables:
        raise Exception(f"Check table(s) {missing_tables} not found under {dim_bucket_path}, duplicated IDs were not checked!!")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [13]:
# Run the duplicated-ID check; it prints one line per passing table and raises
# on the first table whose duplicated-ID count differs from the expected value.
data_quality_check_duplicated_IDs(dim_bucket_path, dict_dimension_table_IDs)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The table df_immigration_personal is expectations of value 0.
The table imm_address is expectations of value 0.
The table imm_city_res_label is expectations of value 0.
The table imm_destination_city is expectations of value 0.
The table imm_travel_code is expectations of value 0.
The table imm_visa is expectations of value 0.
The table immigration_main_information is expectations of value 0.
The table news_article_data is expectations of value 0.
The table us_cities_demographics_data is expectations of value 0.

In [28]:
# ***** Check that each dimension table's row count matches the expected (filtered) source data count. *****
def data_quality_check_source_data_count(dim_bucket_path: "S3Path", dict_dimension_table_IDs: dict):
    """
    Purpose:
        Check that each dimension table's row count matches the expected source
        data count (the count after the ETL filter conditions were applied).

    Args:
        dim_bucket_path (s3path object): AWS S3 path of the destination bucket's
            dimension tables; every directory directly under it is a candidate table.
        dict_dimension_table_IDs (dict): table name -> [ID column name,
            expected duplicated-ID count, expected source row count].
            Only the third list item is used here.

    Raises:
        Exception: If a table's row count differs from the expected source data count,
            or if a configured table has no matching directory under
            ``dim_bucket_path`` (previously such tables were skipped silently,
            so the completeness check could pass without looking at them).
    """
    checked_tables = set()

    for path in dim_bucket_path.iterdir():
        if not path.is_dir():
            continue

        table_path = str(path)
        # The last path component is the directory (table) name; it does not
        # depend on the dictionary entry, so compute it once per directory.
        dimension_table_name = table_path.split('/')[-1]

        for dim_table, key_duplicated_source_etl in dict_dimension_table_IDs.items():
            # Substring match (not equality) is kept from the original so a
            # directory name may be a fragment of the configured table name.
            if dimension_table_name not in dim_table:
                continue

            expected_source_count = key_duplicated_source_etl[2]
            checked_tables.add(dim_table)

            # `table_path` starts with "/", so "s3:/" + table_path yields "s3://<bucket>/...".
            df = spark.read.parquet(f"s3:/{table_path}")

            if df.count() != expected_source_count:
                raise Exception(f"Check table {dim_table} not matched expected source data count {expected_source_count}!!")
            print(f"The table {dim_table} is expectations of source data count {expected_source_count:,}.")

    # A completeness check must not pass for tables it never read.
    missing_tables = sorted(set(dict_dimension_table_IDs) - checked_tables)
    if missing_tables:
        raise Exception(f"Check table(s) {missing_tables} not found under {dim_bucket_path}, source data count was not checked!!")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [29]:
# Run the row-count check; it prints one line per passing table and raises
# on the first table whose row count differs from the expected source count.
data_quality_check_source_data_count(dim_bucket_path, dict_dimension_table_IDs)

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

The table df_immigration_personal is expectations of source data count 3,096,313.
The table imm_address is expectations of source data count 55.
The table imm_city_res_label is expectations of source data count 5.
The table imm_destination_city is expectations of source data count 582.
The table imm_travel_code is expectations of source data count 4.
The table imm_visa is expectations of source data count 3.
The table immigration_main_information is expectations of source data count 3,096,313.
The table news_article_data is expectations of source data count 45,805.
The table us_cities_demographics_data is expectations of source data count 2,891.