# Data Quality Checks


In [8]:
import configparser
import boto3
import awswrangler as wr
from s3path import S3Path
from pyspark.sql import SparkSession


- Check Source Data Count

In [5]:
# Build (or fetch, via getOrCreate) the local Spark session used by the
# source-data count checks. The spark-sas7bdat package is requested so the
# SAS immigration extract can be loaded later in the notebook.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("spark_emr_udactity")
    .config("spark.jars.packages", "saurfang:spark-sas7bdat:2.0.0-s_2.11")
    .getOrCreate()
)


Ivy Default Cache set to: /Users/oneforall_nick/.ivy2/cache
The jars for the packages stored in: /Users/oneforall_nick/.ivy2/jars
:: loading settings :: url = jar:file:/Users/oneforall_nick/spark-2.4.8-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
saurfang#spark-sas7bdat added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c171e8b2-79ab-471a-b0fa-11aa50506d00;1.0
	confs: [default]
	found saurfang#spark-sas7bdat;2.0.0-s_2.11 in spark-packages
	found com.epam#parso;2.0.8 in central
	found org.slf4j#slf4j-api;1.7.5 in central
	found org.apache.logging.log4j#log4j-api-scala_2.11;2.7 in central
	found org.scala-lang#scala-reflect;2.11.8 in central
:: resolution report :: resolve 302ms :: artifacts dl 8ms
	:: modules in use:
	com.epam#parso;2.0.8 from central in [default]
	org.apache.logging.log4j#log4j-api-scala_2.11;2.7 from central in [default]
	org.scala-lang#scala-reflect;2.11.8 from central in [default]
	org.slf4j#slf4

In [6]:
# Bare expression: lets the notebook display the active SparkSession object.
spark


In [None]:
# Stop spark session if I don't need it.
# spark.stop()

In [7]:
# spark session setting configuration
# Set the Spark SQL shuffle partition count to 5 for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")
# Display every (key, value) pair held by the SparkContext's SparkConf.
# NOTE(review): a runtime SQL setting made through spark.conf.set may not show
# up in this listing — confirm with spark.conf.get if the value matters.
spark.sparkContext.getConf().getAll()


[('spark.driver.port', '51664'),
 ('spark.files',
  'file:///Users/oneforall_nick/.ivy2/jars/saurfang_spark-sas7bdat-2.0.0-s_2.11.jar,file:///Users/oneforall_nick/.ivy2/jars/com.epam_parso-2.0.8.jar,file:///Users/oneforall_nick/.ivy2/jars/org.apache.logging.log4j_log4j-api-scala_2.11-2.7.jar,file:///Users/oneforall_nick/.ivy2/jars/org.slf4j_slf4j-api-1.7.5.jar,file:///Users/oneforall_nick/.ivy2/jars/org.scala-lang_scala-reflect-2.11.8.jar'),
 ('spark.app.name', 'spark_emr_udactity'),
 ('spark.executor.id', 'driver'),
 ('spark.jars.packages', 'saurfang:spark-sas7bdat:2.0.0-s_2.11'),
 ('spark.submit.pyFiles',
  '/Users/oneforall_nick/.ivy2/jars/saurfang_spark-sas7bdat-2.0.0-s_2.11.jar,/Users/oneforall_nick/.ivy2/jars/com.epam_parso-2.0.8.jar,/Users/oneforall_nick/.ivy2/jars/org.apache.logging.log4j_log4j-api-scala_2.11-2.7.jar,/Users/oneforall_nick/.ivy2/jars/org.slf4j_slf4j-api-1.7.5.jar,/Users/oneforall_nick/.ivy2/jars/org.scala-lang_scala-reflect-2.11.8.jar'),
 ('spark.jars',
  'file:

In [9]:
# ***** Access AWS Cloud configure ************
# Read the AWS credentials from the local dl.cfg file. The `with` block closes
# the file handle as soon as the parser has consumed it.
config = configparser.ConfigParser()
with open('/Users/oneforall_nick/workspace/Udacity_capstone_project/cfg/dl.cfg') as cfg_file:
    config.read_file(cfg_file)
# config.read_file(open('dl.cfg'))

aws_access_key = config["ACCESS"]["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = config["ACCESS"]["AWS_SECRET_ACCESS_KEY"]
aws_token = config["ACCESS"]["AWS_TOKEN"]
# Access data from AWS S3
# SOURCE_S3_BUCKET = config['S3']['SOURCE_S3_BUCKET']
SOURCE_S3_BUCKET = 's3://mydatapool'
# Write data to AWS S3
# DEST_S3_BUCKET = config['S3']['DEST_S3_BUCKET']
DEST_S3_BUCKET = 's3://destetlbucket'
# *********************************************

# ***** Local Testing configure ************
# SOURCE_S3_BUCKET = '/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/'
# DEST_S3_BUCKET = '/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/dest_data'
# ******************************************

# boto3 session built from the temporary credentials read above; the S3
# resource created from it is what later cells use to fetch objects.
session = boto3.Session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_token
)

s3_access = session.resource('s3')


In [3]:
# Root folder that holds every local source data set used by the count checks.
# Change this single value to point the notebook at another checkout.
DATA_DIR = "/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data"

# Immigration Data: Extract apr16
imm_path = f"{DATA_DIR}/immigration_data/immigration_apr16_sub.sas7bdat"

# Immigration Data: label descriptions
# (the variable keeps its existing spelling so other cells that use it still work)
imm_lable_path = f"{DATA_DIR}/immigration_data/immigration_labels_descriptions.SAS"

# News Data
news_path = f"{DATA_DIR}/news_data/metadata.csv"

# Us City Demographics Data
us_city_dem_path = f"{DATA_DIR}/usCitiesDemographics_data/usCitiesDemo.csv"

In [10]:
# Immigration Data: load the SAS extract through the spark-sas7bdat data source
# (the row count itself is taken in a later cell).
sas_reader = spark.read.format('com.github.saurfang.sas.spark')
df_imm_data = sas_reader.load(imm_path)

In [11]:
# Sanity check: show the Python type of the object returned by the SAS load.
type(df_imm_data)

pyspark.sql.dataframe.DataFrame

In [14]:
# Count the rows of the immigration DataFrame and show the result as a labelled string.
f"df_imm_data: apr16, count: {df_imm_data.count()}"

                                                                                

'df_imm_data: apr16, count: 3096313'

In [23]:
# TODO: Immigration Data: Label
# Download the SAS label-description file from S3 and keep its text for parsing.

session = boto3.Session(
    aws_session_token=aws_token,
    aws_secret_access_key=aws_secret_access_key,
    aws_access_key_id=aws_access_key,
)
s3_access = session.resource('s3')

label_bucket = s3_access.Bucket('mydatapool')
s3_object = label_bucket.Object(
    'data/immigration_data/immigration_labels_descriptions.SAS'
).get()

# Raw bytes of the object body, then UTF-8 text with every tab character removed.
text = s3_object['Body'].read()
context = text.decode(encoding='utf-8').replace('\t', '')


In [25]:
# Convert text data to dimension table
def code_mapping(context, idx):
    """Parse one `code = 'value'` section of the label text into lookup rows.

    The section starts at the first occurrence of ``idx`` in ``context`` and
    ends right before the next ``;``. The first line of the section (the one
    containing ``idx``) is skipped; single quotes are removed from every line.

    Args:
        context: Full text to search.
        idx: Marker string that opens the section of interest.

    Returns:
        tuple: ``(two_dims, three_dims)`` where
            * ``two_dims`` holds, per remaining line, the stripped line split
              on ``=`` (pieces are not stripped individually), and
            * ``three_dims`` holds ``[code, first_value, other_value]`` rows,
              one per additional ``', '``-separated value of every line that
              split into exactly two pieces.

    Raises:
        ValueError: If ``idx`` is not found, or no ``;`` follows it.
    """
    section = context[context.index(idx):]
    section = section[:section.index(';')]

    # Skip the header line that carries the marker itself.
    body_lines = section.split('\n')[1:]

    two_dims = []
    for raw_line in body_lines:
        two_dims.append(raw_line.replace("'", "").strip().split('='))

    three_dims = []
    for pieces in two_dims:
        if len(pieces) != 2:
            continue
        code = pieces[0].strip()
        values = pieces[1].strip().split(', ')
        # Only values after the first one produce a row.
        for extra_value in values[1:]:
            three_dims.append([code, values[0], extra_value])

    return two_dims, three_dims

# ***** imm_cit_res *****
# Parse the section of the label text that starts at the first "i94cntyl"
# occurrence; the call returns the two-column and the three-column row lists.
imm_cit_res_two, imm_cit_res_three = code_mapping(context, "i94cntyl")

In [27]:
# Display the three-column rows produced by code_mapping for visual inspection.
imm_cit_res_three

[['582', 'MEXICO Air Sea', 'and Not Reported (I-94'],
 ['582', 'MEXICO Air Sea', 'no land arrivals)'],
 ['717', 'BONAIRE', 'ST EUSTATIUS'],
 ['717', 'BONAIRE', 'SABA'],
 ['245', 'CHINA', 'PRC'],
 ['473', 'MICRONESIA', 'FED. STATES OF'],
 ['471', 'INVALID: MARIANA ISLANDS', 'NORTHERN']]

In [28]:
# Number of rows in the three-column list, shown as a labelled string.
f"Dimension Table: imm_cit_res_three, Count: {len(imm_cit_res_three)}"

'Dimension Table: imm_cit_res_three, Count: 7'

In [16]:
# News Data: read the metadata CSV with a header row and ',' as the field delimiter.
df_news = spark.read.options(header=True, delimiter=',').csv(path=news_path)


In [18]:
# Sanity check: show the Python type of the object returned by the CSV read.
type(df_news)


pyspark.sql.dataframe.DataFrame

In [19]:
# Count the rows of the news DataFrame and show the result as a labelled string.
f"df_news: News, count: {df_news.count()}"


                                                                                

'df_news: News, count: 45827'

In [20]:
# Us City Demographics Data: read the CSV with a header row; this file uses ';' as the field delimiter.
df_us_cities_demographics = spark.read.options(header=True, delimiter=';').csv(us_city_dem_path)


In [21]:
# Sanity check: show the Python type of the object returned by the CSV read.
type(df_us_cities_demographics)


pyspark.sql.dataframe.DataFrame

In [22]:
# Count the rows of the US cities demographics DataFrame. The label prefix
# names the DataFrame being counted, matching the other count cells.
f"df_us_cities_demographics: Us Cities Demographics, count: {df_us_cities_demographics.count()}"


'df_news: Us Cities Demographics, count: 2891'

- Check whether each table contains data

In [1]:
 # ***** Access AWS Cloud configure ************
# Read the AWS credentials and the destination bucket from the local dl.cfg
# file. The `with` block closes the file handle once the parser has consumed it.
config = configparser.ConfigParser()
with open('/Users/oneforall_nick/workspace/Udacity_capstone_project/cfg/dl.cfg') as cfg_file:
    config.read_file(cfg_file)

aws_access_key = config["ACCESS"]["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = config["ACCESS"]["AWS_SECRET_ACCESS_KEY"]
aws_token = config["ACCESS"]["AWS_TOKEN"]
dest_aws_s3_bucket = config["S3"]["DEST_S3_BUCKET"]


In [2]:
# Prefixes under the destination bucket that hold the dimension and fact tables.
# NOTE(review): S3Path is normally built from a "/bucket/key" style path; if
# DEST_S3_BUCKET in dl.cfg is stored as an "s3://..." URI, S3Path.from_uri may
# be the right constructor — confirm against the config file.
dim_bucket_path = S3Path(f"{dest_aws_s3_bucket}/dimension_table")
fact_bucket_path = S3Path(f"{dest_aws_s3_bucket}/fact_table")

In [9]:
 # ******** Dimension Tables ********
# Table folder name -> whether only sub-folders are counted (True) or every
# child object (False). Only immigration_main_information counts sub-folders.
DIMENSION_TABLES = {
    'df_immigration_personal': False,
    'imm_address': False,
    'imm_city_res_label': False,
    'imm_destination_city': False,
    'imm_travel_code': False,
    'imm_visa': False,
    'immigration_main_information': True,
    'news_article_data': False,
    'us_cities_demographics_data': False,
}


def list_table_objects(bucket_path, table_name, dirs_only=False):
    """Return the names of the objects stored directly under one table folder.

    Args:
        bucket_path: Path-like object (an S3Path here) whose children are the
            table folders.
        table_name: Exact folder name of the table to inspect.
        dirs_only: When True, keep only children that are directories.

    Returns:
        list[str]: Name of every matching child; empty when the table folder
        is missing or has no matching children.
    """
    return [
        child.name
        for table_dir in bucket_path.iterdir()
        # Exact comparison: a substring test would also accept any folder
        # whose name merely appears inside the table name. Filtering here also
        # avoids listing the children of unrelated table folders.
        if table_dir.is_dir() and table_dir.name == table_name
        for child in table_dir.iterdir()
        if not dirs_only or child.is_dir()
    ]


# Fail on the first table that has no data objects; otherwise report the count.
for table_name, dirs_only in DIMENSION_TABLES.items():
    data_list = list_table_objects(dim_bucket_path, table_name, dirs_only)
    if not data_list:
        raise ValueError(f"Table {table_name} does not contain data!!")
    print(f"Data: {table_name}, Count data objects: {len(data_list)}")


Data: df_immigration_personal, Count data objects: 114
Data: imm_address, Count data objects: 9
Data: imm_city_res_label, Count data objects: 9
Data: imm_destination_city, Count data objects: 9
Data: imm_travel_code, Count data objects: 6
Data: imm_visa, Count data objects: 5
Data: immigration_main_information, Count data objects: 1
Data: news_article_data, Count data objects: 6305
Data: us_cities_demographics_data, Count data objects: 301


In [10]:
 # ******** Fact Tables ********
# Data Object Name: notification
# Collect the names of the sub-folders found directly under the notification
# table folder. The folder is matched by exact name (a substring test would
# also accept any folder whose name appears inside 'notification'), and only
# the matching folder's children are listed.
data_list = [
    partition_dir.name
    for table_dir in fact_bucket_path.iterdir()
    if table_dir.is_dir() and table_dir.name == 'notification'
    for partition_dir in table_dir.iterdir()
    if partition_dir.is_dir()
]
if data_list:
    print(f"Data: notification, Count data objects: {len(data_list)}")
else:
    raise ValueError("This table does not contain data!!")


Data: notification, Count data objects: 2


# Data Schema - Dimension Tables

In [67]:
 # ***** df_immigration_personal column schema and data type *****
def partition_filter(partition):
    """Keep only partitions whose imm_person_birth_year value is "2016"."""
    return partition["imm_person_birth_year"] == "2016"


# Read the selected partitions and print the column names, non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/df_immigration_personal/",
    dataset=True,
    partition_filter=partition_filter,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 765 entries, 0 to 1
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype   
---  ------                 --------------  -----   
 0   imm_per_cic_id         765 non-null    string  
 1   imm_person_gender      651 non-null    string  
 2   imm_visatype           765 non-null    string  
 3   imm_person_birth_year  765 non-null    category
dtypes: category(1), string(3)
memory usage: 24.8 KB


In [72]:
 # ***** imm_address column schema and data type *****
# Read the imm_address table (dataset=True) and print its column names,
# non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/imm_address/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55 entries, 0 to 6
Data columns (total 2 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   code_of_imm_address   55 non-null     string
 1   value_of_imm_address  55 non-null     string
dtypes: string(2)
memory usage: 1.3 KB


In [73]:
 # ***** imm_city_res_label column schema and data type *****
# Read the imm_city_res_label table (dataset=True) and print its column names,
# non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/imm_city_res_label/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 0
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   col_of_imm_cntyl                  7 non-null      Int32 
 1   value_of_imm_cntyl                7 non-null      string
 2   value_of_imm_cntyl_organizations  7 non-null      string
dtypes: Int32(1), string(2)
memory usage: 203.0 bytes


In [74]:
 # ***** imm_destination_city column schema and data type *****
# Read the imm_destination_city table (dataset=True) and print its column
# names, non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/imm_destination_city/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 0 to 77
Data columns (total 3 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   code_of_imm_destination_city         596 non-null    string
 1   value_of_imm_destination_city        596 non-null    string
 2   value_of_alias_imm_destination_city  596 non-null    string
dtypes: string(3)
memory usage: 18.6 KB


In [75]:
 # ***** imm_travel_code column schema and data type *****
# Read the imm_travel_code table (dataset=True) and print its column names,
# non-null counts and dtypes.
# NOTE(review): the saved output of this cell reports 0 non-null values for
# code_of_imm_travel_code — the code column looks empty and should be checked
# in the job that writes this table.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/imm_travel_code/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4 entries, 0 to 0
Data columns (total 2 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   code_of_imm_travel_code   0 non-null      Int32 
 1   value_of_imm_travel_code  4 non-null      string
dtypes: Int32(1), string(1)
memory usage: 84.0 bytes


In [76]:
 # ***** imm_visa column schema and data type *****
# Read the imm_visa table (dataset=True) and print its column names,
# non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/imm_visa/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3 entries, 0 to 0
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   code_of_imm_visa   3 non-null      Int32 
 1   value_of_imm_visa  3 non-null      string
dtypes: Int32(1), string(1)
memory usage: 63.0 bytes


In [79]:
# ***** immigration_main_information column schema and data type *****
def partition_filter(partition):
    """Keep only partitions whose imm_year value is "2016"."""
    return partition["imm_year"] == "2016"


# Read the selected partitions and print the column names and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/immigration_main_information/",
    dataset=True,
    partition_filter=partition_filter,
)
dfs.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3096313 entries, 0 to 10320
Data columns (total 12 columns):
 #   Column              Dtype   
---  ------              -----   
 0   imm_main_cic_id     Int32   
 1   imm_cntyl           Int32   
 2   imm_visa            Int32   
 3   imm_port            string  
 4   imm_arrival_date    object  
 5   imm_departure_date  object  
 6   imm_model           Int32   
 7   imm_address         string  
 8   imm_airline         string  
 9   imm_flight_no       string  
 10  imm_year            category
 11  imm_month           category
dtypes: Int32(4), category(2), object(2), string(4)
memory usage: 230.3+ MB


In [80]:
 # ***** news_article_data column schema and data type *****
def partition_filter(partition):
    """Keep partitions whose news_publish_time string lies in ["2016-01-01", "2016-01-02"]."""
    publish_time = partition["news_publish_time"]
    return "2016-01-01" <= publish_time <= "2016-01-02"


# Read the selected partitions and print the column names, non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/news_article_data/",
    dataset=True,
    partition_filter=partition_filter,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41 entries, 0 to 0
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   news_cord_uid      41 non-null     string  
 1   news_source        41 non-null     string  
 2   news_title         41 non-null     string  
 3   news_licence       41 non-null     string  
 4   news_abstract      36 non-null     string  
 5   news_authors       41 non-null     string  
 6   news_url           41 non-null     string  
 7   news_publish_time  41 non-null     category
dtypes: category(1), string(7)
memory usage: 2.7 KB


In [81]:
 # ***** us_cities_demographics_data column schema and data type *****
# Read the us_cities_demographics_data table (dataset=True) and print its
# column names, non-null counts and dtypes.
dfs = wr.s3.read_parquet(
    path="s3://destetlbucket/dimension_table/us_cities_demographics_data/",
    dataset=True,
)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2891 entries, 0 to 8
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   cidemo_city              2891 non-null   string 
 1   cidemo_state             2891 non-null   string 
 2   cidemo_median_age        2891 non-null   float32
 3   cidemo_total_population  2891 non-null   Int32  
 4   cidemo_state_code        2891 non-null   string 
 5   cidemo_count             2891 non-null   Int32  
 6   cidemo_id                2891 non-null   Int64  
dtypes: Int32(2), Int64(1), float32(1), string(3)
memory usage: 155.3 KB


# Data Schema - Fact Table


In [84]:
 # ***** notification column schema and data type *****

# Read a single part file of the notification fact table (the
# news_publish_time=2016-04-02 folder) and print its column names,
# non-null counts and dtypes.
notification_part_file = "s3://destetlbucket/fact_table/notification/news_publish_time=2016-04-02/part-00000-f532f766-0d4b-4fdd-b106-512af5269e4a.c000.snappy.parquet"
dfs = wr.s3.read_parquet(path=notification_part_file)
dfs.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9186 entries, 0 to 9185
Data columns (total 33 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   imm_main_cic_id                      9186 non-null   Int32  
 1   imm_year                             9186 non-null   Int32  
 2   imm_month                            9186 non-null   Int32  
 3   imm_cntyl                            9186 non-null   Int32  
 4   imm_visa                             9186 non-null   Int32  
 5   imm_port                             9186 non-null   string 
 6   imm_arrival_date                     9186 non-null   object 
 7   imm_departure_date                   8722 non-null   object 
 8   imm_model                            9186 non-null   Int32  
 9   imm_address                          8662 non-null   string 
 10  imm_airline                          8891 non-null   string 
 11  imm_flight_no                 

22/06/23 13:46:36 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2289024 ms exceeds timeout 120000 ms
22/06/23 13:46:37 WARN SparkContext: Killing executors is not supported by current scheduler.
