In [15]:
import configparser
from datetime import datetime, timedelta
import os
from signal import signal, SIGPIPE, SIG_DFL
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, col, from_unixtime, monotonically_increasing_id, to_date
from pyspark.sql.types import *
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format, count, when
from s3path import S3Path

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
def create_spark_session():
    """
    Purpose:
        Build an access spark session for dealing data ETL of Data Lake
    :return: spark session
    """
    spark = SparkSession \
        .builder \
        .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:2.7.0") \
        .getOrCreate()
    # .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    # .config("spark.hadoop.fs.s3a.awsAccessKeyId", os.environ['AWS_ACCESS_KEY_ID']) \
    # .config("spark.hadoop.fs.s3a.awsSecretAccessKey", os.environ['AWS_SECRET_ACCESS_KEY'])

    return spark

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
spark = create_spark_session()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
spark

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

<pyspark.sql.session.SparkSession object at 0x7f80df6ed810>

In [17]:
dest_aws_s3_bucket = 'destetlbucket'

dim_bucket_path = S3Path(f"/{dest_aws_s3_bucket}/dimension_table")
fact_bucket_path = S3Path(f"/{dest_aws_s3_bucket}/fact_table")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [16]:
# Purpose: 1. Training Data: Depending on this aim, users could query the news table for machine learning before customers received news articles.

# 1. Filter news articles title and abstract
# 2. Label news title and abstract related articles that were matched covid word for user to split news_abstract words and language detect.

for path in dim_bucket_path.iterdir():
    path = str(path)
    if path.split("/")[-1] == 'news_article_data':
        df = spark.read.parquet(f"s3:/{path}")
        df.withColumn("label_covid_for_ml", when( (df.news_title.rlike("covid")) | (df.news_abstract.isNull()), "Covid_News_Abstract_Null")\
                   .when((df.news_title.rlike("covid")) | (df.news_abstract.isNotNull()), "Covid_News_Abstract")
                                        .otherwise("Other News")).filter(col('news_publish_time').isNotNull()).sort(col('news_publish_time').desc()).show()
    

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------------+-----------+--------------------+------------+--------------------+--------------------+--------------------+-----------------+-------------------+
|news_cord_uid|news_source|          news_title|news_licence|       news_abstract|        news_authors|            news_url|news_publish_time| label_covid_for_ml|
+-------------+-----------+--------------------+------------+--------------------+--------------------+--------------------+-----------------+-------------------+
|     rni54i0i|   Elsevier|Chapter 33 Avian ...|   els-covid|Abstract Infectio...|Khataby, Khadija;...|https://doi.org/1...|       2020-12-31|Covid_News_Abstract|
|     p8pns7r9|   Elsevier|Chapter 13 Biotec...|   els-covid|Abstract The appl...|Malik, Yashpal Si...|https://doi.org/1...|       2020-12-31|Covid_News_Abstract|
|     gy8aata6|   Elsevier|Specificity in PD...|   els-covid|Abstract Globular...|Amacher, Jeanine ...|https://doi.org/1...|       2020-12-31|Covid_News_Abstract|
|     sn7rswab|   Else

In [47]:
# Purpose: 2. People's behavior and pandemic spread by people's traffic tools: With this aim, users could join dimension tables like imm_address, and imm_travel to gain more information about the pandemic spread with people's behavior.

for path in dim_bucket_path.iterdir():
    path = str(path)
    if path.split("/")[-1] == 'imm_address':
        df_imm_address = spark.read.parquet(f"s3:/{path}")
    if path.split("/")[-1] == 'imm_travel_code':
        df_imm_travel_code = spark.read.parquet(f"s3:/{path}")


VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [34]:
for path in fact_bucket_path.iterdir():
    path = str(path)
    if path.split("/")[-1] == 'notification':
        df_notification = spark.read.parquet(f"s3:/{path}")

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [45]:

df_pandemic_spread = df_notification.join(df_imm_address).where(df_notification['value_of_alias_imm_destination_city'] == df_imm_address['code_of_imm_address'])\
                                    .join(df_imm_travel_code).where(df_notification['imm_model'] == df_imm_travel_code['code_of_imm_travel_code'])
            

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [49]:
df_pandemic_spread.select(col('news_publish_time'), col('imm_year'), col('imm_month'), col('value_of_imm_travel_code'), col('value_of_imm_address')).distinct().show()

VBox()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------------+--------+---------+------------------------+--------------------+
|news_publish_time|imm_year|imm_month|value_of_imm_travel_code|value_of_imm_address|
+-----------------+--------+---------+------------------------+--------------------+
|       2016-04-01|    2016|        4|                     Sea|             ARIZONA|
|       2016-04-02|    2016|        4|                    Land|           MINNESOTA|
|       2016-04-02|    2016|        4|                    Land|             ARIZONA|
|       2016-04-02|    2016|        4|                     Sea|               TEXAS|
|       2016-04-02|    2016|        4|                     Air|          NEW JERSEY|
|       2016-04-01|    2016|        4|            Not reported|          WASHINGTON|
|       2016-04-02|    2016|        4|                     Sea|            COLORADO|
|       2016-04-02|    2016|        4|                    Land|             FLORIDA|
|       2016-04-02|    2016|        4|            Not reported|  