In [1]:
import os
import configparser
import boto3
from datetime import datetime, timedelta
from pyspark.sql import SparkSession
# from signal import signal, SIGPIPE, SIG_DFL
from pyspark.sql.functions import col, monotonically_increasing_id, udf, to_date
from pyspark.sql.types import (StructType,
                               StructField,
                               StringType,
                               IntegerType,
                               DoubleType,
                               DateType,
                               FloatType)

# create local spark session

In [2]:
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .appName("spark_emr_udactity") \
    .config("spark.jars.packages","saurfang:spark-sas7bdat:2.0.0-s_2.11") \
    .getOrCreate()

Ivy Default Cache set to: /Users/oneforall_nick/.ivy2/cache
The jars for the packages stored in: /Users/oneforall_nick/.ivy2/jars
:: loading settings :: url = jar:file:/Users/oneforall_nick/spark-2.4.8-bin-hadoop2.7/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
saurfang#spark-sas7bdat added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a4d813b9-1549-4505-b79a-151a74526da0;1.0
	confs: [default]
	found saurfang#spark-sas7bdat;2.0.0-s_2.11 in spark-packages
	found com.epam#parso;2.0.8 in central
	found org.slf4j#slf4j-api;1.7.5 in central
	found org.apache.logging.log4j#log4j-api-scala_2.11;2.7 in central
	found org.scala-lang#scala-reflect;2.11.8 in central
:: resolution report :: resolve 299ms :: artifacts dl 7ms
	:: modules in use:
	com.epam#parso;2.0.8 from central in [default]
	org.apache.logging.log4j#log4j-api-scala_2.11;2.7 from central in [default]
	org.scala-lang#scala-reflect;2.11.8 from central in [default]
	org.slf4j#slf4

# Check spark session information

In [48]:
spark

In [49]:
# Stop spark session if I don't need it.
spark.stop()

In [4]:
# spark session setting configuration
spark.conf.set("spark.sql.shuffle.partitions", "5")
spark.sparkContext.getConf().getAll()

[('spark.files',
  'file:///Users/oneforall_nick/.ivy2/jars/saurfang_spark-sas7bdat-2.0.0-s_2.11.jar,file:///Users/oneforall_nick/.ivy2/jars/com.epam_parso-2.0.8.jar,file:///Users/oneforall_nick/.ivy2/jars/org.apache.logging.log4j_log4j-api-scala_2.11-2.7.jar,file:///Users/oneforall_nick/.ivy2/jars/org.slf4j_slf4j-api-1.7.5.jar,file:///Users/oneforall_nick/.ivy2/jars/org.scala-lang_scala-reflect-2.11.8.jar'),
 ('spark.app.name', 'spark_emr_udactity'),
 ('spark.executor.id', 'driver'),
 ('spark.driver.host', 'localhost'),
 ('spark.jars.packages', 'saurfang:spark-sas7bdat:2.0.0-s_2.11'),
 ('spark.submit.pyFiles',
  '/Users/oneforall_nick/.ivy2/jars/saurfang_spark-sas7bdat-2.0.0-s_2.11.jar,/Users/oneforall_nick/.ivy2/jars/com.epam_parso-2.0.8.jar,/Users/oneforall_nick/.ivy2/jars/org.apache.logging.log4j_log4j-api-scala_2.11-2.7.jar,/Users/oneforall_nick/.ivy2/jars/org.slf4j_slf4j-api-1.7.5.jar,/Users/oneforall_nick/.ivy2/jars/org.scala-lang_scala-reflect-2.11.8.jar'),
 ('spark.jars',
  'f

# Access AWS S3 to get my source data After I upload data from local to AWS S3

In [44]:
# ***** Access AWS Cloud configure ************
config = configparser.ConfigParser()
config.read_file(open('/Users/oneforall_nick/workspace/Udacity_capstone_project/cfg/dl.cfg'))
# config.read_file(open('dl.cfg'))

aws_access_key = config["ACCESS"]["AWS_ACCESS_KEY_ID"]
aws_secret_access_key = config["ACCESS"]["AWS_SECRET_ACCESS_KEY"]
aws_token = config["ACCESS"]["AWS_TOKEN"]
# Access data from AWS S3
# SOURCE_S3_BUCKET = config['S3']['SOURCE_S3_BUCKET']
SOURCE_S3_BUCKET = 's3://mydatapool'
# Write data to AWS S3
# DEST_S3_BUCKET = config['S3']['DEST_S3_BUCKET']
DEST_S3_BUCKET = 's3://destetlbucket'
# *********************************************

# ***** Local Testing configure ************
# SOURCE_S3_BUCKET = '/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/'
# DEST_S3_BUCKET = '/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/dest_data'

# ***** Local Testing configure *****************

session = boto3.Session(
    aws_access_key_id=aws_access_key,
    aws_secret_access_key=aws_secret_access_key,
    aws_session_token=aws_token
)

s3_access = session.resource('s3')


# Dimension: Label Data
*Data format: <br>*
    TXT
*This step will seperate multiple tables:*
- imm_cit_res
- imm_port
- imm_mod
- imm_addr
- imm_visa

In [47]:
# ****** immigration_labels_descriptions ******

# Get AWS S3 data Object: immigration_labels_descriptions.SAS
s3_object = s3_access.Bucket('mydatapool').Object('data/immigration_data/immigration_labels_descriptions.SAS').get()
text = s3_object['Body'].read()
context = text.decode(encoding ='utf-8')
# for obj in s3_object.objects.all():
#     print(obj.key)

context = context.replace('\t', '')


def code_mapping(context, idx):
    content_mapping = context[context.index(idx):]
    content_line_split = content_mapping[:content_mapping.index(
        ';')].split('\n')
    content_line_list = [line.replace("'", "")
                         for line in content_line_split]
    content_two_dims = [i.strip().split('=') for i in content_line_list[1:]]
    content_three_dims = [[i[0].strip(), i[1].strip().split(', ')[:][0], e]
                          for i in content_two_dims if len(i) == 2 for e in i[1].strip().split(', ')[1:]]
    return content_two_dims, content_three_dims

# TODO:OK: three columns
imm_cit_res_two, imm_cit_res_three = code_mapping(context, "i94cntyl")
df_imm_city_res_label = spark.sparkContext.parallelize(imm_cit_res_three).toDF(["col_of_imm_cntyl", "value_of_imm_cntyl", "value_of_imm_cntyl_organizations"]) \
    .withColumn("col_of_imm_cntyl", col("col_of_imm_cntyl").cast("Integer")) \
    .withColumn("value_of_imm_cntyl", col("value_of_imm_cntyl").cast("String")) \
    .withColumn("value_of_imm_cntyl", col("value_of_imm_cntyl_organizations").cast("String")) \

df_imm_city_res_label.show()

# TODO:OK: three columns
imm_port_two, imm_port_three = code_mapping(context, "i94prtl")
df_imm_destination_city = spark.sparkContext.parallelize(imm_port_three).toDF(["code_of_imm_destination_city", "value_of_imm_destination_city", "value_of_alias_imm_destination_city"]) \
                                                .withColumn("code_of_imm_destination_city", col("code_of_imm_destination_city").cast("String")) \
                                                .withColumn("value_of_imm_destination_city", col("value_of_imm_destination_city").cast("String")) \
                                                .withColumn("value_of_alias_imm_destination_city", col("value_of_alias_imm_destination_city").cast("String"))

df_imm_destination_city.show()

# TODO:OK: two columns
imm_mode_two, imm_mode_three = code_mapping(context, "i94model")
df_imm_travel_code = spark.sparkContext.parallelize(imm_mode_two).toDF(["code_of_imm_travel_code", "value_of_imm_travel_code"]) \
                                           .withColumn("code_of_imm_travel_code", col("code_of_imm_travel_code").cast("Integer")) \
                                           .withColumn("value_of_imm_travel_code", col("value_of_imm_travel_code").cast("String"))
df_imm_travel_code.show()

# TODO:OK: two columns
imm_addr_two, imm_addr_three = code_mapping(context, "i94addrl")
df_imm_address = spark.sparkContext.parallelize(imm_addr_two).toDF(["code_of_imm_address", "value_of_imm_address"]) \
    .withColumn("code_of_imm_address", col("code_of_imm_address").cast("String")) \
    .withColumn("value_of_imm_address", col("value_of_imm_address").cast("String"))
df_imm_address.show()


# TODO:OK: two columns
imm_visa = {'1': 'Business',
            '2': 'Pleasure',
            '3': 'Student'}

df_imm_visa = spark.sparkContext.parallelize(imm_visa.items()).toDF(["code_of_imm_visa", "value_of_imm_visa"]) \
                                    .withColumn("code_of_imm_visa", col("code_of_imm_visa").cast("Integer")) \
                                    .withColumn("value_of_imm_visa", col("value_of_imm_visa").cast("String"))
df_imm_visa.show()

+----------------+--------------------+--------------------------------+
|col_of_imm_cntyl|  value_of_imm_cntyl|value_of_imm_cntyl_organizations|
+----------------+--------------------+--------------------------------+
|             582|and Not Reported ...|            and Not Reported ...|
|             582|   no land arrivals)|               no land arrivals)|
|             717|        ST EUSTATIUS|                    ST EUSTATIUS|
|             717|                SABA|                            SABA|
|             245|                 PRC|                             PRC|
|             473|      FED. STATES OF|                  FED. STATES OF|
|             471|            NORTHERN|                        NORTHERN|
+----------------+--------------------+--------------------------------+

+----------------------------+-----------------------------+-----------------------------------+
|code_of_imm_destination_city|value_of_imm_destination_city|value_of_alias_imm_destination_city|
+-

# Dimension: News
- Data format: <br>
    CSV
- explain: <br>
    display data persist in local memory

In [26]:
# file path: data >> news_article
"""Table: news_article schema
pk: cord_uid -> news_cord_uid
1. source_x -> news_source
    schema: StringType()
2. title -> news_title
    schema: StringType()
3. license -> news_licence
    schema: StringType()
4. abstract -> news_abstract
    schema: StringType()
5. publish_time -> news_publish_time (fk)
    schema: TimestampType()
6. authors -> news_authors
    schema: StringType()
7. url -> news_url
    schema: StringType()
"""
data_news = "/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data/news_data/metadata.csv"

# news_schema = StructType([
#     StructField(name="news_cord_uid", dataType=StringType(), nullable=True),
#     StructField(name="news_source", dataType=StringType(), nullable=True),
#     StructField(name="news_title", dataType=StringType(), nullable=True),
#     StructField(name="news_licence", dataType=StringType(), nullable=True),
#     StructField(name="news_abstract", dataType=StringType(), nullable=True),
#     StructField(name="news_publish_time", dataType=DateType(), nullable=True),
#     StructField(name="news_authors", dataType=StringType(), nullable=True),
#     StructField(name="news_url", dataType=StringType(), nullable=True)
# ])

df_news = spark.read.options(header=True, delimiter=',').csv(path=data_news)

df_news = df_news.withColumn("news_cord_uid", col("cord_uid").cast("String")) \
    .withColumn("news_source", col("source_x").cast("String")) \
    .withColumn("news_title", col("title").cast("String")) \
    .withColumn("news_licence", col("license").cast("String")) \
    .withColumn("news_abstract", col("abstract").cast("String")) \
    .withColumn("news_publish_time", to_date(col("publish_time"), "yyyy-MM-dd")) \
    .withColumn("news_authors", col("authors").cast("String")) \
    .withColumn("news_url", col("url").cast("String")) \
    .select(col("news_cord_uid"),
            col("news_source"),
            col("news_title"),
            col("news_licence"),
            col("news_abstract"),
            col("news_publish_time"),
            col("news_authors"),
            col("news_url"))

df_news_tmp = df_news.createOrReplaceTempView("news_article_data")

df_news_tmp = spark.sql(
    "SELECT DISTINCT news_publish_time FROM news_article_data")

df_news_tmp.persist()

df_news_tmp.explain()

# df_news_tmp.unpersist()

== Physical Plan ==
InMemoryTableScan [news_publish_time#1508]
   +- InMemoryRelation [news_publish_time#1508], StorageLevel(disk, memory, 1 replicas)
         +- *(2) HashAggregate(keys=[news_publish_time#1508], functions=[])
            +- Exchange hashpartitioning(news_publish_time#1508, 5)
               +- *(1) HashAggregate(keys=[news_publish_time#1508], functions=[])
                  +- *(1) Project [cast(cast(unix_timestamp(publish_time#1378, yyyy-MM-dd, Some(Asia/Taipei)) as timestamp) as date) AS news_publish_time#1508]
                     +- *(1) FileScan csv [publish_time#1378] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data/news..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<publish_time:string>


In [38]:
df_news.show(n=3, truncate=10)

+-------------+-----------+----------+------------+-------------+-----------------+------------+----------+
|news_cord_uid|news_source|news_title|news_licence|news_abstract|news_publish_time|news_authors|  news_url|
+-------------+-----------+----------+------------+-------------+-----------------+------------+----------+
|     vho70jcx|    biorxiv|SIANN: ...|     biorxiv|   Next-ge...|       2014-01-10|  Samuel ...|https:/...|
|     i9tbix2v|    biorxiv|Spatial...|     biorxiv|   An emer...|       2014-06-04|  Lin WAN...|https:/...|
|     62gfisc6|    biorxiv|Sequenc...|     biorxiv|   Germlin...|       2014-07-03|  Corey T...|https:/...|
+-------------+-----------+----------+------------+-------------+-----------------+------------+----------+
only showing top 3 rows



# Dimension Table: Us Cities Demographics data
- Data format: <br>
    CSV
- explain: <br>
    display data persist in local memory

In [41]:
# Create a us-cities data dimension table
"""Table: us_cities_demographics schema
pk: generated -> cidemo_id
    schema: IntegerType()
1. City -> cidemo_city
    schema: StringType()
2. State -> cidemo_state
    schema: StringType()
3. Median Age -> cidemo_median_age
    schema: FloatType()
4. Total Population -> cidemo_total_population
    schema: IntegerType()
5. State Code -> cidemo_state_code (fk)
    schema: StringType()
6. Count -> cidemo_count
    schema: IntegerType()
"""

data_us_cities_demographics = "/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data/usCitiesDemographics_data/usCitiesDemo.csv"

# TODO -> Must be defined a function that generated each table schema:
us_cities_demographics_data_schema = StructType([
    StructField(name="cidemo_city", dataType=StringType(), nullable=True),
    StructField(name="cidemo_state", dataType=StringType(), nullable=True),
    StructField(name="cidemo_median_age", dataType=FloatType(), nullable=True),
    StructField(name="cidemo_total_population", dataType=IntegerType(), nullable=True),
    StructField(name="cidemo_state_code", dataType=StringType(), nullable=True),
    StructField(name="cidemo_count", dataType=IntegerType(), nullable=True)
])

# Using pyspark to read csv file
df_us_cities_demographics = spark.read.options(header=True, delimiter=';').csv(data_us_cities_demographics)
"""
root
 |-- cidemo_city: string (nullable = true)
 |-- cidemo_state: string (nullable = true)
 |-- cidemo_median_age: float (nullable = true)
 |-- cidemo_total_population: integer (nullable = true)
 |-- cidemo_state_code: string (nullable = true)
 |-- cidemo_count: integer (nullable = true)
"""
df_us_cities_demographics = df_us_cities_demographics.withColumn("cidemo_city", col("City").cast("String")) \
                    .withColumn("cidemo_state", col("State").cast("String")) \
                        .withColumn("cidemo_median_age", col("Median Age").cast("Float")) \
                            .withColumn("cidemo_male_population", col("Male Population").cast("Integer")) \
                                .withColumn("cidemo_female_population", col("Female Population").cast("Integer")) \
                                    .withColumn("cidemo_total_population", col("Total Population").cast("Integer")) \
                                            .withColumn("cidemo_number_of_veterans", col("Number of Veterans").cast("Integer")) \
                                                .withColumn("cidemo_foreign_born", col("Foreign-born").cast("Integer")) \
                                                    .withColumn("cidemo_average_household_size", col("Average Household Size").cast("Float")) \
                                                        .withColumn("cidemo_state_code", col("State Code").cast("String")) \
                                                            .withColumn("cidemo_race", col("Race").cast("String")) \
    .withColumn("cidemo_count", col("Count").cast("Integer")) \
                    .select(col("cidemo_city"),
                            col("cidemo_state"),
                            col("cidemo_median_age"),
                            col("cidemo_total_population"),
                            col("cidemo_state_code"),
                            col("cidemo_count"))

# Auto-generated series of id
df_us_cities_demographics = df_us_cities_demographics.withColumn("cidemo_id", monotonically_increasing_id())

df_us_cities_demographics_temp = df_us_cities_demographics.createOrReplaceTempView("us_cities_demographics_data")

df_us_cities_demographics_temp = spark.sql("SELECT * FROM us_cities_demographics_data")

df_us_cities_demographics_temp.persist()

df_us_cities_demographics_temp.explain()

# df_us_cities_demographics_temp.unpersist()

== Physical Plan ==
InMemoryTableScan [cidemo_city#2180, cidemo_state#2194, cidemo_median_age#2209, cidemo_total_population#2260, cidemo_state_code#2342, cidemo_count#2389, cidemo_id#2420L]
   +- InMemoryRelation [cidemo_city#2180, cidemo_state#2194, cidemo_median_age#2209, cidemo_total_population#2260, cidemo_state_code#2342, cidemo_count#2389, cidemo_id#2420L], StorageLevel(disk, memory, 1 replicas)
         +- *(1) Project [City#2156 AS cidemo_city#2180, State#2157 AS cidemo_state#2194, cast(Median Age#2158 as float) AS cidemo_median_age#2209, cast(Total Population#2161 as int) AS cidemo_total_population#2260, State Code#2165 AS cidemo_state_code#2342, cast(Count#2167 as int) AS cidemo_count#2389, monotonically_increasing_id() AS cidemo_id#2420L]
            +- *(1) FileScan csv [City#2156,State#2157,Median Age#2158,Male Population#2159,Female Population#2160,Total Population#2161,Number of Veterans#2162,Foreign-born#2163,Average Household Size#2164,State Code#2165,Race#2166,Count#2

In [43]:
df_us_cities_demographics_temp.show()

+----------------+--------------+-----------------+-----------------------+-----------------+------------+---------+
|     cidemo_city|  cidemo_state|cidemo_median_age|cidemo_total_population|cidemo_state_code|cidemo_count|cidemo_id|
+----------------+--------------+-----------------+-----------------------+-----------------+------------+---------+
|   Silver Spring|      Maryland|             33.8|                  82463|               MD|       25924|        0|
|          Quincy| Massachusetts|             41.0|                  93629|               MA|       58723|        1|
|          Hoover|       Alabama|             38.5|                  84839|               AL|        4759|        2|
|Rancho Cucamonga|    California|             34.5|                 175232|               CA|       24437|        3|
|          Newark|    New Jersey|             34.6|                 281913|               NJ|       76402|        4|
|          Peoria|      Illinois|             33.1|             

# Dimension: Immigration data
- Data format: <br>
    SAS
- explain: <br>
    display data persist in local memory

In [5]:
# ****** imm_data ******
imm_data = "/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data/immigration_data/immigration_apr16_sub.sas7bdat"
df_imm_data = spark.read.format('com.github.saurfang.sas.spark').load(imm_data)

In [31]:
df_imm_data.count()

                                                                                

3096313

In [10]:
df_imm_data.show(n=5, truncate=5)

+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+-------+------+------+-------+------+-----+--------+
|cicid|i94yr|i94mon|i94cit|i94res|i94port|arrdate|i94mode|i94addr|depdate|i94bir|i94visa|count|dtadfile|visapost|occup|entdepa|entdepd|entdepu|matflag|biryear|dtaddto|gender|insnum|airline|admnum|fltno|visatype|
+-----+-----+------+------+------+-------+-------+-------+-------+-------+------+-------+-----+--------+--------+-----+-------+-------+-------+-------+-------+-------+------+------+-------+------+-----+--------+
|  6.0|20...|   4.0| 692.0| 692.0|    XXX|  20...|   null|   null|   null|  37.0|    2.0|  1.0|    null|    null| null|      T|   null|      U|   null|  19...|  10...|  null|  null|   null| 1....| null|      B2|
|  7.0|20...|   4.0| 254.0| 276.0|    ATL|  20...|    1.0|     AL|   null|  25.0|    3.0|  1.0|   20...|     SEO| null|      G|   null|      Y|   null| 

# Dimension: Immigration personal data
- Data format: <br>
    SAS
- explain: <br>
    display data persist in local memory

In [12]:
"""Table: immigration_personal schema -> According to this person data that I will make a core data table to display notifications information.
pk: cicid -> imm_per_cic_id
    schema: StringType()
1. biryear -> imm_person_birth_year
    schema: IntegerType()
2. gender -> imm_person_gender
    schema: StringType()
3. visatype -> imm_person_visa_type
    schema: StringType()
"""
# show(n=5, truncate=5)
df_immigration_personal = df_imm_data.withColumn("imm_per_cic_id", col("cicid").cast("String"))\
           .withColumn("imm_person_birth_year", col("biryear").cast("Integer"))\
           .withColumn("imm_person_gender", col("gender").cast("String"))\
           .withColumn("imm_visatype", col("visatype").cast("String")).select(col("imm_per_cic_id"), \
                                                                              col("imm_person_birth_year"), \
                                                                              col("imm_person_gender"), \
                                                                              col("imm_visatype"))

df_immigration_personal_tmp = df_immigration_personal.createOrReplaceTempView("imm_personal")

df_immigration_personal_tmp = spark.sql("SELECT * FROM imm_personal")

df_immigration_personal_tmp.persist()

df_immigration_personal_tmp.explain()

== Physical Plan ==
InMemoryTableScan [imm_per_cic_id#611, imm_person_birth_year#641, imm_person_gender#672, imm_visatype#704]
   +- InMemoryRelation [imm_per_cic_id#611, imm_person_birth_year#641, imm_person_gender#672, imm_visatype#704], StorageLevel(disk, memory, 1 replicas)
         +- *(1) Project [cast(cicid#0 as string) AS imm_per_cic_id#611, cast(biryear#20 as int) AS imm_person_birth_year#641, gender#22 AS imm_person_gender#672, visatype#27 AS imm_visatype#704]
            +- *(1) Scan SasRelation(/Users/oneforall_nick/workspace/Udacity_capstone_project/airflow/data/immigration_data/immigration_apr16_sub.sas7bdat,null,Configuration: core-default.xml, core-site.xml, mapred-default.xml, mapred-site.xml, yarn-default.xml, yarn-site.xml, hdfs-default.xml, hdfs-site.xml,0) [cicid#0,i94yr#1,i94mon#2,i94cit#3,i94res#4,i94port#5,arrdate#6,i94mode#7,i94addr#8,depdate#9,i94bir#10,i94visa#11,count#12,dtadfile#13,visapost#14,occup#15,entdepa#16,entdepd#17,entdepu#18,matflag#19,biryear#20,

In [30]:
df_immigration_personal_tmp.show(n=5)

+--------------+---------------------+-----------------+------------+
|imm_per_cic_id|imm_person_birth_year|imm_person_gender|imm_visatype|
+--------------+---------------------+-----------------+------------+
|           6.0|                 1979|             null|          B2|
|           7.0|                 1991|                M|          F1|
|          15.0|                 1961|                M|          B2|
|          16.0|                 1988|             null|          B2|
|          17.0|                 2012|             null|          B2|
+--------------+---------------------+-----------------+------------+
only showing top 5 rows



# Dimension: Immigration main data
- Data format: <br>
    SAS
- explain: <br>
    display data persist in local memory

In [14]:
""""Table: immigration_main_information schema
pk: cicid -> imm_main_cic_id
1. i94yr: 4 digit year of the arrival  -> imm_year
2. i94mon: numeric month of the arrival -> imm_month
3. i94citi&i94res: 3 digit code of origin city -> imm_citi_res -> imm_cntyl
4. i94visa: reason for immigration -> imm_visa
    three categories:
        1 = Business
        2 = Pleasure
        3 = Student
5. i94port: 3 character code of destination city --> Foreign key (used to map to USDemographics and City Temperature data) -> imm_port
6. arrdate: arrival date of the departure -> imm_arrival_date:
7. deptdate: departure date
date_add
7. i94mode: 1 digit travel code -> imm_model:
    four categories:
        1 = 'Air'
	    2 = 'Sea'
	    3 = 'Land'
	    9 = 'Not reported'
8. i94addr -> imm_address
    ex: 'AL'='ALABAMA'
9. airline -> imm_airline
10 fltno -> imm_flight_no
    schema: StringType()
"""


def convert_to_datetime(days: DoubleType) -> datetime:
    """convert_to_datetime converts days to datetime format

    Args:
        days (DoubleType): from sas arrive or departure date

    Returns:
        datetime: added days to datetime format result.
    """
    if days is not None:
        date = datetime.strptime('1960-01-01', '%Y-%m-%d')

        return date + timedelta(days=days)

udf_convert_to_datetime = udf(lambda x: convert_to_datetime(x), DateType())

immigration_main_information = df_imm_data.withColumn("imm_main_cic_id", col("cicid").cast("Integer"))\
            .withColumn("imm_year", col("i94yr").cast("Integer"))\
                .withColumn("imm_month", col("i94mon").cast("Integer"))\
                    .withColumn("imm_cntyl", col("i94cit").cast("Integer"))\
                        .withColumn("imm_visa", col("i94visa").cast("Integer"))\
                            .withColumn("imm_port", col("i94port").cast("String"))\
                                .withColumn("imm_arrival_date", udf_convert_to_datetime(col("arrdate")))\
                                    .withColumn("imm_departure_date", udf_convert_to_datetime(col("depdate")))\
                                        .withColumn("imm_model", col("i94mode").cast("Integer"))\
                                            .withColumn("imm_address", col("i94addr").cast("String"))\
                                                .withColumn("imm_airline", col("airline").cast("String"))\
                                                    .withColumn("imm_flight_no", col("fltno").cast("String"))\
        .select(col('imm_main_cic_id'), \
                    col('imm_year'),\
                        col('imm_month'),\
                            col('imm_cntyl'),\
                                col('imm_visa'),\
                                    col('imm_port'),\
                                        col('imm_arrival_date'),\
                                            col('imm_departure_date'),\
                                                col('imm_model'),\
                                                    col('imm_address'),\
                                                        col('imm_airline'),\
                                                            col('imm_flight_no'))

df_immigration_main_information = immigration_main_information.createOrReplaceTempView(
    "immigration_main_information_data")

df_immigration_main_information = spark.sql("SELECT * FROM immigration_main_information_data")

df_immigration_main_information.persist()

df_immigration_main_information.explain()



== Physical Plan ==
InMemoryTableScan [imm_main_cic_id#785, imm_year#815, imm_month#846, imm_cntyl#878, imm_visa#911, imm_port#945, imm_arrival_date#981, imm_departure_date#1018, imm_model#1055, imm_address#1093, imm_airline#1132, imm_flight_no#1172]
   +- InMemoryRelation [imm_main_cic_id#785, imm_year#815, imm_month#846, imm_cntyl#878, imm_visa#911, imm_port#945, imm_arrival_date#981, imm_departure_date#1018, imm_model#1055, imm_address#1093, imm_airline#1132, imm_flight_no#1172], StorageLevel(disk, memory, 1 replicas)
         +- *(2) Project [cast(cicid#0 as int) AS imm_main_cic_id#785, cast(i94yr#1 as int) AS imm_year#815, cast(i94mon#2 as int) AS imm_month#846, cast(i94cit#3 as int) AS imm_cntyl#878, cast(i94visa#11 as int) AS imm_visa#911, i94port#5 AS imm_port#945, pythonUDF0#1237 AS imm_arrival_date#981, pythonUDF1#1238 AS imm_departure_date#1018, cast(i94mode#7 as int) AS imm_model#1055, i94addr#8 AS imm_address#1093, airline#24 AS imm_airline#1132, fltno#26 AS imm_flight_no#

In [33]:
df_immigration_main_information.printSchema()

root
 |-- imm_main_cic_id: integer (nullable = true)
 |-- imm_year: integer (nullable = true)
 |-- imm_month: integer (nullable = true)
 |-- imm_cntyl: integer (nullable = true)
 |-- imm_visa: integer (nullable = true)
 |-- imm_port: string (nullable = true)
 |-- imm_arrival_date: date (nullable = true)
 |-- imm_departure_date: date (nullable = true)
 |-- imm_model: integer (nullable = true)
 |-- imm_address: string (nullable = true)
 |-- imm_airline: string (nullable = true)
 |-- imm_flight_no: string (nullable = true)



# Fact: Nofification

In [28]:
# Notification Table
"""
t2.imm_main_cic_id
t2.imm_per_cic_id
t2.news_cord_uid
src.cidemo_id
src.value_of_imm_destination_city
t2.news_title
t2.news_abstract
t2.news_publish_time
t2.news_authors
"""

#  ** t1: join imm two tables
#  ** t2: join news table with t1
#  ** t3: join us cities table with t2

df_notification = spark.sql(
        "WITH t1 AS \
            (SELECT * \
               FROM immigration_main_information_data imid \
             INNER JOIN imm_personal ip \
                    ON imid.imm_main_cic_id = ip.imm_per_cic_id \
                 WHERE imid.imm_year = 2016 \
            ), t2 AS \
                (SELECT * \
                   FROM t1 \
                 INNER JOIN news_article_data nad \
                        ON t1.imm_arrival_date = nad.news_publish_time \
            ) \
            SELECT  * \
              FROM t2 \
            LIMIT 5 \
        "
    )

df_notification.show(n=5, truncate=3)





+---------------+--------+---------+---------+--------+--------+----------------+------------------+---------+-----------+-----------+-------------+--------------+---------------------+-----------------+------------+-------------+-----------+----------+------------+-------------+-----------------+------------+--------+
|imm_main_cic_id|imm_year|imm_month|imm_cntyl|imm_visa|imm_port|imm_arrival_date|imm_departure_date|imm_model|imm_address|imm_airline|imm_flight_no|imm_per_cic_id|imm_person_birth_year|imm_person_gender|imm_visatype|news_cord_uid|news_source|news_title|news_licence|news_abstract|news_publish_time|news_authors|news_url|
+---------------+--------+---------+---------+--------+--------+----------------+------------------+---------+-----------+-----------+-------------+--------------+---------------------+-----------------+------------+-------------+-----------+----------+------------+-------------+-----------------+------------+--------+
|            982|     201|        4| 

                                                                                

22/06/23 23:02:40 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2361589 ms exceeds timeout 120000 ms
22/06/23 23:02:40 ERROR TaskSchedulerImpl: Lost executor driver on localhost: Executor heartbeat timed out after 2361589 ms
22/06/23 23:02:40 WARN BlockManagerMasterEndpoint: No more replicas available for broadcast_10_piece0 !
22/06/23 23:02:40 WARN BlockManagerMasterEndpoint: No more replicas available for broadcast_11_piece0 !
22/06/23 23:02:40 WARN SparkContext: Killing executors is not supported by current scheduler.
