In [2]:
import pandas as pd
import numpy as np

import configparser
from pyspark.sql import SparkSession, Window
from pyspark import SparkContext, SparkConf
from pyspark.sql.functions import udf, col, monotonically_increasing_id, row_number
from pyspark.sql.functions import year, month, dayofmonth, hour, weekofyear, date_format
from pyspark.sql.types import TimestampType, DateType, StringType
from pyspark.sql import functions as F

from datetime import datetime, timedelta
import os

import config

from etl import create_spark_session

In [4]:
def convert_datetime(num_days):
    """
    Convert a day count relative to 1960-01-01 into a ``datetime.date``.

    The value is parsed with ``int(float(num_days))``, so numeric strings
    such as ``"20573.0"`` are accepted and any fractional part is truncated
    toward zero. Negative offsets yield dates before 1960-01-01.

    Args:
        num_days: Day offset as an int, float, or numeric string.

    Returns:
        The corresponding ``datetime.date``, or ``None`` when the value is
        missing (``None``), not numeric, NaN/infinite, or so large that the
        resulting date is out of range.
    """
    try:
        offset_days = int(float(num_days))
        return (datetime(1960, 1, 1) + timedelta(days=offset_days)).date()
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / non-numeric object; ValueError: unparsable string
        # or NaN; OverflowError: infinity or an out-of-range day count.
        # Anything else (e.g. KeyboardInterrupt) is deliberately not swallowed.
        return None

In [3]:
# Arguments for etl.create_spark_session; what each name controls is defined
# in etl.py (not visible from this notebook).
config_app_name = "udac_config"
session_app_name = "udac_cap"

# Spark session shared by every cell below.
spark = create_spark_session(config_app_name, session_app_name)


# Output directory; in this notebook it is only referenced by the disabled
# call below.
output_data = "test_output/"

# Input for the cities-demographics ETL function (the call is currently disabled).
input_data = "data/us_cities_demographics.csv"
#process_cities_demographics_data(spark, input_data, output_data)

# CSV inputs loaded by the next cell: the immigration/tourism sample plus the
# airport and country lookup tables.
tourism_data = "data/immigration_data_sample.csv"
airport_codes = "data/airport_dict.csv"
country_codes = "data/country_codes.csv"

In [17]:
# --- Load raw inputs ---------------------------------------------------------
# Each file is read with a header row and no explicit schema.
# NOTE(review): a ';' delimiter option for the tourism file was previously
# commented out here — confirm the delimiter used by the full dataset.
tourism = spark.read.option('header', True).csv(tourism_data)
airports = spark.read.option('header', True).csv(airport_codes)
countries = spark.read.option('header', True).csv(country_codes)

# --- Airport -> city lookup --------------------------------------------------
# "city" is the text before the first comma of the "airport" column,
# title-cased; the original "airport" column is then dropped.
airports2 = (
    airports
    .withColumn("city", F.initcap(F.split(col("airport"), ",").getItem(0)))
    .drop("airport")
)

# --- Country <-> I94 code lookup ---------------------------------------------
# Title-case the country name and cast the code from string to integer.
countries2 = (
    countries
    .withColumn("country", F.initcap("country"))
    .withColumn("country_code", col("country_code").cast('integer'))
)

# --- Tourism data ------------------------------------------------------------
# Spark UDF around convert_datetime (day offset from 1960-01-01 -> date).
# Defined once; values that cannot be converted become null.
udf_datetime_from_sas = udf(convert_datetime, DateType())

# Columns removed from the tourism frame. The raw arrdate/depdate columns are
# dropped because they are replaced by arrival_date/departure_date below.
cols_to_drop = ["insnum", "dtadfile", "fltno", "i94bir", "occup", "matflag",
                "admnum", "entdepu", "visapost", "arrdate", "depdate"]

tourism2 = (
    tourism
    .withColumn("arrival_date", udf_datetime_from_sas(col("arrdate")))
    .withColumn("departure_date", udf_datetime_from_sas(col("depdate")))
    .drop(*cols_to_drop)
)

In [18]:
# Schema check: arrival_date and departure_date should be listed as `date`,
# and the raw arrdate/depdate columns should no longer appear.
tourism2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cicid: string (nullable = true)
 |-- i94yr: string (nullable = true)
 |-- i94mon: string (nullable = true)
 |-- i94cit: string (nullable = true)
 |-- i94res: string (nullable = true)
 |-- i94port: string (nullable = true)
 |-- i94mode: string (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- i94visa: string (nullable = true)
 |-- count: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- biryear: string (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- arrival_date: date (nullable = true)
 |-- departure_date: date (nullable = true)



In [12]:
# NOTE(review): this repeats the schema check in the previous cell. The output
# recorded below comes from an earlier execution (count 12) and has no
# arrival_date column — re-run after Restart & Run All or delete this cell.
tourism2.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- cicid: string (nullable = true)
 |-- i94yr: string (nullable = true)
 |-- i94mon: string (nullable = true)
 |-- i94cit: string (nullable = true)
 |-- i94res: string (nullable = true)
 |-- i94port: string (nullable = true)
 |-- i94mode: string (nullable = true)
 |-- i94addr: string (nullable = true)
 |-- i94visa: string (nullable = true)
 |-- count: string (nullable = true)
 |-- entdepa: string (nullable = true)
 |-- entdepd: string (nullable = true)
 |-- biryear: string (nullable = true)
 |-- dtaddto: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- airline: string (nullable = true)
 |-- visatype: string (nullable = true)
 |-- departure_date: date (nullable = true)

