In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StringType
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import udf, col, array_contains, when, concat_ws, to_date, to_timestamp, lit, date_format
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [2]:
# Create the Spark session for this notebook, or reuse the one that is
# already running in this kernel (getOrCreate does not start a second one).
spark = (
    SparkSession.builder
    .appName("Spark Covid Earlier Transformations")
    .getOrCreate()
)

In [3]:
# Read every daily-report CSV found under data/ (files published by Johns
# Hopkins University on GitHub) into one Spark DataFrame.
# header=True takes column names from the files' first row; inferSchema=True
# asks Spark to guess each column's type from the data.
covid_daily_report = spark.read.csv(
    "data/*.csv",
    header=True,
    inferSchema=True,
)


#                    

In [4]:
#covid_daily_report.createOrReplaceTempView("cases_table")

In [5]:
# Show the column names and types Spark assigned to the combined raw reports.
covid_daily_report.printSchema()

root
 |-- FIPS: string (nullable = true)
 |-- Admin2: string (nullable = true)
 |-- Province_State: string (nullable = true)
 |-- Country_Region: string (nullable = true)
 |-- Last_Update: string (nullable = true)
 |-- Lat: string (nullable = true)
 |-- Long_: string (nullable = true)
 |-- Confirmed: string (nullable = true)
 |-- Deaths: string (nullable = true)
 |-- Recovered: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- Combined_Key: string (nullable = true)
 |-- Incident_Rate: string (nullable = true)
 |-- Case_Fatality_Ratio: string (nullable = true)



### JHU changed the dataset schema over time
*Initial schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered*

*Second schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key*

*Third schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude*

*Fourth schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio*

In [6]:
# spark.sql("SELECT COUNT(DISTINCT 'Country/Region') AS Country, COUNT(DISTINCT 'Province/State') AS State \
#             FROM cases_table").show()

In [7]:
# print((str(covid_daily_report.columns)))
# type(covid_daily_report)
# covid_daily_report.head(5)


In [8]:
# Start the transformation chain from the raw reports and collapse every
# Country_Region value that contains the text "China" into plain "China";
# all other values pass through unchanged.
df2 = covid_daily_report.withColumn(
    "Country_Region",
    when(col("Country_Region").contains("China"), "China")
    .otherwise(col("Country_Region")),
)

In [9]:
# Relabel any Country_Region containing "Republic of Korea" as "Korea, South";
# every other value is kept as it is.
is_republic_of_korea = col("Country_Region").contains("Republic of Korea")
df2 = df2.withColumn(
    "Country_Region",
    when(is_republic_of_korea, "Korea, South").otherwise(col("Country_Region")),
)

In [10]:
# Replace any Country_Region containing "Cote d'Ivoir" with the
# apostrophe-free label "Cote d Ivoir"; other values are left untouched.
has_apostrophe_label = col("Country_Region").contains("Cote d'Ivoir")
df2 = df2.withColumn(
    "Country_Region",
    when(has_apostrophe_label, "Cote d Ivoir").otherwise(col("Country_Region")),
)

In [11]:
# Replace any Country_Region containing the literal text "Taiwan*" with
# "Taiwan". contains() is a plain substring test, so the asterisk is matched
# as a character rather than acting as a wildcard.
has_starred_taiwan = col("Country_Region").contains("Taiwan*")
df2 = df2.withColumn(
    "Country_Region",
    when(has_starred_taiwan, "Taiwan").otherwise(col("Country_Region")),
)

In [12]:
# Step 1: back-fill Combined_Key for rows that have none, joining whichever of
# Province_State / Country_Region are present with ", " (concat_ws skips
# nulls). The null test is on Combined_Key itself: testing Country_Region
# instead would leave missing keys unfilled and, for a row whose country is
# null, would overwrite an existing key with just the province.
df2 = df2.withColumn(
    "Combined_Key",
    when(
        col("Combined_Key").isNull(),
        concat_ws(", ", col("Province_State"), col("Country_Region")),
    ).otherwise(col("Combined_Key")),
)

# Step 2: replace any key containing the literal text "Taiwan*" with "Taiwan".
# contains() is a plain substring test, so "*" is not a wildcard here.
df2 = df2.withColumn(
    "Combined_Key",
    when(col("Combined_Key").contains("Taiwan*"), "Taiwan")
    .otherwise(col("Combined_Key")),
)

In [13]:
# Case-fatality ratio as a percentage: Deaths divided by Confirmed, times 100.
# NOTE(review): both inputs are string columns in the loaded schema, so this
# relies on Spark casting them implicitly for the division — confirm that is
# the intended behaviour for malformed values.
df2 = df2.withColumn(
    "CaseFatalityRatio",
    (col("Deaths") / col("Confirmed")) * 100,
)

In [14]:
# Guarantee an "Incidence_Rate" column for the final select.
# Membership is tested by column NAME only. Comparing a whole StructField
# (name + type + nullability) against the schema reports "missing" whenever
# the column exists with a non-string type, and would then overwrite real
# data with the constant below.
if "Incidence_Rate" not in df2.columns:
    if "Incident_Rate" in df2.columns:
        # The loaded schema exposes the metric under the spelling
        # "Incident_Rate"; carry those values over instead of discarding
        # them. Rows with no (or a non-numeric) rate keep the 0.0 default.
        df2 = df2.withColumn(
            "Incidence_Rate",
            F.coalesce(col("Incident_Rate").cast("double"), lit(0.0)),
        )
    else:
        # Neither spelling is present: fall back to a constant 0.0 column.
        df2 = df2.withColumn("Incidence_Rate", lit(0.0))

In [15]:
# get_datetime = udf(date_convert, TimestampType())
# df2 = df2.withColumn("Date", get_datetime('Last_Update'))

In [16]:
# Derive zero-padded Year / Month / Day strings from Last_Update.
# date_format is given the raw string column, so Spark converts it to a
# timestamp implicitly before formatting.
# NOTE(review): values Spark cannot interpret as a timestamp presumably come
# out as null here — confirm against the actual Last_Update formats in data/.
# The former temporary "Last_up" column (an explicit to_timestamp parse) was
# created and dropped without ever being read, so it has been removed; the
# three output columns are unchanged.
df2 = (
    df2
    .withColumn("Year", date_format(col("Last_Update"), "yyyy"))
    .withColumn("Month", date_format(col("Last_Update"), "MM"))
    .withColumn("Day", date_format(col("Last_Update"), "dd"))
)

In [17]:
# Compact yyyyMMdd key built from Last_Update (for example 20200122).
datekey = date_format("Last_Update", "yyyyMMdd")
df2 = df2.withColumn("Datekey", datekey)

In [18]:
# Project the frame down to the output layout, in output order. Columns given
# as plain names keep their name; the others are renamed via alias().
output_columns = [
    "Datekey",
    "Year",
    "Month",
    "Day",
    col("Province_State").alias("State"),
    col("Country_Region").alias("Country"),
    col("Last_Update").alias("Date"),
    col("Lat").alias("Latitude"),
    col("Long_").alias("Longitude"),
    "Confirmed",
    "Deaths",
    "Recovered",
    "Active",
    "Incidence_Rate",
    "CaseFatalityRatio",
    "Combined_Key",
]
df2 = df2.select(*output_columns)

In [19]:
# Builds (without evaluating) a Column expression that tests whether Country
# contains the literal text "Taiwan*"; the cell only displays its repr and
# does not filter or modify df2.
df2.Country.contains('Taiwan*')

Column<'contains(Country, Taiwan*)'>

In [20]:
# window = Window.partitionBy("Country","State").orderBy("Date")

# data = F.col("Confirmed") - F.coalesce(F.lag(F.col("Confirmed")).over(window), F.lit(0))
# df2.withColumn("newConfirmed", data)

In [21]:
# Show the column names and types of the frame after the final select.
df2.printSchema()

root
 |-- Datekey: string (nullable = true)
 |-- Year: string (nullable = true)
 |-- Month: string (nullable = true)
 |-- Day: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- Confirmed: string (nullable = true)
 |-- Deaths: string (nullable = true)
 |-- Recovered: string (nullable = true)
 |-- Active: string (nullable = true)
 |-- Incidence_Rate: double (nullable = false)
 |-- CaseFatalityRatio: double (nullable = true)
 |-- Combined_Key: string (nullable = true)



In [22]:
# output_schema = StructType() \
#       .add("State",StringType(),True) \
#       .add("Country",StringType(),True) \
#       .add("Date",DateType(),True) \
#       .add("Latitude",DoubleType(),True) \
#       .add("Longitude",DoubleType(),True) \
#       .add("Confirmed",IntegerType(),True) \
#       .add("Deaths",IntegerType(),True) \
#       .add("Recovered",IntegerType(),True) \
#       .add("Active",IntegerType(),True) \
#       .add("IncidentRate",DoubleType(),True) \
#       .add("CaseFatalityRatio",DoubleType(),True) \
#       .add("CombinedKey",StringType(),True)

In [23]:
# df2.write \
#     .mode("overwrite") \
#     .partitionBy("Year", "Month", "Day", "Country") \
#     .parquet('out/curated/')

In [24]:
# Write the result as headerless, ";"-delimited text files under out/landed,
# replacing whatever a previous run left there.
# "csv" is Spark's built-in CSV source; it replaces the legacy
# "com.databricks.spark.csv" alias of the old external package.
# NOTE(review): quote and escape are set to empty strings, apparently to keep
# values from being wrapped in quotes; a field containing ";" or a newline
# would then not be protected — confirm the consumer tolerates that.
(
    df2.write
    .format("csv")
    .mode("overwrite")
    .option("header", False)
    .option("escape", "")
    .option("quote", "")
    .option("emptyValue", "")
    .option("delimiter", ";")
    .save("out/landed")
)