In [2]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField,StringType
from pyspark.sql.types import StructType,StructField, StringType, IntegerType 
from pyspark.sql.types import ArrayType, DoubleType, BooleanType, DateType
from pyspark.sql.functions import col, array_contains, when, concat_ws, to_date, lit
from pyspark.sql.window import Window
import pyspark.sql.functions as F

In [3]:
spark = SparkSession \
        .builder \
        .appName("Spark Covid Earlier Transformations") \
        .getOrCreate()

In [4]:
# Load dataset from Johns Hopkins University Github page
# with command pd.read_csv("datafile.csv")
#covid_daily_report = pd.read_csv("data/01-23-2020.csv")
covid_daily_report = spark.read \
                    .option("header",True) \
                    .option("inferSchema",True) \
                    .csv("data/03-24-2020.csv")


#                    

In [5]:
#covid_daily_report.createOrReplaceTempView("cases_table")

In [6]:
covid_daily_report.printSchema()

root
 |-- FIPS: integer (nullable = true)
 |-- Admin2: string (nullable = true)
 |-- Province_State: string (nullable = true)
 |-- Country_Region: string (nullable = true)
 |-- Last_Update: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long_: double (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- Combined_Key: string (nullable = true)



### JHU changed the dataset schema during the time
*Initial schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered*

*Second schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key*

*Third schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude*

*Forth schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio*

In [7]:
# spark.sql("SELECT COUNT(DISTINCT 'Country/Region') AS Country, COUNT(DISTINCT 'Province/State') AS State \
#             FROM cases_table").show()

In [8]:
print((str(covid_daily_report.columns)))
type(covid_daily_report)
covid_daily_report.head(5)


['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key']


[Row(FIPS=45001, Admin2='Abbeville', Province_State='South Carolina', Country_Region='US', Last_Update='2020-03-24 23:37:31', Lat=34.22333378, Long_=-82.46170658, Confirmed=1, Deaths=0, Recovered=0, Active=0, Combined_Key='Abbeville, South Carolina, US'),
 Row(FIPS=22001, Admin2='Acadia', Province_State='Louisiana', Country_Region='US', Last_Update='2020-03-24 23:37:31', Lat=30.295064899999996, Long_=-92.41419698, Confirmed=2, Deaths=0, Recovered=0, Active=0, Combined_Key='Acadia, Louisiana, US'),
 Row(FIPS=51001, Admin2='Accomack', Province_State='Virginia', Country_Region='US', Last_Update='2020-03-24 23:37:31', Lat=37.76707161, Long_=-75.63234615, Confirmed=1, Deaths=0, Recovered=0, Active=0, Combined_Key='Accomack, Virginia, US'),
 Row(FIPS=16001, Admin2='Ada', Province_State='Idaho', Country_Region='US', Last_Update='2020-03-24 23:37:31', Lat=43.4526575, Long_=-116.24155159999998, Confirmed=19, Deaths=0, Recovered=0, Active=0, Combined_Key='Ada, Idaho, US'),
 Row(FIPS=19001, Admin

In [9]:
df2=covid_daily_report
df2= df2 \
        .withColumn("Country_Region", 
        when(df2["Country_Region"].contains("China"),"China")
        .otherwise(df2["Country_Region"]))

In [10]:
df2= df2 \
        .withColumn("Country_Region", 
        when(df2["Country_Region"].contains("Republic of Korea"),"Korea, South")
        .otherwise(df2["Country_Region"]))

In [11]:
df2= df2 \
        .withColumn("Combined_Key", 
        when(df2["Country_Region"].isNull(),
        concat_ws(", ", df2["Province_State"], df2["Country_Region"]))
        .otherwise(df2["Combined_Key"]))

In [12]:
df2= df2 \
        .withColumn("CaseFatalityRatio", (df2.Deaths/df2.Confirmed)*100)

In [13]:
if not (StructField("Incidence_Rate",StringType(),True) in df2.schema):
    df2= df2 \
        .withColumn("Incidence_Rate", 
        lit(""))

In [14]:
df2= df2 \
        .withColumn("Last_Update",
        to_date(col("Last_Update"),"MM-dd-yyyy")
                    )

In [15]:
df2.head(100)

[Row(FIPS=45001, Admin2='Abbeville', Province_State='South Carolina', Country_Region='US', Last_Update=None, Lat=34.22333378, Long_=-82.46170658, Confirmed=1, Deaths=0, Recovered=0, Active=0, Combined_Key='Abbeville, South Carolina, US', CaseFatalityRatio=0.0, Incidence_Rate=''),
 Row(FIPS=22001, Admin2='Acadia', Province_State='Louisiana', Country_Region='US', Last_Update=None, Lat=30.295064899999996, Long_=-92.41419698, Confirmed=2, Deaths=0, Recovered=0, Active=0, Combined_Key='Acadia, Louisiana, US', CaseFatalityRatio=0.0, Incidence_Rate=''),
 Row(FIPS=51001, Admin2='Accomack', Province_State='Virginia', Country_Region='US', Last_Update=None, Lat=37.76707161, Long_=-75.63234615, Confirmed=1, Deaths=0, Recovered=0, Active=0, Combined_Key='Accomack, Virginia, US', CaseFatalityRatio=0.0, Incidence_Rate=''),
 Row(FIPS=16001, Admin2='Ada', Province_State='Idaho', Country_Region='US', Last_Update=None, Lat=43.4526575, Long_=-116.24155159999998, Confirmed=19, Deaths=0, Recovered=0, Active

In [16]:
df2.printSchema()

root
 |-- FIPS: integer (nullable = true)
 |-- Admin2: string (nullable = true)
 |-- Province_State: string (nullable = true)
 |-- Country_Region: string (nullable = true)
 |-- Last_Update: date (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long_: double (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- Combined_Key: string (nullable = true)
 |-- CaseFatalityRatio: double (nullable = true)
 |-- Incidence_Rate: string (nullable = false)



In [17]:
df2 = df2.select(col('Province_State').alias('State'), \
                         col('Country_Region').alias('Country'), \
                         col('Last_Update').alias('Date'), \
                         col('Lat').alias('Latitude'), \
                         col('Long_').alias('Longitude'), \
                         col('Confirmed'), \
                         col('Deaths'), \
                         col('Recovered'), \
                         col('Active'), \
                         col('Incidence_Rate'), \
                         col('CaseFatalityRatio'), \
                         col('Combined_Key'))

In [19]:
window = Window.partitionBy("Country","State").orderBy("Date")

data = F.col("Confirmed") - F.coalesce(F.lag(F.col("Confirmed")).over(window), F.lit(0))
df2.withColumn("newConfirmed", data)

DataFrame[State: string, Country: string, Date: date, Latitude: double, Longitude: double, Confirmed: int, Deaths: int, Recovered: int, Active: int, Incidence_Rate: string, CaseFatalityRatio: double, Combined_Key: string, newConfirmed: int]

In [24]:
data

Column<'(Confirmed - coalesce(lag(Confirmed, 1, NULL) OVER (PARTITION BY Country, State ORDER BY Date ASC NULLS FIRST unspecifiedframe$()), 0))'>

In [None]:
output_schema = StructType() \
      .add("State",StringType(),True) \
      .add("Country",StringType(),True) \
      .add("Date",DateType(),True) \
      .add("Latitude",DoubleType(),True) \
      .add("Longitude",DoubleType(),True) \
      .add("Confirmed",IntegerType(),True) \
      .add("Deaths",IntegerType(),True) \
      .add("Recovered",IntegerType(),True) \
      .add("Active",IntegerType(),True) \
      .add("IncidentRate",DoubleType(),True) \
      .add("CaseFatalityRatio",DoubleType(),True) \
      .add("CombinedKey",StringType(),True)

In [None]:
output_data.head(10)

In [None]:
df2.write \
    .format("com.databricks.spark.csv") \
    .option("header",True) \
    .option("escape", "") \
    .option("quote", "") \
    .option("emptyValue", "") \
    .option("delimiter", ";") \
    .save("out/spark_output")

In [None]:
df2.dtypes()