In [47]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import when

In [23]:
spark = SparkSession \
        .builder \
        .appName("Spark Covid Earlier Transformations") \
        .getOrCreate()

In [40]:
# Load dataset from Johns Hopkins University Github page
# with command pd.read_csv("datafile.csv")
#covid_daily_report = pd.read_csv("data/01-23-2020.csv")
covid_daily_report = spark.read.option("inferSchema",True) \
                    .option("header",True) \
                    .csv("data/01-23-2021.csv")

In [41]:
covid_daily_report.createOrReplaceTempView("cases_table")

In [42]:
covid_daily_report.printSchema()

root
 |-- FIPS: integer (nullable = true)
 |-- Admin2: string (nullable = true)
 |-- Province_State: string (nullable = true)
 |-- Country_Region: string (nullable = true)
 |-- Last_Update: string (nullable = true)
 |-- Lat: double (nullable = true)
 |-- Long_: double (nullable = true)
 |-- Confirmed: integer (nullable = true)
 |-- Deaths: integer (nullable = true)
 |-- Recovered: integer (nullable = true)
 |-- Active: integer (nullable = true)
 |-- Combined_Key: string (nullable = true)
 |-- Incident_Rate: double (nullable = true)
 |-- Case_Fatality_Ratio: double (nullable = true)



### JHU changed the dataset schema during the time
*Initial schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered*

*Second schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key*

*Third schema: Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered,Latitude,Longitude*

*Forth schema: FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio*

In [32]:
spark.sql("SELECT COUNT(DISTINCT 'Country/Region') AS Country, COUNT(DISTINCT 'Province/State') AS State \
            FROM cases_table").show()

+-------+-----+
|Country|State|
+-------+-----+
|      1|    1|
+-------+-----+



In [46]:
print((str(covid_daily_report.columns)))
type(covid_daily_report)


['FIPS', 'Admin2', 'Province_State', 'Country_Region', 'Last_Update', 'Lat', 'Long_', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Combined_Key', 'Incident_Rate', 'Case_Fatality_Ratio']


pyspark.sql.dataframe.DataFrame

In [55]:
df2= covid_daily_report.withColumn("Country_Region", when(
                covid_daily_report["Country_Region"].contains("China"),
                "China").otherwise(covid_daily_report["Country_Region"]))

In [58]:
df2.head(10)

[Row(FIPS=None, Admin2=None, Province_State=None, Country_Region='Afghanistan', Last_Update='2021-01-24 05:22:35', Lat=33.93911, Long_=67.709953, Confirmed=54559, Deaths=2373, Recovered=46943, Active=5243, Combined_Key='Afghanistan', Incident_Rate=140.15238923230766, Case_Fatality_Ratio=4.349419894059642),
 Row(FIPS=None, Admin2=None, Province_State=None, Country_Region='Albania', Last_Update='2021-01-24 05:22:35', Lat=41.1533, Long_=20.1683, Confirmed=71441, Deaths=1310, Recovered=43384, Active=26747, Combined_Key='Albania', Incident_Rate=2482.4866217249287, Case_Fatality_Ratio=1.833680939516524),
 Row(FIPS=None, Admin2=None, Province_State=None, Country_Region='Algeria', Last_Update='2021-01-24 05:22:35', Lat=28.0339, Long_=1.6596, Confirmed=105369, Deaths=2861, Recovered=71755, Active=30753, Combined_Key='Algeria', Incident_Rate=240.28846930733206, Case_Fatality_Ratio=2.7152198464444),
 Row(FIPS=None, Admin2=None, Province_State=None, Country_Region='Andorra', Last_Update='2021-01-2