In [2]:
%run utilities.ipynb

In [8]:
# IMPORT DFS

df_cases_deaths = spark.read.csv("Datasets/cases_deaths.csv", header=True, inferSchema=True)
df_country_lookup = spark.read.csv("Datasets/country_lookup.csv", header=True, inferSchema=True) 

In [9]:
df_cases_deaths.show()

+-----------+------------+---------+----------+---------------+-----------+----------+-----------+--------------------+
|    country|country_code|continent|population|      indicator|daily_count|      date|rate_14_day|              source|
+-----------+------------+---------+----------+---------------+-----------+----------+-----------+--------------------+
|Afghanistan|         AFG|     Asia|  38928341|confirmed cases|          0|2020-01-02|       NULL|Epidemic intellig...|
|Afghanistan|         AFG|     Asia|  38928341|confirmed cases|          0|2020-01-03|       NULL|Epidemic intellig...|
|Afghanistan|         AFG|     Asia|  38928341|confirmed cases|          0|2020-01-04|       NULL|Epidemic intellig...|
|Afghanistan|         AFG|     Asia|  38928341|confirmed cases|          0|2020-01-05|       NULL|Epidemic intellig...|
|Afghanistan|         AFG|     Asia|  38928341|confirmed cases|          0|2020-01-06|       NULL|Epidemic intellig...|
|Afghanistan|         AFG|     Asia|  38

In [10]:
df_cases_deaths.printSchema()

root
 |-- country: string (nullable = true)
 |-- country_code: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- population: long (nullable = true)
 |-- indicator: string (nullable = true)
 |-- daily_count: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- rate_14_day: double (nullable = true)
 |-- source: string (nullable = true)



In [11]:
df_cases_deaths.dtypes

[('country', 'string'),
 ('country_code', 'string'),
 ('continent', 'string'),
 ('population', 'bigint'),
 ('indicator', 'string'),
 ('daily_count', 'int'),
 ('date', 'date'),
 ('rate_14_day', 'double'),
 ('source', 'string')]

In [12]:
df_cases_deaths.describe()

DataFrame[summary: string, country: string, country_code: string, continent: string, population: string, indicator: string, daily_count: string, rate_14_day: string, source: string]

In [13]:
df_cases_deaths.head()

Row(country='Afghanistan', country_code='AFG', continent='Asia', population=38928341, indicator='confirmed cases', daily_count=0, date=datetime.date(2020, 1, 2), rate_14_day=None, source='Epidemic intelligence, national daily data')

In [14]:
### FILTER DATAFRAME ###

df_cases_deaths = (
    df_cases_deaths
    .where(col('continent') == 'Europe')
    .drop('continent', 'rate_14_day', 'source')
)

df_cases_deaths.show()

+-------+------------+----------+---------------+-----------+----------+
|country|country_code|population|      indicator|daily_count|      date|
+-------+------------+----------+---------------+-----------+----------+
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-02|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-03|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-04|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-05|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-06|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-07|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-08|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-09|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-10|
|Albania|         ALB|   2862427|confirmed cases|          0|2020-01-11|
|Albania|         ALB|   2862427|confirmed cases|  

In [15]:
pivoted_df_cases_and_death = (
    df_cases_deaths
    .groupBy('country', 'country_code', 'population', 'date')
    .pivot('indicator')
    .agg({'daily_count':'sum'})
    .orderBy('country', 'country_code', 'date')
    .fillna(0)
)

In [16]:
pivoted_df_cases_and_death.show()

+-------+------------+----------+----------+---------------+------+
|country|country_code|population|      date|confirmed cases|deaths|
+-------+------------+----------+----------+---------------+------+
|Albania|         ALB|   2862427|2020-01-02|              0|     0|
|Albania|         ALB|   2862427|2020-01-03|              0|     0|
|Albania|         ALB|   2862427|2020-01-04|              0|     0|
|Albania|         ALB|   2862427|2020-01-05|              0|     0|
|Albania|         ALB|   2862427|2020-01-06|              0|     0|
|Albania|         ALB|   2862427|2020-01-07|              0|     0|
|Albania|         ALB|   2862427|2020-01-08|              0|     0|
|Albania|         ALB|   2862427|2020-01-09|              0|     0|
|Albania|         ALB|   2862427|2020-01-10|              0|     0|
|Albania|         ALB|   2862427|2020-01-11|              0|     0|
|Albania|         ALB|   2862427|2020-01-12|              0|     0|
|Albania|         ALB|   2862427|2020-01-13|    

In [17]:
df_country_lookup.head()

Row(country='Aruba', country_code_2_digit='AW', country_code_3_digit='ABW', continent='America', population=106766)

In [18]:
condition = [pivoted_df_cases_and_death.country == df_country_lookup.country]

fields = [
    pivoted_df_cases_and_death['country']
    ,df_country_lookup['country_code_3_digit']
    ,df_country_lookup['country_code_2_digit']
    ,pivoted_df_cases_and_death['population']
    ,'cases_count'
    ,'deaths_count'
    ,'reported_date'
]

df_cases_and_death = (
    pivoted_df_cases_and_death
    .join(df_country_lookup, on=condition, how='left')
    .where(col('continent') == 'Europe')
    .withColumnRenamed('confirmed cases', 'cases_count')
    .withColumnRenamed('deaths', 'deaths_count')
    .withColumnRenamed('date', 'reported_date')
    .orderBy(pivoted_df_cases_and_death['country'])
    .select(fields) # projection
)

In [19]:
df_cases_and_death.show()

+-------+--------------------+--------------------+----------+-----------+------------+-------------+
|country|country_code_3_digit|country_code_2_digit|population|cases_count|deaths_count|reported_date|
+-------+--------------------+--------------------+----------+-----------+------------+-------------+
|Albania|                 ALB|                  AL|   2862427|         15|           0|   2020-05-21|
|Albania|                 ALB|                  AL|   2862427|          0|           0|   2020-01-11|
|Albania|                 ALB|                  AL|   2862427|         77|           2|   2020-06-27|
|Albania|                 ALB|                  AL|   2862427|          0|           0|   2020-01-03|
|Albania|                 ALB|                  AL|   2862427|          0|           0|   2020-03-03|
|Albania|                 ALB|                  AL|   2862427|          4|           0|   2020-03-18|
|Albania|                 ALB|                  AL|   2862427|         14|        

In [21]:
df_cases_and_death.head(5)

[Row(country='Albania', country_code_3_digit='ALB', country_code_2_digit='AL', population=2862427, cases_count=77, deaths_count=2, reported_date=datetime.date(2020, 6, 27)),
 Row(country='Albania', country_code_3_digit='ALB', country_code_2_digit='AL', population=2862427, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 3, 3)),
 Row(country='Albania', country_code_3_digit='ALB', country_code_2_digit='AL', population=2862427, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 1, 3)),
 Row(country='Albania', country_code_3_digit='ALB', country_code_2_digit='AL', population=2862427, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 1, 9)),
 Row(country='Albania', country_code_3_digit='ALB', country_code_2_digit='AL', population=2862427, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 1, 26))]

In [22]:
df_cases_and_death.tail(5)

[Row(country='the Holy See/ Vatican City State', country_code_3_digit='VAT', country_code_2_digit='VA', population=809, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 7, 13)),
 Row(country='the Holy See/ Vatican City State', country_code_3_digit='VAT', country_code_2_digit='VA', population=809, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 7, 30)),
 Row(country='the Holy See/ Vatican City State', country_code_3_digit='VAT', country_code_2_digit='VA', population=809, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 6, 10)),
 Row(country='the Holy See/ Vatican City State', country_code_3_digit='VAT', country_code_2_digit='VA', population=809, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 3, 17)),
 Row(country='the Holy See/ Vatican City State', country_code_3_digit='VAT', country_code_2_digit='VA', population=809, cases_count=0, deaths_count=0, reported_date=datetime.date(2020, 8, 30))]