In [2]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession when one exists; otherwise start a new one
# registered under the application name 'rheeza'.
spark = (
    SparkSession.builder
    .appName('rheeza')
    .getOrCreate()
)

## Understanding the dataset

In [5]:
# Load the raw trial records. multiLine=True allows a JSON record to span
# several lines instead of requiring one record per line.
trials_df = spark.read.json(
    path='dataset.json',
    multiLine=True,
)

In [9]:
# Preview the first four records; nested structs are printed truncated.
trials_df.show(n=4)

+----------------+--------------------+---------+-----------------+-------------------+------------------------------+--------------------+
|ageofparticipant|           clinician|drug_used|experimentenddate|experimentstartdate|noofhourspassedatfirstreaction|              result|
+----------------+--------------------+---------+-----------------+-------------------+------------------------------+--------------------+
|              19|{Ontario, Saul, t...|  Placebo|    1619827200000|      1617235200000|                            52|{BP normalized, r...|
|              14|{Ontario, Saul, n...| Naproxen|    1619827200000|      1617235200000|                            78|    {Follow-up, N/A}|
|              17|{Ontario, Saul, n...|  Placebo|    1619827200000|      1617235200000|                            14|    {Follow-up, N/A}|
|              18|{Ontario, Will, n...| Naproxen|    1619827200000|      1617235200000|                            14|{BP normalized, N/A}|
+----------------+--

In [7]:
# Print the inferred schema as a tree, including the fields nested inside the
# clinician and result structs.
trials_df.printSchema()

root
 |-- ageofparticipant: long (nullable = true)
 |-- clinician: struct (nullable = true)
 |    |-- branch: string (nullable = true)
 |    |-- name: string (nullable = true)
 |    |-- role: string (nullable = true)
 |-- drug_used: string (nullable = true)
 |-- experimentenddate: string (nullable = true)
 |-- experimentstartdate: string (nullable = true)
 |-- noofhourspassedatfirstreaction: long (nullable = true)
 |-- result: struct (nullable = true)
 |    |-- conclusion: string (nullable = true)
 |    |-- sideeffectsonparticipant: string (nullable = true)



## To convert the data into a DataFrame readable for clinicians

In [11]:
# Top-level column names only; fields nested inside the structs are not listed.
trials_df.columns

['ageofparticipant',
 'clinician',
 'drug_used',
 'experimentenddate',
 'experimentstartdate',
 'noofhourspassedatfirstreaction',
 'result']

## To flatten the JSON file, i.e. to convert the nested fields into separate columns.


In [12]:
# Flattened selection list: nested struct fields are addressed with dotted
# paths so that each one becomes its own column when selected.
columns = (
    # participant
    ['ageofparticipant']
    # fields of the clinician struct
    + ['clinician.branch', 'clinician.name', 'clinician.role']
    # experiment details
    + [
        'drug_used',
        'experimentenddate',
        'experimentstartdate',
        'noofhourspassedatfirstreaction',
    ]
    # fields of the result struct
    + ['result.conclusion', 'result.sideeffectsonparticipant']
)

In [13]:
# Preview the first five records with the nested fields pulled out into
# their own columns.
trials_df.select(*columns).show(n=5)

+----------------+-------+-------+---------+---------+-----------------+-------------------+------------------------------+-------------+------------------------+
|ageofparticipant| branch|   name|     role|drug_used|experimentenddate|experimentstartdate|noofhourspassedatfirstreaction|   conclusion|sideeffectsonparticipant|
+----------------+-------+-------+---------+---------+-----------------+-------------------+------------------------------+-------------+------------------------+
|              19|Ontario|   Saul|therapist|  Placebo|    1619827200000|      1617235200000|                            52|BP normalized|          rashes on neck|
|              14|Ontario|   Saul|    nurse| Naproxen|    1619827200000|      1617235200000|                            78|    Follow-up|                     N/A|
|              17|Ontario|   Saul|    nurse|  Placebo|    1619827200000|      1617235200000|                            14|    Follow-up|                     N/A|
|              18|Onta

In [23]:
# Selecting a dotted path keeps only the leaf name as the column name,
# e.g. 'clinician.branch' comes out as 'branch'.
trials_df.select(*columns).columns

['ageofparticipant',
 'branch',
 'name',
 'role',
 'drug_used',
 'experimentenddate',
 'experimentstartdate',
 'noofhourspassedatfirstreaction',
 'conclusion',
 'sideeffectsonparticipant']

## Now we have a flattened DataFrame.

In [14]:
from pyspark.sql import functions as fn

In [19]:
# Count the null values in every selected field.
# when() without otherwise() yields null for rows that fail the condition and
# count() skips nulls, so each aggregate is the number of rows where the field
# is null.
# NOTE(review): placeholder strings such as 'N/A' are not nulls and are
# therefore not counted here.
trials_df.select(
    [
        fn.count(fn.when(fn.col(field).isNull(), field)).alias(field)
        for field in columns
    ]
).show()

+----------------+----------------+--------------+--------------+---------+-----------------+-------------------+------------------------------+-----------------+-------------------------------+
|ageofparticipant|clinician.branch|clinician.name|clinician.role|drug_used|experimentenddate|experimentstartdate|noofhourspassedatfirstreaction|result.conclusion|result.sideeffectsonparticipant|
+----------------+----------------+--------------+--------------+---------+-----------------+-------------------+------------------------------+-----------------+-------------------------------+
|               0|               0|             0|           109|        0|                0|                  0|                            73|               53|                              0|
+----------------+----------------+--------------+--------------+---------+-----------------+-------------------+------------------------------+-----------------+-------------------------------+



## Now we do data cleaning

In [20]:
# Keep the flattened projection as the working frame for the cleaning steps.
new_trials_df = trials_df.select(*columns)

In [21]:
# Confirm the flattened frame has no struct columns left; every field is now
# a top-level column.
new_trials_df.printSchema()

root
 |-- ageofparticipant: long (nullable = true)
 |-- branch: string (nullable = true)
 |-- name: string (nullable = true)
 |-- role: string (nullable = true)
 |-- drug_used: string (nullable = true)
 |-- experimentenddate: string (nullable = true)
 |-- experimentstartdate: string (nullable = true)
 |-- noofhourspassedatfirstreaction: long (nullable = true)
 |-- conclusion: string (nullable = true)
 |-- sideeffectsonparticipant: string (nullable = true)



## Renaming columns

In [26]:
# Mapping from the current (flattened) column names to readable snake_case
# names, listed in column order.
new_column_names = dict([
    ('ageofparticipant', 'age_of_participant'),
    ('branch', 'clinic_branch'),
    ('name', 'clinician_name'),
    ('role', 'assistant_role'),
    ('drug_used', 'drug'),
    ('experimentenddate', 'experiment_end_date'),
    ('experimentstartdate', 'experiment_start_date'),
    ('noofhourspassedatfirstreaction', 'no_of_hours_passed_at_first_reaction'),
    ('conclusion', 'conclusion'),
    ('sideeffectsonparticipant', 'side_effects'),
])

In [30]:
# Apply every rename in a single call: keys are the current column names,
# values the new ones.
# DataFrame.withColumnsRenamed was added in Spark 3.4; older runtimes need
# chained withColumnRenamed calls instead.
new_trials = new_trials_df.withColumnsRenamed(new_column_names)

## Handling Null Values

In [31]:
# Preview the renamed frame; show() with no arguments prints up to 20 rows.
new_trials.show()

+------------------+-------------+--------------+--------------+--------+-------------------+---------------------+------------------------------------+-------------+--------------+
|age_of_participant|clinic_branch|clinician_name|assistant_role|    drug|experiment_end_date|experiment_start_date|no_of_hours_passed_at_first_reaction|   conclusion|  side_effects|
+------------------+-------------+--------------+--------------+--------+-------------------+---------------------+------------------------------------+-------------+--------------+
|                19|      Ontario|          Saul|     therapist| Placebo|      1619827200000|        1617235200000|                                  52|BP normalized|rashes on neck|
|                14|      Ontario|          Saul|         nurse|Naproxen|      1619827200000|        1617235200000|                                  78|    Follow-up|           N/A|
|                17|      Ontario|          Saul|         nurse| Placebo|      16198272000

In [32]:
# Replace missing values in the two text columns with the placeholder
# 'unknown'.
# NOTE(review): no_of_hours_passed_at_first_reaction also had nulls in the
# null count above and is left as-is here — confirm that is intended.
new_trials = new_trials.fillna({
    'conclusion': 'unknown',
    'assistant_role': 'unknown',
})

In [37]:
# describe() only builds the summary-statistics DataFrame, so as the last
# expression of the cell it renders as the bare DataFrame[...] repr.
# NOTE(review): chain .show() to print the statistics themselves.
new_trials.describe()

DataFrame[summary: string, age_of_participant: string, clinic_branch: string, clinician_name: string, assistant_role: string, drug: string, experiment_end_date: string, experiment_start_date: string, no_of_hours_passed_at_first_reaction: string, conclusion: string, side_effects: string]