In [2]:
# Google Drive is mounted manually (sometimes the import code doesn't work for me)
# the data set (csv) is downloaded and uploaded to the Drive

# load packages
import pandas as pd
pd.set_option("display.max_columns", None) # to display all columns of the dataframe
import numpy as np
import plotly.express as px


In [3]:
# Read the 2022 outbreak report (CSV) from the mounted Google Drive.
outbreak_2022 = pd.read_csv('/content/drive/MyDrive/ob_report_2022.csv')

# Preview the first rows of the data set.
# A notebook cell only renders its *last* bare expression, so a bare `.head()`
# in the middle of the cell shows nothing; display() renders it explicitly.
# (display is provided automatically by the IPython/Colab kernel.)
display(outbreak_2022.head())

# Unique categories of the outbreak setting column (rendered as the cell result).
outbreak_2022['Outbreak Setting'].unique()

array(['LTCH', 'Retirement Home', 'Hospital-Chronic Care',
       'Hospital-Acute Care', 'Hospital-Psychiatric', 'Transitional Care'],
      dtype=object)

In [4]:
# Lookup table that labels every outbreak setting as 'Hospital' or 'Non-hospital'.
# reference: https://colab.research.google.com/github/google/eng-edu/blob/main/ml/cc/exercises/pandas_dataframe_ultraquick_tutorial.ipynb#scrollTo=FNZsPOgSD4F2

# Setting -> category pairs, listed in the order the rows appear in the table.
setting_to_category = {
    'LTCH': 'Non-hospital',
    'Retirement Home': 'Non-hospital',
    'Hospital-Chronic Care': 'Hospital',
    'Hospital-Acute Care': 'Hospital',
    'Hospital-Psychiatric': 'Hospital',
    'Transitional Care': 'Non-hospital',
}

# 6x2 NumPy array of strings: one (setting, category) row per mapping entry.
my_data = np.array(list(setting_to_category.items()))

# Names of the two columns, in the same order as the array's columns.
my_column_names = ['Outbreak locations', 'category']

# Wrap the array in a DataFrame and show the whole (small) table.
category_table = pd.DataFrame(my_data, columns=my_column_names)
print(category_table)

      Outbreak locations      category
0                   LTCH  Non-hospital
1        Retirement Home  Non-hospital
2  Hospital-Chronic Care      Hospital
3    Hospital-Acute Care      Hospital
4   Hospital-Psychiatric      Hospital
5      Transitional Care  Non-hospital


In [6]:
# Attach the 'Hospital' / 'Non-hospital' category to every outbreak row.
# The left join keeps every row of the report. validate='many_to_one' makes
# pandas raise a MergeError if category_table lists a setting more than once,
# which would otherwise duplicate outbreak rows and inflate the counts below.
outbreak_2022_category = pd.merge(outbreak_2022,
                                  category_table,
                                  how='left',
                                  left_on='Outbreak Setting',
                                  right_on='Outbreak locations',
                                  validate='many_to_one')

# Only the last bare expression of a cell is rendered, so the preview is sent
# through display() (provided by the IPython/Colab kernel) to make it visible.
display(outbreak_2022_category.head())


def count_causative_agents(agent_rows, count_name):
    """Count how often each 'Causative Agent-1' value occurs in ``agent_rows``.

    Parameters
    ----------
    agent_rows : pandas.DataFrame
        Frame that contains a 'Causative Agent-1' column.
    count_name : str
        Name given to the column that holds the counts.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'types' (the agent) and ``count_name`` (its count),
        ordered from most to least frequent. Missing agents are not counted
        because ``value_counts`` skips NaN by default.
    """
    return (agent_rows['Causative Agent-1']
            .value_counts()
            .rename_axis('types')
            .reset_index(name=count_name))


# to count the number of cases of causative agents in non-hospital setting
non_hospital = outbreak_2022_category.loc[outbreak_2022_category['category'] == 'Non-hospital', ['Causative Agent-1']]
value_non_h = count_causative_agents(non_hospital, 'Non-hospital cases')
print(value_non_h)

# to count the number of cases of causative agents in hospital setting
hospital = outbreak_2022_category.loc[outbreak_2022_category['category'] == 'Hospital', ['Causative Agent-1']]
value_h = count_causative_agents(hospital, 'Hospital cases')
print(value_h)

# to merge the counts
# An outer join keeps agents that occur in only ONE of the two settings; a left
# join from the hospital counts would silently drop every agent that was only
# reported in non-hospital settings. An agent absent from one side gets NaN
# there, so those gaps are filled with 0 and the counts cast back to integers.
count_columns = ['Hospital cases', 'Non-hospital cases']
value_merge = (
    pd.merge(value_h, value_non_h, how='outer', on='types')
    .fillna({column: 0 for column in count_columns})
    .astype({column: int for column in count_columns})
    # Deterministic order: most hospital cases first, ties by non-hospital cases.
    .sort_values(count_columns, ascending=False)
    .reset_index(drop=True)
)
print(value_merge)


                          types  Non-hospital cases
0                      COVID-19                 548
1   Respiratory syncytial virus                  26
2            Unable to identify                  24
3    Influenza A (Not subtyped)                  23
4                    Rhinovirus                  14
5                  Coronavirus*                  13
6                Norovirus-like                  12
7              Influenza A (H3)                  12
8        Enterovirus/Rhinovirus                   9
9               Metapneumovirus                   6
10                Parainfluenza                   2
11           Influenza A (H3N2)                   2
                         types  Hospital cases
0                     COVID-19             426
1  Respiratory syncytial virus               4
2             Influenza A (H3)               4
3   Influenza A (Not subtyped)               3
4                   Rhinovirus               2
5        CPE Unspecified (NDM)            

In [10]:
# Interactive bar graph comparing the cases of the different causative agents:
# agents on the y-axis, one bar series per setting on the x-axis.
axis_labels = {'variable': 'Setting',
               'types': 'Causative agent',
               'value': 'Cases'}
setting_colours = ['#1A85FF', '#D41159']  # one colour per setting series

bar_fig = px.bar(
    value_merge,
    x=['Hospital cases', 'Non-hospital cases'],
    y='types',
    hover_name='types',
    labels=axis_labels,
    title='Outbreaks in Hospital vs. Non-hospital Settings 2022',
    color_discrete_sequence=setting_colours,
)

# Use Arial throughout to improve accessibility, and place the title at x = 0.5.
arial = 'Arial'
bar_fig.update_layout(font_family=arial,
                      title_font_family=arial,
                      title=dict(x=0.5))
bar_fig.update_xaxes(title_font_family=arial)
bar_fig.show()

# Export to my Drive as HTML so the figure stays interactive.
html_path = "/content/drive/MyDrive/Visualization_by_Python.html"
bar_fig.write_html(html_path)