# <font color='mediumblue'>EDA Novel Corona Virus 2019 Dataset</font>

The goal of this simple project is to perform EDA on all the datasets provided by https://www.kaggle.com/sudalairajkumar/novel-corona-virus-2019-dataset, and to draw some insights regarding data quality and data visualization. This is a work in progress, so I will change it frequently; if you have any constructive criticism, just go ahead and share it!

**To run this notebook, please click 'run' --> 'run all'**

<font color='blue'>Imports</font>

In [None]:
#General imports
import pandas as pd
from IPython.core.display import display, HTML
# ipyaggrid
from ipyaggrid import Grid
import datetime

In [None]:
# visualizations
import plotly
import plotly.figure_factory as ff
plotly.offline.init_notebook_mode()
import plotly.graph_objects as go
import plotly.express as px
import chart_studio.plotly as py
from plotly.subplots import make_subplots

In [None]:
# imports ipywidgets
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from ipywidgets import Layout
from ipywidgets import TwoByTwoLayout

In [None]:
import os
import warnings

# SECURITY: Kaggle credentials must never be written into the notebook.
# Provide them outside the code instead, either through the environment
# variables KAGGLE_USERNAME / KAGGLE_KEY or through ~/.kaggle/kaggle.json.
# The API key that used to be hard-coded in this cell has been published with
# the notebook and should be revoked and regenerated on kaggle.com.
_kaggle_json = os.path.join(os.path.expanduser('~'), '.kaggle', 'kaggle.json')
_missing_kaggle_vars = [
    name for name in ('KAGGLE_USERNAME', 'KAGGLE_KEY') if not os.environ.get(name)
]
if _missing_kaggle_vars and not os.path.isfile(_kaggle_json):
    # Warn instead of raising so the remaining cells can still be inspected;
    # the download cell will fail until credentials are available.
    warnings.warn(
        'Kaggle credentials not found: set ' + ', '.join(_missing_kaggle_vars)
        + ' in the environment or create ' + _kaggle_json
    )
#os.environ['KAGGLE_PROXY'] = '<proxy-address>' ## skip this step if you are not working behind a firewall

In [None]:
import kaggle as kg

In [None]:
# Authenticate against the Kaggle API, then fetch and unpack the dataset.
# Despite its name, 'gt.zip' is used as the target *directory*: the CSV files
# are read from inside it further down.
dataset_slug = "sudalairajkumar/novel-corona-virus-2019-dataset"
download_dir = 'gt.zip'

kg.api.authenticate()
kg.api.dataset_download_files(dataset=dataset_slug, path=download_dir, unzip=True)

<font color='blue'>Get datasets</font>

In [None]:
def _read_csv_tolerant(path, **kwargs):
    """Read a CSV file, skipping malformed rows with a warning.

    ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0; its
    replacement is ``on_bad_lines``. ``on_bad_lines='warn'`` matches the old
    ``error_bad_lines=False`` behaviour (bad rows are dropped and reported).
    The old keyword is only used as a fallback on pandas versions that do not
    know ``on_bad_lines`` yet.
    """
    try:
        return pd.read_csv(path, on_bad_lines='warn', **kwargs)
    except TypeError:
        # pandas < 1.3: unknown keyword argument 'on_bad_lines'
        return pd.read_csv(path, error_bad_lines=False, **kwargs)


# Main per-day case table (read strictly, default integer index)
covid19_data = pd.read_csv('gt.zip/covid_19_data.csv')
# Remaining tables: first CSV column becomes the index, malformed rows are skipped
covid19_line_list_data = _read_csv_tolerant('gt.zip/COVID19_line_list_data.csv', index_col=0)
time_series_covid19_confirmed = _read_csv_tolerant('gt.zip/time_series_covid_19_confirmed.csv', index_col=0)
time_series_covid19_deaths = _read_csv_tolerant('gt.zip/time_series_covid_19_deaths.csv', index_col=0)
time_series_covid19_recovered = _read_csv_tolerant('gt.zip/time_series_covid_19_recovered.csv', index_col=0)

## <font color='mediumblue'>1. EDA *covid19_data* dataset</font>

In [None]:
#covid19_data = pd.read_csv('D:/User/ProgramsData/Anaconda2019_10/Data/Corona/covid_19_data.csv',index_col = 0)

<font color='mediumblue'>Let's take a quick peek</font>

In [None]:
# Show the last rows of the raw table as a quick sanity check of the load
covid19_data.tail()

`SNo : serial number`

In [None]:
# Move the current index into a regular column.
# NOTE(review): covid_19_data.csv is read without index_col, so this adds an
# extra 'index' column next to 'SNo' (and that column is summed by the later
# groupby cells) - confirm this is intended, otherwise use drop=True.
covid19_data = covid19_data.reset_index()

In [None]:
# data types of every column as parsed by read_csv (the date columns are
# converted to datetime in a later cell)
covid19_data.dtypes

<font color='mediumblue'> Let's change the names of the columns that contain a space and/or an uncommon symbol (like 'Last Update' and 'Province/State'), because such names can sometimes cause errors:</font>

In [None]:
# Replace the column labels containing a space or a slash, all in one call
covid19_data.rename(
    columns={
        'Last Update': 'LastUpdate',
        'Province/State': 'Province_State',
        'Country/Region': 'Country_Region',
    },
    inplace=True,
)

<font color='mediumblue'>Now let's look at the unique values:</font>

In [None]:
# Distinct values of every column, stored as one Series per column
# (each Series is named after the column it describes)

col_names = covid19_data.columns.tolist()
lst = [
    pd.Series(data=covid19_data[column].unique(), name=column)
    for column in col_names
]

In [None]:
# Turn the list of Series into a one-column table: one row per feature, with
# all of its distinct values joined by '___'

def _join_row_values(row):
    """Concatenate the non-missing entries of *row* as strings."""
    return '___'.join(row.dropna().astype(str))


unique_frame = pd.DataFrame(lst)
unique_frame['Unique_values'] = unique_frame.apply(_join_row_values, axis=1)

lst = unique_frame.loc[:, ['Unique_values']].copy()
lst.index.name = 'Features'

In [None]:
# Column definitions: the named index first, then every data column
columns_defs = [{'field': lst.index.name}]
columns_defs.extend({'field': column} for column in lst.columns)

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# Interactive table listing the unique values of every feature
g = Grid(
    grid_data=lst,
    grid_options=grid_options,
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=1000,
    height=300,
    center=False,
    columns_fit='auto',
)

g

<font color='mediumblue'> Taking a closer look at the date columns, I see that `ObservationDate` has an entry called 'alia', and `LastUpdate` has an entry with the malformed value '2/1/20201/31/2020 23:59'.</font>

<font color='mediumblue'>Pass date columns to right format, and replace bad entries with nan:</font>

In [None]:
#covid19_data[covid19_data['LastUpdate']=='2/1/20201/31/2020 23:59']

In [None]:
# Convert both date columns to datetime; entries that cannot be parsed
# become NaT instead of raising
for date_column in ('ObservationDate', 'LastUpdate'):
    covid19_data[date_column] = pd.to_datetime(covid19_data[date_column], errors='coerce')

In [None]:
# confirm that the two date columns now have a datetime dtype
covid19_data.dtypes

In [None]:
# shape of data: report rows and columns separately
row_count, column_count = covid19_data.shape
print('Covid19_data number of rows:', row_count)
print('Covid19_data number of columns:', column_count)

In [None]:
# missing values: absolute count and percentage of rows, per column
missing = covid19_data.isna().sum()
missing_perc = (missing / len(covid19_data) * 100).round(3)
missing_values = pd.concat(
    [missing, missing_perc],
    axis=1,
    keys=['missing_values', 'missing_values_%'],
)
missing_values

<font color='mediumblue'>Great majority of missing values for `Province/State` because not all countries have a province/state</font>

In [None]:
# Timespan covered by the observations
observation_dates = covid19_data['ObservationDate']
print('Start observation date:', observation_dates.min())
print('End observation Date:', observation_dates.max())

<font color='mediumblue'>Calculate `ActiveCases = Confirmed - Recovered - Deaths` (cases without the outcome of either deaths or recovery)</font>
<br>
**source:** https://towardsdatascience.com/analyzing-coronavirus-covid-19-data-using-pandas-and-plotly-2e34fe2c4edc

In [None]:
# Active cases = confirmed cases that have neither recovered nor died
covid19_data['ActiveCases'] = (
    covid19_data['Confirmed']
    .sub(covid19_data['Recovered'])
    .sub(covid19_data['Deaths'])
)

<font color='mediumblue'>Take a quick peek:</font>

In [None]:
# One grid column per dataframe column
columns_defs = [{'field': column} for column in covid19_data.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# The single menu button only carries the table caption
buttons = [{'name': 'Table 1. Covid19_data'}]

g = Grid(
    grid_data=covid19_data,
    grid_options=grid_options,
    menu={'buttons': buttons},
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=1000,
    height=500,
    center=False,
    columns_fit='auto',
)

g

<font color='mediumblue'>Now I want to analyze data per country. Because we have a column with Province_State, we will end up with several duplicates for the `ObservationDate`. Therefore the next analysis will not include the provinces but the values of the columns for each country will be the total sum, meaning that the provinces values are included.</font>

In [None]:
# Collapse the provinces: one row per (country, observation date) holding the
# sum over all provinces of that country
covid19_data_no_province = (
    covid19_data
    .drop(['Province_State', 'LastUpdate'], axis=1)
    .groupby(['Country_Region', 'ObservationDate'])
    .sum()
    .reset_index()
    .sort_values(by=['Country_Region', 'ObservationDate', 'Confirmed'])
)
# create column with new cases per day: day-to-day difference of 'Confirmed'
# within each country; the first day of a country keeps its 'Confirmed' value
confirmed_totals = covid19_data_no_province['Confirmed']
daily_difference = covid19_data_no_province.groupby(['Country_Region'])['Confirmed'].diff()
covid19_data_no_province['NewCasesPerDay'] = daily_difference.fillna(confirmed_totals)


<font color='mediumblue'>Let's take a quick look at the dataset:</font>

In [None]:
# One grid column per dataframe column
columns_defs = [{'field': column} for column in covid19_data_no_province.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# The single menu button only carries the table caption
buttons = [{'name': 'Table2. Covid19_data excluding Provinces_State'}]

g = Grid(
    grid_data=covid19_data_no_province,
    grid_options=grid_options,
    menu={'buttons': buttons},
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=1000,
    height=500,
    center=False,
    columns_fit='auto',
)

g


<font color='mediumblue'>Now let's look at the Confirmed, Deaths and Recovered per country:</font> 

In [None]:

# Largest value every metric reached per country.
# The columns are selected with a *list*: selecting groupby columns with a bare
# tuple of names was deprecated in pandas 1.0 and removed in 2.0.
_country_metric_columns = ['Confirmed', 'Deaths', 'Recovered', 'ActiveCases', 'NewCasesPerDay']
covid19_data_grouped = (
    covid19_data_no_province
    .groupby(['Country_Region'], as_index=False)[_country_metric_columns]
    .max()
)

# Not used by this cell; kept because other cells may refer to these names
plot_output_w = widgets.Output()
test = widgets.Output()

countries = ['All'] + sorted(covid19_data_grouped.Country_Region.unique().tolist())
columns_drop = widgets.Dropdown(options=countries,description='Select a Country',layout={'width': 'max-content'},style={'description_width': 'initial'})

display(columns_drop)

# Later cells rebind the globals `covid19_data_grouped` and `columns_drop`.
# The callback therefore works on private references to *this* cell's table
# and dropdown, so the figure keeps showing country data afterwards.
_country_totals = covid19_data_grouped
_country_dropdown = columns_drop

# (trace name / column, bar colour) for the four plotted metrics
_country_bar_colours = (
    ('Confirmed', 'blue'),
    ('Deaths', 'red'),
    ('Recovered', 'green'),
    ('ActiveCases', 'orange'),
)


def _country_period_suffix(dates):
    """Return ' (first until last)' for the span of *dates*, or '' if none are valid."""
    parsed = pd.to_datetime(dates, errors='coerce')
    first, last = parsed.min(), parsed.max()
    if pd.isna(first) or pd.isna(last):
        return ''
    return ' (%s until %s)' % (first.strftime('%d-%b-%Y'), last.strftime('%d-%b-%Y'))


# Derived from the data so the titles stay correct when the dataset is updated
_country_period = _country_period_suffix(covid19_data_no_province['ObservationDate'])


def _add_country_bars(fig, frame, row, **bar_kwargs):
    """Add one bar trace per metric of *frame* to subplot row *row* of *fig*."""
    for metric, colour in _country_bar_colours:
        fig.add_trace(
            go.Bar(name=metric, x=frame['Country_Region'], y=frame[metric], text=frame[metric],
                   textposition='auto', marker_color=colour, **bar_kwargs),
            row=row, col=1,
        )


def plot_country(country=''):
    """Show the per-country bar chart (top) and the top-15 chart (bottom).

    country: 'All' to plot every country, otherwise one ``Country_Region``
        value. When empty, the current selection of this cell's dropdown is
        used, so a call without arguments behaves as before.
    """
    selection = country or _country_dropdown.value
    if selection == 'All':
        df1 = _country_totals
    else:
        df1 = _country_totals[_country_totals['Country_Region'] == selection]

    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=['Confirmed, Deaths and Recovered cases per country' + _country_period,
                        'Top 15 countries most affected by Covid19' + _country_period],
        vertical_spacing=0.5,
    )

    # Row 1: the selected country (or all of them)
    _add_country_bars(fig, df1, row=1)
    # Row 2: the 15 countries with most confirmed cases; same colours as row 1,
    # so the legend entries are not repeated
    _add_country_bars(fig, _country_totals.nlargest(15, 'Confirmed'), row=2, showlegend=False)

    fig.update_layout(barmode='group',xaxis_type='category',template='simple_white')
    fig.update_yaxes(title_text='Number of cases')
    fig.show()


interactive_plot = widgets.interactive_output(plot_country,{'country':columns_drop})
display(interactive_plot)

In [None]:
# Daily totals per country. The columns are selected with a *list*: selecting
# groupby columns with a bare tuple was deprecated in pandas 1.0 and removed in 2.0.
_series_metric_columns = ['Confirmed', 'Deaths', 'Recovered', 'ActiveCases', 'NewCasesPerDay']
covid19_data_grouped_date = (
    covid19_data_no_province
    .groupby(['Country_Region', 'ObservationDate'], as_index=False)[_series_metric_columns]
    .sum()
)
countries = ['All'] + sorted(covid19_data_grouped_date.Country_Region.unique().tolist())
columns_drop = widgets.Dropdown(options=countries,description='Select a Country',layout={'width': 'max-content'},style={'description_width': 'initial'})
display(columns_drop)

# (column, subplot row, subplot column, colour) for each time series
_series_layout = (
    ('Confirmed', 1, 1, 'blue'),
    ('Deaths', 1, 2, 'red'),
    ('Recovered', 2, 1, 'green'),
    ('ActiveCases', 2, 2, 'orange'),
    ('NewCasesPerDay', 3, 1, 'magenta'),
)


def time_series(countries):
    """Plot the five metrics over time for one country or for all of them.

    countries: the dropdown selection passed in by ``interactive_output``;
        'All' plots the per-date sum over every country. The argument is used
        directly instead of reading the global ``columns_drop``, which other
        cells rebind to a different widget.
    """
    if countries == 'All':
        # Sum only the metric columns: the string 'Country_Region' column
        # must not take part in the aggregation
        df1 = (
            covid19_data_grouped_date
            .groupby('ObservationDate', as_index=False)[_series_metric_columns]
            .sum()
        )
    else:
        df1 = covid19_data_grouped_date[covid19_data_grouped_date['Country_Region'] == countries]

    fig=make_subplots(rows=3,cols=2,subplot_titles=['Confirmed cases','Deaths','Recovered','Active cases','New cases per 24h'],vertical_spacing=0.2,column_widths=[0.5, 0.5])

    x = df1['ObservationDate'].dt.date
    for metric, row, col, colour in _series_layout:
        fig.add_trace(
            go.Scatter(x=x, y=df1[metric], showlegend=False, mode='markers + lines', marker_color=colour),
            row=row, col=col,
        )
    fig.update_layout(template='simple_white',height=800, width=1500)
    fig.update_xaxes(title_text='Date')
    fig.update_yaxes(title_text='Number of cases')
    fig.show()


interactive_plot = widgets.interactive_output(time_series,{'countries':columns_drop})
display(interactive_plot)
        


<font color='mediumblue'>Let's bring back the provinces:</font>

In [None]:
# Work on a copy so the province-level analysis leaves covid19_data untouched
covid19_data_provinces = covid19_data.copy()

In [None]:
# not all countries have provinces or states: keep only the rows that do
covid19_data_provinces = covid19_data_provinces.dropna(subset=['Province_State'])

In [None]:
# calculate again new cases per 24 h
# numeric_only=True: this frame still contains the datetime 'LastUpdate'
# column, which cannot be summed (pandas >= 2.0 raises TypeError for it;
# older versions dropped it silently, which is the result kept here).
covid19_data_provinces = (
    covid19_data_provinces
    .groupby(['Country_Region', 'Province_State', 'ObservationDate'])
    .sum(numeric_only=True)
    .reset_index()
)
# create column with new cases per day: day-to-day difference of 'Confirmed'
# within each province; the first day of a province keeps its 'Confirmed' value
covid19_data_provinces['NewCasesPerDay'] = (
    covid19_data_provinces
    .groupby(['Country_Region', 'Province_State'])['Confirmed']
    .diff()
    .fillna(covid19_data_provinces['Confirmed'])
)

In [None]:
# One grid column per dataframe column
columns_defs = [{'field': column} for column in covid19_data_provinces.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# The single menu button only carries the table caption
buttons = [
    {'name': 'Table3. Covid19_data including Provinces_State and excluding countries without Province_State'},
]

g = Grid(
    grid_data=covid19_data_provinces,
    grid_options=grid_options,
    menu={'buttons': buttons},
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=1000,
    height=500,
    center=False,
    columns_fit='auto',
)

g

<font color='mediumblue'>In this case `NewCasesPerDay` has negative values (you can filter for values less than zero in the table above, Table3.), which appears to be the result of the `Confirmed` counts of some provinces not being cumulative. The worst case is French Polynesia, which has `Confirmed` = 15 on 2020-03-22, 19874 on the next day, and then decreases to 25 on 2020-03-24 (again, you can filter dates and provinces in Table1, which has all the data). The countries affected by this issue are: Australia (From Diamond Princess, Northern Territory, Queensland), Canada (Alberta), France (French Guiana, French Polynesia, Guadeloupe, Mayotte, Reunion), Mainland China (Guizhou), Others (Diamond Princess cruise ship), US (Fairfield County, Grand Princess, Lackland, Nevada, NY, Omaha, Rockingham, Travis, Utah and Washington), making a total of 25 rows with negative new cases of covid19 per day. I will just include everything in the next visualizations and see if I can spot something more.
<br>
Also there is a bad entry for `Country_Region` and `Province_State`, so let's drop it:</font>

In [None]:
#covid19_data_provinces[covid19_data_provinces['Country_Region']=='1.0']

In [None]:
#covid19_data_provinces = covid19_data_provinces.drop(0,axis=0)

In [None]:
# Largest value every metric reached per (country, province).
# The columns are selected with a *list*: selecting groupby columns with a bare
# tuple of names was deprecated in pandas 1.0 and removed in 2.0.
_province_metric_columns = ['Confirmed', 'Deaths', 'Recovered', 'ActiveCases', 'NewCasesPerDay']
covid19_data_grouped = (
    covid19_data_provinces
    .groupby(['Country_Region', 'Province_State'], as_index=False)[_province_metric_columns]
    .max()
)

# Not used by this cell; kept because other cells may refer to these names
plot_output_w = widgets.Output()
test = widgets.Output()

countries = sorted(covid19_data_grouped.Country_Region.unique().tolist())
# Not used yet: prepared for a possible province dropdown
provinces = sorted(covid19_data_grouped.Province_State.unique().tolist())
columns_drop = widgets.Dropdown(options=countries,description='Select a Country',layout={'width': 'max-content'},style={'description_width': 'initial'})

display(columns_drop)

# Private references to this cell's table and dropdown, so the callback does
# not depend on globals that other cells rebind
_province_totals = covid19_data_grouped
_province_dropdown = columns_drop

# (trace name / column, bar colour) for the four plotted metrics
_province_bar_colours = (
    ('Confirmed', 'blue'),
    ('Deaths', 'red'),
    ('Recovered', 'green'),
    ('ActiveCases', 'orange'),
)


def plot_country(country='',provinces=''):
    """Show grouped bars of the four metrics for every province of one country.

    country: the ``Country_Region`` to plot. When empty, the current selection
        of this cell's dropdown is used, so a call without arguments behaves
        as before.
    provinces: accepted for backward compatibility; not used.
    """
    selection = country or _province_dropdown.value
    df1 = _province_totals[_province_totals['Country_Region'] == selection]

    fig = go.Figure()
    for metric, colour in _province_bar_colours:
        fig.add_trace(go.Bar(name=metric, x=df1['Province_State'], y=df1[metric], text=df1[metric],
                             textposition='auto', marker_color=colour))
    fig.update_layout(barmode='group',xaxis_type='category',template='simple_white',height=700)
    fig.update_xaxes(title_text='Province_State')
    fig.update_yaxes(title_text='Number of cases')
    fig.show()


interactive_plot = widgets.interactive_output(plot_country,{'country':columns_drop})
display(interactive_plot)

<font color='mediumblue'>Insights:
 - `Australia`: Diamond Princess cruise ship and From Diamond Princess are the same?
 - `Austria`: has 'None' as province with values
 - `Denmark`: has also Denmark in Provinces_State
 - `Iraq`: has 'None' as province with no values
 - `Israel`: has 'From Diamond Princess' as province. Is this the cruise?
 - `Lebanon`: has 'None' as province with values
 - `UK` : has 'UK' and 'United Kingdom' as provinces. Are they not the same?
 - Other countries have the countries as provinces.
    
So some cleaning to do in this part!</font>
    

<font color='mediumblue'>Main Insights covid19:
- `Australia`: Most cases are from 'New South Wales'
- `Canada`: Most of the cases are from 'Quebec'
- `France`: According to the data, 'French Polynesia' has more cases of covid19 than France...
- `Mainland China`: As expected, the great majority of the covid19 are in 'Hubei'
- `US`: Great majority of the cases are in New York.</font>

## <font color='Blue'>2. EDA *covid19_line_list_data* dataset</font>

In [None]:
#covid19_line_list_data = pd.read_csv('gt.zip/COVID19_line_list_data.csv',index_col=0,error_bad_lines=False)

In [None]:
# Show the first three rows of the line-list table
covid19_line_list_data.head(3)

In [None]:
# missing values: absolute count and percentage of rows, per column
missing = covid19_line_list_data.isna().sum()
missing_perc = (missing / len(covid19_line_list_data) * 100).round(1)
missing_values = pd.concat(
    [missing, missing_perc],
    axis=1,
    keys=['missing_values', 'missing_values_%'],
)
missing_values


<font color='mediumblue'> Let's drop  the cols that have 100% of missing values and leave the others as they are (for now).</font>

In [None]:
# drop cols with 100% of missing values.
unwanted_columns = [
    'Unnamed: 3', 'link',
    'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23',
    'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26',
]
covid19_line_list_data = covid19_line_list_data.drop(columns=unwanted_columns)

In [None]:
# One grid column per dataframe column
columns_defs = [{'field': column} for column in covid19_line_list_data.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# Interactive view of the cleaned line-list table
g = Grid(
    grid_data=covid19_line_list_data,
    grid_options=grid_options,
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=1000,
    height=500,
    center=False,
)

g

In [None]:
# Column dtypes of the line-list table (the date columns are converted to
# datetime in a later cell)
covid19_line_list_data.dtypes

<font color='mediumblue'> I am going to change the name of the columns that have a space in between names (like 'reporting date') because sometimes it can cause errors:</font>

In [None]:
# change name columns: replace the labels that contain a space, in one call
covid19_line_list_data.rename(
    columns={
        'reporting date': 'ReportingDate',
        'visiting Wuhan': 'visiting_Wuhan',
        'from Wuhan': 'from_Wuhan',
    },
    inplace=True,
)


<font color='mediumblue'> Now let's change the date columns (which are as object) to datetime:</font>

In [None]:
# change to datetime: every date-like column is parsed the same way
line_list_date_columns = (
    'ReportingDate',
    'symptom_onset',
    'hosp_visit_date',
    'exposure_start',
    'exposure_end',
)
for date_column in line_list_date_columns:
    covid19_line_list_data[date_column] = pd.to_datetime(covid19_line_list_data[date_column])


<font color='mediumblue'>Although the column `recovered` appears to be a date type variable, the following line of code keeps on getting an error:
```
    covid19_line_list_data['recovered'] = pd.to_datetime(covid19_line_list_data['recovered'])
```
<font color='mediumblue'>So let's see what's going on:</font>

In [None]:
# Frequency of each distinct entry of 'recovered', to see why it does not
# parse as a date
covid19_line_list_data['recovered'].value_counts()

<font color='mediumblue'>There are mixed dtypes in the `recovered` column, but it appears that '0' is the most common value, so maybe this is a binary feature. Also, the column `death` should not be of object dtype, so let's see what's going on:</font>

In [None]:
# Frequency of each distinct entry of 'death'
covid19_line_list_data['death'].value_counts()

<font color='mediumblue'>Ok, so we have here the same problem as with `recovered`:</font>

In [None]:
# Show only the two outcome columns side by side
outcome_columns = ['recovered', 'death']
filtered = covid19_line_list_data[outcome_columns]
columns_defs = [{'field': column} for column in filtered.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

g = Grid(
    grid_data=filtered,
    grid_options=grid_options,
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=200,
    height=500,
    center=False,
    columns_fit='auto',
)

g

<font color='mediumblue'>From the table above, it appears that every time `recovered` is a date, `death` is zero, and vice versa, so we could argue that for the `recovered` column a date means recovered and 0 means not recovered (with the same reasoning for the `death` column). However, `recovered` and `death` are both zero most of the time, so this is a tricky one. Let it stay the way it is for now; I will look at other variables instead of `recovered` and `death`, because we simply don't have enough data for these two features.</font>

<font color='mediumblue'>Let's look at the symptoms:</font>


In [None]:
# Keep only the columns needed for the symptom analysis and turn the index
# back into a regular column
symptom_analysis_columns = [
    'ReportingDate', 'country', 'gender', 'age',
    'symptom_onset', 'hosp_visit_date',
    'visiting_Wuhan', 'from_Wuhan', 'symptom',
]
filtered = covid19_line_list_data[symptom_analysis_columns].reset_index()

In [None]:
# Display the reduced table
filtered

In [None]:
# Number of rows per (symptom, gender) pair, then one sub-table per gender
filtered_grouped = (
    filtered
    .groupby(['symptom', 'gender'])
    .size()
    .reset_index(name='counts')
)
gender_labels = filtered_grouped['gender']
filtered_grouped_male = filtered_grouped.loc[gender_labels == 'male']
filtered_grouped_female = filtered_grouped.loc[gender_labels == 'female']

In [None]:
# One grid column per dataframe column
columns_defs = [{'field': column} for column in filtered_grouped.columns]

# Panels displayed in the grid's status bar
status_panels = [
    {'statusPanel': 'agTotalAndFilteredRowCountComponent', 'align': 'left'},
    {'statusPanel': 'agTotalRowCountComponent', 'align': 'center'},
    {'statusPanel': 'agFilteredRowCountComponent'},
    {'statusPanel': 'agSelectedRowCountComponent'},
    {'statusPanel': 'agAggregationComponent'},
]

grid_options = {
    'columnDefs': columns_defs,
    'enableSorting': True,
    'enableFilter': True,
    'enableColResize': True,
    'enableRangeSelection': True,
    'enableValue': True,
    'statusBar': {'statusPanels': status_panels},
}

# Interactive view of the symptom counts per gender
g = Grid(
    grid_data=filtered_grouped,
    grid_options=grid_options,
    theme='ag-theme-fresh',
    quick_filter=True,
    show_toggle_delete=False,
    show_toggle_edit=False,
    index=True,
    width=800,
    height=500,
    center=False,
)

g

In [None]:
# Grouped bars: symptom counts for males and females side by side
fig = go.Figure()
gender_traces = (
    ('Male', filtered_grouped_male, 'lime'),
    ('Female', filtered_grouped_female, 'magenta'),
)
for trace_name, counts_frame, colour in gender_traces:
    fig.add_trace(go.Bar(name=trace_name, x=counts_frame.symptom, y=counts_frame.counts, marker_color=colour))
fig.update_layout(barmode='group', template='simple_white', width=1300, height=900)
fig.update_xaxes(title_text='Symptoms')
fig.update_yaxes(title_text='Count')
fig.show()

<font color='mediumblue'>There are several symptoms, but the most common ones, as expected, are fever and cough. It appears that males have more complaints than females. There is also a typo for fever ('feve\').</font>

<font color='mediumblue'> Let's include age:</font>

In [None]:
# Range of the reported ages
ages = filtered['age']
print('Minimum age:', ages.min())
print("maximum age:", ages.max())

In [None]:
# Ten-year age brackets with edges 0, 10, ..., 100
bins = list(range(0, 101, 10))
filtered['age_binned'] = pd.cut(filtered['age'], bins=bins)
filtered

In [None]:
# Number of rows per (symptom, age bracket).
# 'age_binned' is categorical; observed=False spells out the behaviour the
# original call relied on implicitly (every bracket is kept, including those
# with a count of 0) and avoids the FutureWarning newer pandas versions emit
# when the argument is omitted.
filtered_grouped_age = (
    filtered
    .groupby(['symptom', 'age_binned'], observed=False)
    .size()
    .reset_index(name='counts')
)
# px.bar builds the complete figure, so no empty go.Figure() is created first
fig = px.bar(filtered_grouped_age, x='symptom', y='counts',color='age_binned',width=1300,height=900,template='simple_white')

fig.show()

(to be continued)