In [1]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")

In [2]:
#Importing the necessary libraries for the EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Saving the raw data of the covid19 recovered cases & confirmed cases to their respective variables
#This data is updated on a daily basis from data source as per the respective url's: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
recovered_csv="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
confirmed_csv="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

In [4]:
#Reading in the data to dataframes
recovered_timeseries_df=pd.read_csv(recovered_csv)
confirmed_timeseries_df=pd.read_csv(confirmed_csv)

In [5]:
recovered_timeseries_df.head()

In [6]:
confirmed_timeseries_df.head()

In [7]:
#Confirming that the column names in the two datasets are the same
# Index.equals() answers with a single True/False and also copes with frames of
# different widths; `==` compares element by element and raises when the lengths differ.
confirmed_timeseries_df.columns.equals(recovered_timeseries_df.columns)

In [8]:
#Since the columns in the df's are the same we can come up with a function to massage the data
#we shall drop columns Province/State,Lat,Long as we don't require it in our analysis
def reshape_data(recovered_timeseries_df,confirmed_timeseries_df):
    """Turn the two wide time-series tables into long frames indexed by day.

    Both inputs must carry the identifier columns 'Province/State',
    'Country/Region', 'Lat' and 'Long'; every other column is treated as one
    day of counts whose header is a date string.  Each frame is melted to one
    row per (location, day), 'Province/State', 'Lat' and 'Long' are dropped,
    and the day becomes the index, formatted as a 'dd/mm/YYYY' string.

    The returned index is deliberately left unnamed.  Whether the name 'Date'
    survives ``DatetimeIndex.strftime`` depends on the pandas version, and the
    cells that call ``reset_index()`` on these frames look for the default
    column names ('level_0' / 'index') that only an unnamed index produces.

    Parameters
    ----------
    recovered_timeseries_df : pandas.DataFrame
        Wide table of recovered counts.
    confirmed_timeseries_df : pandas.DataFrame
        Wide table of confirmed counts.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(confirmed, recovered)`` -- note that this is the reverse of the
        parameter order.  The frames hold 'Country/Region' plus
        'Confirmed_cases' or 'Recovered_cases' respectively.
    """
    def _to_long(wide_df,value_name):
        # Unpivot the per-day columns into rows, then index the rows by day string.
        long_df=wide_df.melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='Date',value_name=value_name)
        long_df=long_df.drop(columns=['Province/State','Lat','Long'])
        day_labels=pd.to_datetime(long_df.pop('Date')).dt.strftime('%d/%m/%Y')
        # to_numpy() sheds the Series name, so the index is unnamed on every pandas version.
        long_df.index=pd.Index(day_labels.to_numpy(),name=None)
        return long_df

    confirmed=_to_long(confirmed_timeseries_df,'Confirmed_cases')
    recovered=_to_long(recovered_timeseries_df,'Recovered_cases')
    return (confirmed,recovered)

In [9]:
Confirmed_data,Recovered_data=reshape_data(recovered_timeseries_df,confirmed_timeseries_df)

In [10]:
Confirmed_data.head()

In [11]:
Recovered_data.head()

In [12]:
Recovered_data.shape

In [13]:
Confirmed_data.info()

In [14]:
Recovered_data.info()

In [15]:
#Filtering the countries with a high number of confirmed cases registered for the month of April 2020
# A country can occupy several rows per day (the later cells sum these duplicates), so the rows are
# first added up into one total per day and country; taking the maximum over the raw rows would rank
# such a country by its largest single row instead of its national count.
April_Confirmed=Confirmed_data['01/04/2020':'30/04/2020']
April_Country_Totals=April_Confirmed.groupby([April_Confirmed.index,'Country/Region'])[['Confirmed_cases']].sum()
High_Cases_Confirmed=April_Country_Totals.groupby(level='Country/Region').max().sort_values(by="Confirmed_cases",ascending=False).head(5)

In [16]:
High_Cases_Confirmed.index

In [17]:
#Create list of the countries with the highest no of confirmed cases as at end of April
Countries_List=High_Cases_Confirmed.index.to_list()

In [18]:
Countries_List

In [19]:
#Filtering the confirmed cases data with Countries list for the month of April 2020 & summing over Confirmed_cases for any cases  of duplicated Country/Region per day
def April_data(data,count_column,countries=None,start='01/04/2020',end='30/04/2020'):
    """Daily totals of ``count_column`` per country for a window of days.

    Rows are kept when their 'Country/Region' is one of ``countries`` and
    their index label (a 'dd/mm/YYYY' string) falls between ``start`` and
    ``end`` inclusive.  Rows that share a day and a country are summed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by 'dd/mm/YYYY' day strings with a 'Country/Region'
        column and the column named by ``count_column``.
    count_column : str
        Name of the column to sum.
    countries : iterable of str, optional
        Countries to keep.  Defaults to the notebook-level ``Countries_List``.
    start, end : str
        First and last day of the window as 'dd/mm/YYYY'.  The defaults span
        April 2020.

    Returns
    -------
    pandas.DataFrame
        Indexed by day string (index name 'Date'), in chronological order and
        by country within a day, with columns 'Country/Region' and
        ``count_column``.
    """
    if countries is None:
        countries=Countries_List
    day_format='%d/%m/%Y'
    # Compare real dates rather than slicing by label: a label slice only works while
    # the rows of each day happen to sit next to each other in the frame.
    row_days=pd.to_datetime(data.index,format=day_format)
    first_day=pd.to_datetime(start,format=day_format)
    last_day=pd.to_datetime(end,format=day_format)
    wanted=data['Country/Region'].isin(countries).to_numpy()&(row_days>=first_day)&(row_days<=last_day)
    subset=data[wanted]
    # Naming the grouping key here makes the result independent of whether the
    # incoming index carries a name (an unnamed key would come back as 'level_0').
    totals=subset.groupby([subset.index.rename('Date'),'Country/Region'])[[count_column]].sum()
    totals=totals.reset_index(level='Country/Region')
    # groupby sorts the day strings as text; a stable sort on the parsed dates restores
    # calendar order for windows that cross a month boundary and is a no-op otherwise.
    calendar_order=pd.to_datetime(totals.index,format=day_format).to_numpy().argsort(kind='stable')
    return totals.iloc[calendar_order]

In [20]:
Highest_Cases_data=April_data(Confirmed_data,'Confirmed_cases')

In [21]:
Highest_Cases_data.index

In [22]:
Highest_Cases_data

In [23]:
Highest_Recovery_data=April_data(Recovered_data,'Recovered_cases')

In [24]:
# One line per country in Countries_List, each drawn in its own colour.
Color=['Red','Blue','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(14,9))
for country,line_colour in zip(Countries_List,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    # label= ties every legend entry to the line it describes instead of relying on plotting order
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_colour,label=country)
ax.legend(loc='best')
ax.tick_params(axis='x',labelrotation=90)
ax.set_title('Most Affected Countries ( Confirmed April Cases)')
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()
    

In [25]:
'''Since the US cases at 01/04/2020 is higher than the highest record among the other countries as at 30/04/2020, it would be 
more beneficial if we have two plots one for the US data & another for the other 4 countries'''

In [26]:
#Us confirmed cases in the month of April

In [27]:
# Rows of the April totals that belong to the US, plotted on their own scale.
US_cases=Highest_Cases_data[Highest_Cases_data['Country/Region']=='US']
fig,ax=plt.subplots(figsize=(15,5))
ax.plot(US_cases.index,US_cases['Confirmed_cases'],color='blue',marker='s')
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,1200000])
ax.set_title('US Confirmed April Cases')
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [28]:
#Plot of the other countries with the highest cases for the month of April

In [29]:
Countries_List_2=Countries_List[1:]

In [30]:
Countries_List_2

In [31]:
# One line per country in Countries_List_2, each drawn in its own colour.
Color=['Red','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(15,5))
for country,line_colour in zip(Countries_List_2,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    # label= ties every legend entry to the line it describes instead of relying on plotting order
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_colour,label=country)
ax.legend(loc='best')
ax.tick_params(axis='x',labelrotation=90)
ax.set_title('Most Affected Countries Except US( Confirmed April Cases)')
ax.set_ylim([0,250000])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [32]:
###Confirmed vs recovered for the whole dataset

In [33]:
Recovered_data.shape

In [34]:
Confirmed_data.shape

In [35]:
#Summing for values with duplicated region/Country
Confirmed_data_April=Confirmed_data.groupby([Confirmed_data.index,'Country/Region']).agg({'Confirmed_cases': 'sum'})

In [36]:
# The first index level holds 'dd/mm/YYYY' strings and the grouped index is sorted as text, so a
# label slice from '01/04/2020' to '30/04/2020' is an alphabetical range that also takes in days of
# other months (e.g. '15/03/2020').  Parse the labels and compare real dates instead.
Confirmed_Days=pd.to_datetime(Confirmed_data_April.index.get_level_values(0),format='%d/%m/%Y')
Confirmed_data_April=Confirmed_data_April[(Confirmed_Days>='2020-04-01')&(Confirmed_Days<='2020-04-30')]

In [37]:
Confirmed_data_April.tail()

In [38]:
Confirmed_data_April.head()

In [39]:
Confirmed_Country_Max = Confirmed_data_April.groupby(['Country/Region']).max()

In [40]:
Confirmed_Country_Max.reset_index(inplace=True)

In [41]:
Confirmed_Country_Max.head()

In [42]:
#Summing for values with duplicated region/Country
Recovered_data_April = Recovered_data.groupby([Recovered_data.index,'Country/Region']).agg({'Recovered_cases': 'sum'})

In [43]:
# The first index level holds 'dd/mm/YYYY' strings and the grouped index is sorted as text, so a
# label slice from '01/04/2020' to '30/04/2020' is an alphabetical range that also takes in days of
# other months (e.g. '15/03/2020').  Parse the labels and compare real dates instead.
Recovered_Days=pd.to_datetime(Recovered_data_April.index.get_level_values(0),format='%d/%m/%Y')
Recovered_data_April=Recovered_data_April[(Recovered_Days>='2020-04-01')&(Recovered_Days<='2020-04-30')]

In [44]:
#Getting the maximum recovery cases
Recovered_Country_Max = Recovered_data_April.groupby(['Country/Region']).max()

In [45]:
Recovered_Country_Max.reset_index(inplace=True)

In [46]:
Recovered_Country_Max.head()

In [47]:
# Work on a copy: a plain assignment would only alias Confirmed_Country_Max, and the columns
# added in the following cells would then appear on that frame as well.
Confirmed_Recovered_data=Confirmed_Country_Max.copy()

In [48]:
# Look each country's recovered maximum up by name.  A direct column assignment pairs the two
# frames by row number, which is only right while both list exactly the same countries in the same order.
Confirmed_Recovered_data['Recovered_cases']=Confirmed_Recovered_data['Country/Region'].map(Recovered_Country_Max.set_index('Country/Region')['Recovered_cases'])

In [49]:
Confirmed_Recovered_data.head()

In [50]:
#Getting the recovery rates per country column

In [51]:
Confirmed_Recovered_data['Recovery rate']=Confirmed_Recovered_data['Recovered_cases']/Confirmed_Recovered_data['Confirmed_cases']

In [52]:
Confirmed_Recovered_data.head()

In [53]:
#Visualizing the recovery rate of the Most affected countries in the world(8 Countries)

In [54]:
Countries_Most_Affected=Confirmed_Recovered_data.groupby("Country/Region").max().sort_values(by="Confirmed_cases",ascending=False).head(8)

In [55]:
Countries_Most_Affected.reset_index(inplace=True)

In [56]:
Countries_Most_Affected.head()

In [57]:
sns.barplot(x='Country/Region',y='Recovery rate',data=Countries_Most_Affected,order=['United Kingdom','Russia','US','France','Brazil','Italy','Spain','Germany'])
plt.xticks(rotation='vertical')

In [58]:
#Interpretation, Results & Conclusion

In [59]:
#Analysis of Kenya data: what measures were put in place (graph them) and when? How have they affected the incidence rate (new infections)?

In [60]:
Kenyan_Confirmed=Confirmed_data[Confirmed_data['Country/Region']=='Kenya']

In [61]:
Kenyan_Confirmed=Kenyan_Confirmed['01/04/2020':'30/04/2020']

In [62]:
Kenyan_Confirmed.head()

In [63]:
Kenyan_Confirmed.reset_index(inplace=True)

In [64]:
Kenyan_Confirmed.head()

In [65]:
# reset_index() calls the new column 'index' when the index had no name and 'Date' when it kept
# that name (which of the two happens depends on the pandas version); both are mapped so the
# 'Day' column used by the following cells always exists.
Kenyan_Confirmed=Kenyan_Confirmed.rename(columns={'index':'Day','Date':'Day'})

In [66]:
Kenyan_Confirmed.head()

In [67]:
#Plot of confirmed cases over_time

In [68]:
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(y='Confirmed_cases',x='Day',ax=ax,data=Kenyan_Confirmed,marker='s')
plt.xticks(rotation="vertical")
ax.set_ylim([0,450])
sns.set_style("darkgrid")
plt.title("Total Confirmed Cases")
plt.show()

In [69]:
Kenyan_Confirmed['Daily_Confirmed_cases']=Kenyan_Confirmed['Confirmed_cases'].diff()

In [70]:
Kenyan_Confirmed.head()

In [71]:
#Dropping the first day
Kenyan_Daily_Confirmed=Kenyan_Confirmed.dropna(axis=0)

In [72]:
Kenyan_Daily_Confirmed.head()

In [73]:
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(y='Daily_Confirmed_cases',x='Day',data=Kenyan_Daily_Confirmed,marker='o')
plt.xticks(rotation="vertical")
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("April-Kenya Daily Cases")
plt.show()

In [74]:
#Kenya testing data scraped from daily press briefings by the Ministry of Health, Kenya

In [75]:
Dates=Kenyan_Confirmed['Day'].to_list()

In [76]:
#Converting the dates to a list
#It is worth noting that some days had missing information of test data thus we shall drop those days

In [77]:
#Dropping days with missing test data

In [78]:
Dates=Dates[1:]

In [79]:
Test_Missing=['06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020']
Test_Dates=[item for item in Dates if item not in Test_Missing]

In [80]:
Kenyan_tested=[662,362,372,530,696,308,504,491,766,674,694,803,1115,1330,545,707,668,946,508,777]

In [81]:
d={'Test Days':Test_Dates,'Count Sample Tests':Kenyan_tested}

In [82]:
df_Tests=pd.DataFrame(d)

In [83]:
df_Tests

In [84]:
#Visualizing the same

In [85]:
fig,ax=plt.subplots(figsize=(12,5))
sns.barplot(y='Count Sample Tests',x='Test Days',data=df_Tests,color='b')
plt.xticks(rotation="vertical")
sns.set_style("darkgrid")
plt.title("Daily Test Cases")
plt.show()

In [86]:
##Confirmed Cases & Recovery Trends of COVID-19 Among the Most Affected Countries

In [87]:
## Confirmed Cases & Recovery Trends of COVID-19 Among the Most Affected Countries

In [88]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/to/file/Covid 19 Analysis.ipynb"#Saving file 

In [89]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/to/file/Covid 19 Analysis.ipynb"

In [90]:
pwd

In [91]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"

In [92]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"

In [93]:
#Importing the necessary libraries to carry out the EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [94]:
# Importing raw datasets

In [95]:
#Saving the raw data of the covid19 recovered cases & confirmed cases to their respective variables
#This data is updated on a daily basis from data source as per the respective url's: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
recovered_csv="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv"
confirmed_csv="https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv"

In [96]:
#Reading in the data to dataframes
recovered_timeseries_df=pd.read_csv(recovered_csv)
confirmed_timeseries_df=pd.read_csv(confirmed_csv)

In [97]:
recovered_timeseries_df.head()

In [98]:
#Inspecting the data sets
recovered_timeseries_df.head()

In [99]:
confirmed_timeseries_df.head()

In [100]:
#Confirming that the column names in the two datasets are the same i.e the data structures are the same
# Index.equals() answers with a single True/False and also copes with frames of
# different widths; `==` compares element by element and raises when the lengths differ.
confirmed_timeseries_df.columns.equals(recovered_timeseries_df.columns)

In [101]:
#Since the columns in the df's are the same we can come up with a function to massage the data
#we shall drop columns Province/State,Lat,Long as we don't require it in our analysis
def reshape_data(recovered_timeseries_df,confirmed_timeseries_df):
    """Turn the two wide time-series tables into long frames indexed by day.

    Both inputs must carry the identifier columns 'Province/State',
    'Country/Region', 'Lat' and 'Long'; every other column is treated as one
    day of counts whose header is a date string.  Each frame is melted to one
    row per (location, day), 'Province/State', 'Lat' and 'Long' are dropped,
    and the day becomes the index, formatted as a 'dd/mm/YYYY' string.

    The returned index is deliberately left unnamed.  Whether the name 'Date'
    survives ``DatetimeIndex.strftime`` depends on the pandas version, and the
    cells that call ``reset_index()`` on these frames look for the default
    column names ('level_0' / 'index') that only an unnamed index produces.

    Parameters
    ----------
    recovered_timeseries_df : pandas.DataFrame
        Wide table of recovered counts.
    confirmed_timeseries_df : pandas.DataFrame
        Wide table of confirmed counts.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(confirmed, recovered)`` -- note that this is the reverse of the
        parameter order.  The frames hold 'Country/Region' plus
        'Confirmed_cases' or 'Recovered_cases' respectively.
    """
    def _to_long(wide_df,value_name):
        # Unpivot the per-day columns into rows, then index the rows by day string.
        long_df=wide_df.melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='Date',value_name=value_name)
        long_df=long_df.drop(columns=['Province/State','Lat','Long'])
        day_labels=pd.to_datetime(long_df.pop('Date')).dt.strftime('%d/%m/%Y')
        # to_numpy() sheds the Series name, so the index is unnamed on every pandas version.
        long_df.index=pd.Index(day_labels.to_numpy(),name=None)
        return long_df

    confirmed=_to_long(confirmed_timeseries_df,'Confirmed_cases')
    recovered=_to_long(recovered_timeseries_df,'Recovered_cases')
    return (confirmed,recovered)

In [102]:
sns.barplot(x='Country/Region',y='Recovery rate',data=Countries_Most_Affected,order=['United Kingdom','Russia','US','France','Brazil','Italy','Spain','Germany'])
plt.xticks(rotation='vertical')
plt.title("Recovery Rates for Countries Most Affected")

In [103]:
Confirmed_data,Recovered_data=reshape_data(recovered_timeseries_df,confirmed_timeseries_df)

In [104]:
Confirmed_data.head()

In [105]:
Recovered_data.head()

In [106]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"

In [107]:
#Importing the necessary libraries to carry out the EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [108]:
#Filtering the countries with a high number of confirmed cases registered for the month of April 2020
# A country can occupy several rows per day (the later cells sum these duplicates), so the rows are
# first added up into one total per day and country; taking the maximum over the raw rows would rank
# such a country by its largest single row instead of its national count.
April_Confirmed=Confirmed_data['01/04/2020':'30/04/2020']
April_Country_Totals=April_Confirmed.groupby([April_Confirmed.index,'Country/Region'])[['Confirmed_cases']].sum()
High_Cases_Confirmed=April_Country_Totals.groupby(level='Country/Region').max().sort_values(by="Confirmed_cases",ascending=False).head(5)

In [109]:
High_Cases_Confirmed.index

In [110]:
#Create list of the countries with the highest no of confirmed cases as at end of April
Countries_List=High_Cases_Confirmed.index.to_list()

In [111]:
Countries_List

In [112]:
#Filtering the confirmed cases data with Countries list for the month of April 2020 & summing over Confirmed_cases for any cases  of duplicated Country/Region per day
def April_data(data,count_column,countries=None,start='01/04/2020',end='30/04/2020'):
    """Daily totals of ``count_column`` per country for a window of days.

    Rows are kept when their 'Country/Region' is one of ``countries`` and
    their index label (a 'dd/mm/YYYY' string) falls between ``start`` and
    ``end`` inclusive.  Rows that share a day and a country are summed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame indexed by 'dd/mm/YYYY' day strings with a 'Country/Region'
        column and the column named by ``count_column``.
    count_column : str
        Name of the column to sum.
    countries : iterable of str, optional
        Countries to keep.  Defaults to the notebook-level ``Countries_List``.
    start, end : str
        First and last day of the window as 'dd/mm/YYYY'.  The defaults span
        April 2020.

    Returns
    -------
    pandas.DataFrame
        Indexed by day string (index name 'Date'), in chronological order and
        by country within a day, with columns 'Country/Region' and
        ``count_column``.
    """
    if countries is None:
        countries=Countries_List
    day_format='%d/%m/%Y'
    # Compare real dates rather than slicing by label: a label slice only works while
    # the rows of each day happen to sit next to each other in the frame.
    row_days=pd.to_datetime(data.index,format=day_format)
    first_day=pd.to_datetime(start,format=day_format)
    last_day=pd.to_datetime(end,format=day_format)
    wanted=data['Country/Region'].isin(countries).to_numpy()&(row_days>=first_day)&(row_days<=last_day)
    subset=data[wanted]
    # Naming the grouping key here makes the result independent of whether the
    # incoming index carries a name (an unnamed key would come back as 'level_0').
    totals=subset.groupby([subset.index.rename('Date'),'Country/Region'])[[count_column]].sum()
    totals=totals.reset_index(level='Country/Region')
    # groupby sorts the day strings as text; a stable sort on the parsed dates restores
    # calendar order for windows that cross a month boundary and is a no-op otherwise.
    calendar_order=pd.to_datetime(totals.index,format=day_format).to_numpy().argsort(kind='stable')
    return totals.iloc[calendar_order]

In [113]:
Highest_Cases_data=April_data(Confirmed_data,'Confirmed_cases')

In [114]:
Highest_Cases_data.index

In [115]:
Highest_Cases_data

In [116]:
Highest_Cases_data.head()

In [117]:
Highest_Cases_data.head()

In [118]:
Highest_Recovery_data=April_data(Recovered_data,'Recovered_cases')

In [119]:
# One line per country in Countries_List, each drawn in its own colour.
Color=['Red','Blue','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(14,9))
for country,line_colour in zip(Countries_List,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    # label= ties every legend entry to the line it describes instead of relying on plotting order
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_colour,label=country)
ax.legend(loc='best')
ax.tick_params(axis='x',labelrotation=90)
ax.set_title('Most Affected Countries ( Confirmed April Cases)')
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()
    

In [120]:
### Observation

In [121]:
####Us confirmed cases in the month of April

In [122]:
#Filtering US cases
US_cases=Highest_Cases_data[Highest_Cases_data['Country/Region']=='US']

In [123]:
# US confirmed cases for April, plotted on their own scale.
fig,ax=plt.subplots(figsize=(15,5))
ax.plot(US_cases.index,US_cases['Confirmed_cases'],color='blue',marker='s')
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,1200000])
ax.set_title('US Confirmed April Cases')
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [124]:
#US April cases plot
fig,ax=plt.subplots(figsize=(15,5))
ax.plot(US_cases.index,US_cases['Confirmed_cases'],color='blue',marker='s')
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,1200000])
ax.set_title('US Confirmed April Cases')
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [125]:
Countries_List_2=Countries_List[1:]

In [126]:
#Subsetting the Countries_List data to exclude US
Countries_List_2=Countries_List[1:]

In [127]:
Countries_List_2

In [128]:
#Plot
# One line per country in Countries_List_2, each drawn in its own colour.
Color=['Red','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(15,5))
for country,line_colour in zip(Countries_List_2,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    # label= ties every legend entry to the line it describes instead of relying on plotting order
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_colour,label=country)
ax.legend(loc='best')
ax.tick_params(axis='x',labelrotation=90)
ax.set_title('Most Affected Countries Except US( Confirmed April Cases)')
ax.set_ylim([0,250000])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed Cases")
# plt.show has to be called; the bare name only echoes the function object
plt.show()

In [129]:
#Mean of US confirmed cases in April
# Average the count column only; US_cases also holds the text column 'Country/Region',
# which has no mean.
Us_mean_April=US_cases['Confirmed_cases'].mean()

In [130]:
Us_mean_April

In [131]:
#Minimum of US confirmed cases in April
# NOTE: the value is still stored under Us_mean_April because that is the name the next cell displays.
# Only the count column is reduced; US_cases also holds the text column 'Country/Region'.
Us_mean_April=US_cases['Confirmed_cases'].min()

In [132]:
Us_mean_April

In [133]:
#Mean of US confirmed cases in April
# Average the count column only; US_cases also holds the text column 'Country/Region',
# which has no mean.
Us_mean_April=US_cases['Confirmed_cases'].mean()

In [134]:
Us_mean_April

In [135]:
#### Observation

In [136]:
#### Observations

In [137]:
#### From the above plot we can observe that:
1. Generally there seems to be an almost linear growth of confirmed cases amongst the four countries
2. On 12/04/2020 

In [138]:
#Summing for values with duplicated region/Country
Confirmed_data_April=Confirmed_data.groupby([Confirmed_data.index,'Country/Region']).agg({'Confirmed_cases': 'sum'})
# In this pass the next cell goes straight to the per-country maximum, so the April restriction
# is applied here; without it the confirmed maxima would span every date while the recovered
# maxima are limited to April.  The day labels are parsed because the grouped index is sorted as
# text and cannot be sliced by 'dd/mm/YYYY' label.
Confirmed_Days=pd.to_datetime(Confirmed_data_April.index.get_level_values(0),format='%d/%m/%Y')
Confirmed_data_April=Confirmed_data_April[(Confirmed_Days>='2020-04-01')&(Confirmed_Days<='2020-04-30')]

In [139]:
Confirmed_Country_Max = Confirmed_data_April.groupby(['Country/Region']).max()

In [140]:
Confirmed_Country_Max.reset_index(inplace=True)

In [141]:
Confirmed_Country_Max.head()

In [142]:
#Summing for values with duplicated region/Country
Recovered_data_April = Recovered_data.groupby([Recovered_data.index,'Country/Region']).agg({'Recovered_cases': 'sum'})

In [143]:
# The first index level holds 'dd/mm/YYYY' strings and the grouped index is sorted as text, so a
# label slice from '01/04/2020' to '30/04/2020' is an alphabetical range that also takes in days of
# other months (e.g. '15/03/2020').  Parse the labels and compare real dates instead.
Recovered_Days=pd.to_datetime(Recovered_data_April.index.get_level_values(0),format='%d/%m/%Y')
Recovered_data_April=Recovered_data_April[(Recovered_Days>='2020-04-01')&(Recovered_Days<='2020-04-30')]

In [144]:
#Getting the maximum recovery cases
Recovered_Country_Max = Recovered_data_April.groupby(['Country/Region']).max()

In [145]:
Recovered_Country_Max.reset_index(inplace=True)

In [146]:
Recovered_Country_Max.head()

In [147]:
#Adding a Recovered_cases column to create a new dataframe
# Start from a copy of this pass's confirmed maxima (otherwise the frame left over from an earlier
# run is silently reused), then look each country's recovered maximum up by name rather than
# pairing the two frames by row number.
Confirmed_Recovered_data=Confirmed_Country_Max.copy()
Confirmed_Recovered_data['Recovered_cases']=Confirmed_Recovered_data['Country/Region'].map(Recovered_Country_Max.set_index('Country/Region')['Recovered_cases'])

In [148]:
Confirmed_Recovered_data.head()

In [149]:
Confirmed_Recovered_data.head()

In [150]:
#Getting the recovery rates per country column

In [151]:
Confirmed_Recovered_data['Recovery rate']=Confirmed_Recovered_data['Recovered_cases']/Confirmed_Recovered_data['Confirmed_cases']

In [152]:
Confirmed_Recovered_data.head()

In [153]:
#Visualizing the recovery rate of the Most affected countries in the world(8 Countries)

In [154]:
Countries_Most_Affected=Confirmed_Recovered_data.groupby("Country/Region").max().sort_values(by="Confirmed_cases",ascending=False).head(8)

In [155]:
Countries_Most_Affected=Confirmed_Recovered_data.groupby("Country/Region").max().sort_values(by="Confirmed_cases",ascending=False).head(8)

In [156]:
Countries_Most_Affected.reset_index(inplace=True)

In [157]:
Countries_Most_Affected.head()

In [158]:
sns.barplot(x='Country/Region',y='Recovery rate',data=Countries_Most_Affected,order=['United Kingdom','Russia','US','France','Brazil','Italy','Spain','Germany'])
plt.xticks(rotation='vertical')
plt.title("Recovery Rates for Countries Most Affected")

In [159]:
# Days to leave out: 1 April (the first day, which has no daily-case figure) and the days
# for which no test data was reported.  The bare `for` header that used to follow had no
# body and could not be parsed; the days are dropped in one call instead.
Date_x=['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020']
Kenya_x=Kenyan_Confirmed.set_index('Day').drop(Date_x)

In [160]:
# drop() removes rows by index label, so the frame is indexed by day first; the first
# entry is 1 April ('01/02/2020' is not one of the April days held in the frame).
Date_x=['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020']
Kenya_x=Kenyan_Confirmed.set_index('Day').drop(Date_x)
    

In [161]:
# drop() removes rows by index label, so the frame is indexed by day first; the first
# entry is 1 April ('01/02/2020' is not one of the April days held in the frame).
Kenya_x=Kenyan_Confirmed.set_index('Day').drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
    

In [162]:
# set_index takes the column name positionally (it has no `columns=` keyword), and the
# drop must act on the re-indexed frame, not on Kenyan_Confirmed.  First entry is 1 April.
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])

In [163]:
# set_index takes the column name positionally (it has no `columns=` keyword), and the
# drop must act on the re-indexed frame, not on Kenyan_Confirmed.  First entry is 1 April.
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])

In [164]:
# The drop must act on the day-indexed frame built on the previous line, not on
# Kenyan_Confirmed, whose index does not hold the day labels.  First entry is 1 April.
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])

In [165]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
#Kenya_x=Kenyan_Confirmed.drop(['01/02/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [166]:
# The frame to drop from is Kenya_x (the name Kenyan_x is never defined); first entry is 1 April.
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [167]:
# First entry is 1 April: '01/02/2020' is not among the April days held in the frame,
# and drop() raises for a label it cannot find.
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [168]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [169]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()
len(Kenya_x)

In [170]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [171]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# df_Tests is indexed 0..n-1 while Kenya_x is indexed by day, so assigning the column directly
# matches no rows and leaves only NaN; the counts are looked up by day instead.
Kenya_x['Daily Tests']=Kenya_x.index.map(df_Tests.set_index('Test Days')['Count Sample Tests'])
Kenya_x.head()

In [172]:
df_Tests

In [173]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# A column whose name contains spaces needs bracket access (attribute access cannot be parsed).
# df_Tests is indexed 0..n-1 while Kenya_x is indexed by day, so the counts are looked up by day.
Kenya_x['Daily Tests']=Kenya_x.index.map(df_Tests.set_index('Test Days')['Count Sample Tests'])
Kenya_x.head()

In [174]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# df_Tests is indexed 0..n-1 while Kenya_x is indexed by day, so assigning the column directly
# matches no rows and leaves only NaN; the counts are looked up by day instead.
Kenya_x['Daily Tests']=Kenya_x.index.map(df_Tests.set_index('Test Days')['Count Sample Tests'])
Kenya_x.head()

In [175]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
df_Tests['Count Sample Tests']
#Kenya_x.head()

In [176]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# df_Tests is indexed 0..n-1 while Kenya_x is indexed by day, so assigning the column directly
# matches no rows and leaves only NaN; the counts are looked up by day instead.
Kenya_x['Daily Tests']=Kenya_x.index.map(df_Tests.set_index('Test Days')['Count Sample Tests'])
#Kenya_x.head()

In [177]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# df_Tests is indexed 0..n-1 while Kenya_x is indexed by day, so assigning the column directly
# matches no rows and leaves only NaN; the counts are looked up by day instead.
Kenya_x['Daily Tests']=Kenya_x.index.map(df_Tests.set_index('Test Days')['Count Sample Tests'])
Kenya_x.head()

In [178]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
a=df_Tests['Count Sample Tests']
#Kenya_x['Daily Tests']=df_Tests['Count Sample Tests']
a.astype

In [179]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
a=[df_Tests['Count Sample Tests']]
#Kenya_x['Daily Tests']=df_Tests['Count Sample Tests']

In [180]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
a=[df_Tests['Count Sample Tests']]
#Kenya_x['Daily Tests']=df_Tests['Count Sample Tests']
a

In [181]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.head()

In [182]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
# reset_index() returns a new frame; without the assignment the result is thrown away
Kenya_x=Kenya_x.reset_index()
Kenya_x.head()

In [183]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.reset_index(inplace=True)
Kenya_x.head()

In [184]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x=Kenya_x.reset_index()
# rename() returns a new frame; without the assignment the column keeps its old name
Kenya_x=Kenya_x.rename(columns={'Day':'Test Days'})
Kenya_x.head()

In [185]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.reset_index(inplace=True)
Kenya_x=Kenya_x.rename(columns={'Day':'Test Days'})
Kenya_x.head()

In [186]:
Kenya_x=Kenyan_Confirmed.set_index('Day')
Kenya_x=Kenya_x.drop(['01/04/2020','06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020','25/04/2020','26/04/2020','27/04/2020','28/04/2020'])
Kenya_x.reset_index(inplace=True)
Kenya_x=Kenya_x.rename(columns={'Day':'Test Days'})
Kenya_x=pd.merge(df_Tests,Kenya_x,on='Test Days')
Kenya_x.head()

In [187]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(15,5))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# plt.xticks acts on the current axes, which is the twin whose x-axis is hidden;
# the visible date labels belong to `ax`, so they are rotated there.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("April-Kenya Daily Cases")
plt.show()

In [188]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# plt.xticks acts on the current axes, which is the twin whose x-axis is hidden;
# the visible date labels belong to `ax`, so they are rotated there.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [189]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# Axes objects have no xticks() method (that is a pyplot function); tick_params
# rotates the date labels drawn by `ax`.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [190]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# set_xticks() expects tick positions, not a rotation; tick_params rotates the
# date labels drawn by `ax`.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [191]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# Axes objects have no xticks() method, and the twin's x-axis is hidden anyway;
# the visible date labels belong to `ax`, so they are rotated there.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [192]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# plt.xticks acts on the current axes, which is the twin whose x-axis is hidden;
# the visible date labels belong to `ax`, so they are rotated there.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [193]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# pyplot has no set_xticklabels() function; tick_params rotates the date labels
# drawn by `ax`.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [194]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# set_xticklabels() requires the label texts as its first argument; tick_params
# rotates the existing date labels without having to restate them.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [195]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# tick_params rotates the date labels in place; re-applying get_xticklabels()
# through set_xticklabels() freezes whatever texts exist before the figure is drawn.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [196]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# `x` is not defined in this cell; tick_params rotates the date labels of `ax` directly.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [197]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# tick_params rotates the date labels in place; re-applying get_xticklabels()
# through set_xticklabels() freezes whatever texts exist before the figure is drawn.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [198]:
# Daily confirmed cases (left axis) against samples tested (right axis).
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(y='Daily_Confirmed_cases',x='Test Days',data=Kenya_x,ax=ax,marker='o',color='red')
sns.lineplot(y='Count Sample Tests',x='Test Days',data=Kenya_x,ax=ax2,marker='x')
# Rotate the date labels so neighbouring days do not overlap; this is applied to `ax`
# because the twin created by twinx() keeps its own x-axis hidden.
ax.tick_params(axis='x',labelrotation=90)
ax.set_ylim([0,30])
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [199]:
#Setting up a working directory
import os
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"

In [200]:
#Importing the necessary libraries to carry out the EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [201]:
#URLs of the raw global time series for recovered and confirmed covid19 cases
#The files are updated daily by the data source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
recovered_csv=(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_recovered_global.csv"
)
confirmed_csv=(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_confirmed_global.csv"
)

In [202]:
#Reading in the data to dataframes
recovered_timeseries_df=pd.read_csv(recovered_csv)
confirmed_timeseries_df=pd.read_csv(confirmed_csv)

In [203]:
#Inspecting the data sets
recovered_timeseries_df.head()

In [204]:
confirmed_timeseries_df.head()

In [205]:
#Confirming that the column names in the two datasets are the same i.e the data structures are the same
#Index.equals gives one True/False answer and, unlike ==, does not raise when the column counts differ
confirmed_timeseries_df.columns.equals(recovered_timeseries_df.columns)

In [206]:
#Since the columns in the df's are the same we can come up with a function to massage the data
#we shall drop columns Province/State,Lat,Long as we don't require it in our analysis
def _melt_timeseries(wide_df,value_name):
    """Reshape one wide time-series frame (one column per date) into long form.

    The result has the columns 'Country/Region' and `value_name` and is indexed
    by the date of each observation, formatted as a 'dd/mm/YYYY' string.
    """
    long_df=(
        wide_df
        .melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='Date',value_name=value_name)
        .drop(columns=['Province/State','Lat','Long'])
    )
    day_labels=pd.to_datetime(long_df.pop('Date')).dt.strftime('%d/%m/%Y')
    #The index is deliberately left unnamed: later cells look up the columns that
    #reset_index creates for an unnamed index ('level_0' / 'index').
    long_df.index=pd.Index(day_labels.to_numpy(),name=None)
    return long_df


def reshape_data(recovered_timeseries_df,confirmed_timeseries_df):
    """Convert the raw recovered and confirmed frames to long, date-indexed frames.

    Parameters
    ----------
    recovered_timeseries_df, confirmed_timeseries_df : DataFrame
        Wide frames with the columns 'Province/State', 'Country/Region', 'Lat',
        'Long' followed by one column per date.

    Returns
    -------
    tuple
        (confirmed, recovered) - note the order is the reverse of the arguments.
        Each frame holds 'Country/Region' plus 'Confirmed_cases' or
        'Recovered_cases' and is indexed by 'dd/mm/YYYY' date strings.
    """
    confirmed=_melt_timeseries(confirmed_timeseries_df,'Confirmed_cases')
    recovered=_melt_timeseries(recovered_timeseries_df,'Recovered_cases')
    return (confirmed,recovered)

In [207]:
#Wrangling the raw data and unpacking the massaged data to new dataframes
Confirmed_data,Recovered_data=reshape_data(recovered_timeseries_df,confirmed_timeseries_df)

In [208]:
#Inspecting the unpacked data
Confirmed_data.head()

In [209]:
Recovered_data.head()

In [210]:
Confirmed_data.info()

In [211]:
Recovered_data.info()

In [212]:
#Filtering the countries with a high number of confirmed cases registered for the month of April 2020
#Province rows are first summed into one total per day and country, so a country reported by
#province is ranked on its national total and not on its largest single province
High_Cases_Confirmed=(
    Confirmed_data.loc['01/04/2020':'30/04/2020']
    .set_index('Country/Region',append=True)['Confirmed_cases']
    .groupby(level=[0,1]).sum()
    .groupby(level='Country/Region').max()
    .sort_values(ascending=False)
    .head(5)
    .to_frame()
)

In [213]:
High_Cases_Confirmed.index

In [214]:
#Create list of the countries with the highest no of confirmed cases as at end of April
Countries_List=High_Cases_Confirmed.index.to_list()

In [215]:
Countries_List

In [216]:
#Filtering the confirmed cases data with Countries list for the month of April 2020 & summing over Confirmed_cases for any cases  of duplicated Country/Region per day
def April_data(data,count_column,countries=None):
    """Return the April 2020 daily totals of `count_column` for the selected countries.

    Parameters
    ----------
    data : DataFrame
        Long frame indexed by 'dd/mm/YYYY' strings in chronological row order,
        with a 'Country/Region' column and the column named by `count_column`.
    count_column : str
        Column to sum over rows sharing the same day and country.
    countries : list, optional
        Countries to keep; defaults to the notebook-level Countries_List.

    Returns
    -------
    DataFrame
        One row per (day, country), indexed by 'Date', with the columns
        'Country/Region' and `count_column`.
    """
    if countries is None:
        countries=Countries_List
    selected=data[data["Country/Region"].isin(countries)].loc['01/04/2020':'30/04/2020']
    #Name the date level explicitly so the result does not depend on whether the
    #incoming index is named (an unnamed level would come back as 'level_0').
    day_labels=selected.index.rename('Date')
    daily_totals=selected.groupby([day_labels,'Country/Region']).agg({count_column:'sum'})
    return daily_totals.reset_index(level='Country/Region')

In [217]:
Highest_Cases_data=April_data(Confirmed_data,'Confirmed_cases')

In [218]:
Highest_Cases_data.head()

In [219]:
Highest_Recovery_data=April_data(Recovered_data,'Recovered_cases')

In [220]:
#April confirmed cases, one coloured line per country in Countries_List
Color=['Red','Blue','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(14,9))
#zip pairs each country with its colour, replacing the manual counter
for country,line_color in zip(Countries_List,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_color,label=country)
ax.legend(loc='best')
plt.xticks(rotation='vertical')
plt.title('Most Affected Countries ( Confirmed April Cases)')
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()
    

In [221]:
#Filtering US cases
US_cases=Highest_Cases_data[Highest_Cases_data['Country/Region']=='US']

In [222]:
#US April cases plot
fig,ax=plt.subplots(figsize=(15,5))
ax.plot(US_cases.index,US_cases['Confirmed_cases'],color='blue',marker='s')
plt.xticks(rotation='vertical')
ax.set_ylim([0,1200000])
plt.title('US Confirmed April Cases')
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()

In [223]:
#Subsetting the Countries_List data to exclude US
#Dropping 'US' by name keeps the list correct even when US is not the first entry
Countries_List_2=[country for country in Countries_List if country!='US']

In [224]:
Countries_List_2

In [225]:
#Plot
#April confirmed cases for the countries in Countries_List_2, one coloured line each
Color=['Red','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(15,5))
#zip pairs each country with its colour, replacing the manual counter
for country,line_color in zip(Countries_List_2,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_color,label=country)
ax.legend(loc='best')
plt.xticks(rotation='vertical')
plt.title('Most Affected Countries Except US( Confirmed April Cases)')
ax.set_ylim([0,250000])
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()

In [226]:
#Summing for values with duplicated region/Country
Confirmed_data_April=Confirmed_data.groupby([Confirmed_data.index,'Country/Region']).agg({'Confirmed_cases': 'sum'})

In [227]:
#Keep only the April 2020 rows. The grouped index is sorted as text, so a label slice from
#'01/04/2020' to '30/04/2020' would also pick up days of other months (e.g. '02/03/2020').
Confirmed_data_April=Confirmed_data_April[
    Confirmed_data_April.index.get_level_values(0).str.endswith('/04/2020')
]

In [228]:
Confirmed_Country_Max = Confirmed_data_April.groupby(['Country/Region']).max()

In [229]:
Confirmed_Country_Max.reset_index(inplace=True)

In [230]:
Confirmed_Country_Max.head()

In [231]:
#Summing for values with duplicated region/Country
Recovered_data_April = Recovered_data.groupby([Recovered_data.index,'Country/Region']).agg({'Recovered_cases': 'sum'})

In [232]:
#Keep only the April 2020 rows. The grouped index is sorted as text, so a label slice from
#'01/04/2020' to '30/04/2020' would also pick up days of other months (e.g. '02/03/2020').
Recovered_data_April=Recovered_data_April[
    Recovered_data_April.index.get_level_values(0).str.endswith('/04/2020')
]

In [233]:
#Getting the maximum recovery cases
Recovered_Country_Max = Recovered_data_April.groupby(['Country/Region']).max()

In [234]:
Recovered_Country_Max.reset_index(inplace=True)

In [235]:
Recovered_Country_Max.head()

In [236]:
#Work on a copy so that columns added to this frame do not also appear on Confirmed_Country_Max
Confirmed_Recovered_data=Confirmed_Country_Max.copy()

In [237]:
#Adding a Recovered_cases column to create a new dataframe
#Values are looked up by country name; a plain column assignment pairs the rows by position and
#attaches the wrong figures whenever the two tables do not list the same countries in the same order
Confirmed_Recovered_data['Recovered_cases']=Confirmed_Recovered_data['Country/Region'].map(
    Recovered_Country_Max.set_index('Country/Region')['Recovered_cases']
)

In [238]:
Confirmed_Recovered_data.head()

In [239]:
#Getting the recovery rates per country column

In [240]:
Confirmed_Recovered_data['Recovery rate']=Confirmed_Recovered_data['Recovered_cases']/Confirmed_Recovered_data['Confirmed_cases']

In [241]:
Confirmed_Recovered_data.head()

In [242]:
Countries_Most_Affected=Confirmed_Recovered_data.groupby("Country/Region").max().sort_values(by="Confirmed_cases",ascending=False).head(8)

In [243]:
Countries_Most_Affected.reset_index(inplace=True)

In [244]:
Countries_Most_Affected.head()

In [245]:
#Recovery rate of the most affected countries, bars sorted from lowest to highest rate.
#The order is taken from the data so that every country in the table gets a bar.
#NOTE(review): the earlier fixed country list looks like an ascending-rate ordering - confirm.
sns.barplot(
    x='Country/Region',
    y='Recovery rate',
    data=Countries_Most_Affected,
    order=Countries_Most_Affected.sort_values('Recovery rate')['Country/Region'].to_list(),
)
plt.xticks(rotation='vertical')
plt.title("Recovery Rates for Countries Most Affected")
plt.show()

In [246]:
#Subsetting the Kenyan data
Kenyan_Confirmed=Confirmed_data[Confirmed_data['Country/Region']=='Kenya']

In [247]:
Kenyan_Confirmed=Kenyan_Confirmed['01/04/2020':'30/04/2020']

In [248]:
Kenyan_Confirmed.head()

In [249]:
Kenyan_Confirmed.reset_index(inplace=True)

In [250]:
Kenyan_Confirmed.head()

In [251]:
#Name the date column 'Day'. reset_index calls it 'index' when the index was unnamed
#and 'Date' when the index carried that name, so both spellings are mapped.
Kenyan_Confirmed.rename(columns={'index':'Day','Date':'Day'},inplace=True)

In [252]:
Kenyan_Confirmed.head()

In [253]:
#Plot of confirmed cases over_time

In [254]:
#Cumulative confirmed cases in Kenya per day, drawn with square markers
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(data=Kenyan_Confirmed,x='Day',y='Confirmed_cases',marker='s',ax=ax)
plt.xticks(rotation="vertical")
#y axis is fixed to 0-450 cases
ax.set_ylim(0,450)
sns.set_style("darkgrid")
ax.set_title("Total Confirmed Cases")
plt.show()

In [255]:
Kenyan_Confirmed['Daily_Confirmed_cases']=Kenyan_Confirmed['Confirmed_cases'].diff()

In [256]:
Kenyan_Confirmed.head()

In [257]:
#Dropping the first day
Kenyan_Daily_Confirmed=Kenyan_Confirmed.dropna(axis=0)

In [258]:
Kenyan_Daily_Confirmed.head()

In [259]:
#New confirmed cases in Kenya per day, drawn with round markers
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(data=Kenyan_Daily_Confirmed,x='Day',y='Daily_Confirmed_cases',marker='o',ax=ax)
plt.xticks(rotation="vertical")
#y axis is fixed to 0-30 daily cases
ax.set_ylim(0,30)
sns.set_style("darkgrid")
ax.set_title("April-Kenya Daily Cases")
plt.show()

In [260]:
#Kenya testing data scraped from daily press briefings by the Ministry of Health Kenya

In [261]:
Dates=Kenyan_Confirmed['Day'].to_list()

In [262]:
#Converting the dates to a list
#It is worth noting that some days had missing information of test data thus we shall drop those days

In [263]:
#Dropping days with missing test data

In [264]:
Dates=Dates[1:]

In [265]:
#Days for which no test count is available
Test_Missing=[
    '06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020',
    '25/04/2020','26/04/2020','27/04/2020','28/04/2020',
]
#Every other day of Dates, kept in the order it appears in Dates
Test_Dates=list(filter(lambda day: day not in Test_Missing,Dates))

In [266]:
Kenyan_tested=[662,362,372,530,696,308,504,491,766,674,694,803,1115,1330,545,707,668,946,508,777]

In [267]:
d={'Test Days':Test_Dates,'Count Sample Tests':Kenyan_tested}

In [268]:
df_Tests=pd.DataFrame(d)

In [269]:
df_Tests

In [270]:
#Visualizing the same

In [271]:
#Number of samples tested on each day that has test data
fig,ax=plt.subplots(figsize=(12,5))
sns.barplot(data=df_Tests,x='Test Days',y='Count Sample Tests',color='b',ax=ax)
plt.xticks(rotation="vertical")
sns.set_style("darkgrid")
ax.set_title("Daily Test Cases")
plt.show()

In [272]:
#Attach the confirmed-case columns to every day that has a test count.
#The inner merge keeps only the days present in df_Tests, so the days without test data
#need no separate hand-written drop list (which also raised if a listed day was absent).
Kenya_x=pd.merge(
    df_Tests,
    Kenyan_Confirmed.rename(columns={'Day':'Test Days'}),
    on='Test Days',
)
Kenya_x.head()

In [273]:
#Daily confirmed cases (left axis, red circles) against samples tested (right axis, crosses)
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(data=Kenya_x,x='Test Days',y='Daily_Confirmed_cases',color='red',marker='o',ax=ax)
sns.lineplot(data=Kenya_x,x='Test Days',y='Count Sample Tests',marker='x',ax=ax2)
#Left axis is fixed to 0-30 daily cases
ax.set_ylim(0,30)
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [274]:
#Setting up a working directory
import os
#NOTE(review): absolute, user-specific Windows path - this cell fails on any machine where the folder does not exist
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
#%notebook is an IPython line magic; it writes the session's input history to the given .ipynb file
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"

In [275]:
#Importing the necessary libraries to carry out the EDA
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [276]:
#URLs of the raw global time series for recovered and confirmed covid19 cases
#The files are updated daily by the data source: https://github.com/CSSEGISandData/COVID-19/tree/master/csse_covid_19_data/csse_covid_19_time_series
recovered_csv=(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_recovered_global.csv"
)
confirmed_csv=(
    "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/"
    "csse_covid_19_data/csse_covid_19_time_series/"
    "time_series_covid19_confirmed_global.csv"
)

In [277]:
#Reading in the data to dataframes
recovered_timeseries_df=pd.read_csv(recovered_csv)
confirmed_timeseries_df=pd.read_csv(confirmed_csv)

In [278]:
#Inspecting the data sets
recovered_timeseries_df.head()

In [279]:
confirmed_timeseries_df.head()

In [280]:
#Confirming that the column names in the two datasets are the same i.e the data structures are the same
#Index.equals gives one True/False answer and, unlike ==, does not raise when the column counts differ
confirmed_timeseries_df.columns.equals(recovered_timeseries_df.columns)

In [281]:
#Since the columns in the df's are the same we can come up with a function to massage the data
#we shall drop columns Province/State,Lat,Long as we don't require it in our analysis
def _melt_timeseries(wide_df,value_name):
    """Reshape one wide time-series frame (one column per date) into long form.

    The result has the columns 'Country/Region' and `value_name` and is indexed
    by the date of each observation, formatted as a 'dd/mm/YYYY' string.
    """
    long_df=(
        wide_df
        .melt(id_vars=['Province/State','Country/Region','Lat','Long'],var_name='Date',value_name=value_name)
        .drop(columns=['Province/State','Lat','Long'])
    )
    day_labels=pd.to_datetime(long_df.pop('Date')).dt.strftime('%d/%m/%Y')
    #The index is deliberately left unnamed: later cells look up the columns that
    #reset_index creates for an unnamed index ('level_0' / 'index').
    long_df.index=pd.Index(day_labels.to_numpy(),name=None)
    return long_df


def reshape_data(recovered_timeseries_df,confirmed_timeseries_df):
    """Convert the raw recovered and confirmed frames to long, date-indexed frames.

    Parameters
    ----------
    recovered_timeseries_df, confirmed_timeseries_df : DataFrame
        Wide frames with the columns 'Province/State', 'Country/Region', 'Lat',
        'Long' followed by one column per date.

    Returns
    -------
    tuple
        (confirmed, recovered) - note the order is the reverse of the arguments.
        Each frame holds 'Country/Region' plus 'Confirmed_cases' or
        'Recovered_cases' and is indexed by 'dd/mm/YYYY' date strings.
    """
    confirmed=_melt_timeseries(confirmed_timeseries_df,'Confirmed_cases')
    recovered=_melt_timeseries(recovered_timeseries_df,'Recovered_cases')
    return (confirmed,recovered)

In [282]:
#Wrangling the raw data and unpacking the massaged data to new dataframes
Confirmed_data,Recovered_data=reshape_data(recovered_timeseries_df,confirmed_timeseries_df)

In [283]:
#Inspecting the unpacked data
Confirmed_data.head()

In [284]:
Recovered_data.head()

In [285]:
Confirmed_data.info()

In [286]:
Recovered_data.info()

In [287]:
#Filtering the countries with a high number of confirmed cases registered for the month of April 2020
#Province rows are first summed into one total per day and country, so a country reported by
#province is ranked on its national total and not on its largest single province
High_Cases_Confirmed=(
    Confirmed_data.loc['01/04/2020':'30/04/2020']
    .set_index('Country/Region',append=True)['Confirmed_cases']
    .groupby(level=[0,1]).sum()
    .groupby(level='Country/Region').max()
    .sort_values(ascending=False)
    .head(5)
    .to_frame()
)

In [288]:
High_Cases_Confirmed.index

In [289]:
#Create list of the countries with the highest no of confirmed cases as at end of April
Countries_List=High_Cases_Confirmed.index.to_list()

In [290]:
Countries_List

In [291]:
#Filtering the confirmed cases data with Countries list for the month of April 2020 & summing over Confirmed_cases for any cases  of duplicated Country/Region per day
def April_data(data,count_column,countries=None):
    """Return the April 2020 daily totals of `count_column` for the selected countries.

    Parameters
    ----------
    data : DataFrame
        Long frame indexed by 'dd/mm/YYYY' strings in chronological row order,
        with a 'Country/Region' column and the column named by `count_column`.
    count_column : str
        Column to sum over rows sharing the same day and country.
    countries : list, optional
        Countries to keep; defaults to the notebook-level Countries_List.

    Returns
    -------
    DataFrame
        One row per (day, country), indexed by 'Date', with the columns
        'Country/Region' and `count_column`.
    """
    if countries is None:
        countries=Countries_List
    selected=data[data["Country/Region"].isin(countries)].loc['01/04/2020':'30/04/2020']
    #Name the date level explicitly so the result does not depend on whether the
    #incoming index is named (an unnamed level would come back as 'level_0').
    day_labels=selected.index.rename('Date')
    daily_totals=selected.groupby([day_labels,'Country/Region']).agg({count_column:'sum'})
    return daily_totals.reset_index(level='Country/Region')

In [292]:
Highest_Cases_data=April_data(Confirmed_data,'Confirmed_cases')

In [293]:
Highest_Cases_data.head()

In [294]:
Highest_Recovery_data=April_data(Recovered_data,'Recovered_cases')

In [295]:
#April confirmed cases, one coloured line per country in Countries_List
Color=['Red','Blue','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(14,9))
#zip pairs each country with its colour, replacing the manual counter
for country,line_color in zip(Countries_List,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_color,label=country)
ax.legend(loc='best')
plt.xticks(rotation='vertical')
plt.title('Most Affected Countries ( Confirmed April Cases)')
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()
    

In [296]:
#Filtering US cases
US_cases=Highest_Cases_data[Highest_Cases_data['Country/Region']=='US']

In [297]:
#US April cases plot
fig,ax=plt.subplots(figsize=(15,5))
ax.plot(US_cases.index,US_cases['Confirmed_cases'],color='blue',marker='s')
plt.xticks(rotation='vertical')
ax.set_ylim([0,1200000])
plt.title('US Confirmed April Cases')
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()

In [298]:
#Subsetting the Countries_List data to exclude US
#Dropping 'US' by name keeps the list correct even when US is not the first entry
Countries_List_2=[country for country in Countries_List if country!='US']

In [299]:
Countries_List_2

In [300]:
#Plot
#April confirmed cases for the countries in Countries_List_2, one coloured line each
Color=['Red','Green','Black','Purple']
fig,ax=plt.subplots(figsize=(15,5))
#zip pairs each country with its colour, replacing the manual counter
for country,line_color in zip(Countries_List_2,Color):
    country_rows=Highest_Cases_data[Highest_Cases_data['Country/Region']==country]
    ax.plot(country_rows.index,country_rows['Confirmed_cases'],color=line_color,label=country)
ax.legend(loc='best')
plt.xticks(rotation='vertical')
plt.title('Most Affected Countries Except US( Confirmed April Cases)')
ax.set_ylim([0,250000])
plt.xlabel("Date")
plt.ylabel("Confirmed Cases")
#plt.show must be called; the bare name only echoes the function object
plt.show()

In [301]:
#Summing for values with duplicated region/Country
Confirmed_data_April=Confirmed_data.groupby([Confirmed_data.index,'Country/Region']).agg({'Confirmed_cases': 'sum'})

In [302]:
#Keep only the April 2020 rows. The grouped index is sorted as text, so a label slice from
#'01/04/2020' to '30/04/2020' would also pick up days of other months (e.g. '02/03/2020').
Confirmed_data_April=Confirmed_data_April[
    Confirmed_data_April.index.get_level_values(0).str.endswith('/04/2020')
]

In [303]:
Confirmed_Country_Max = Confirmed_data_April.groupby(['Country/Region']).max()

In [304]:
Confirmed_Country_Max.reset_index(inplace=True)

In [305]:
Confirmed_Country_Max.head()

In [306]:
#Summing for values with duplicated region/Country
Recovered_data_April = Recovered_data.groupby([Recovered_data.index,'Country/Region']).agg({'Recovered_cases': 'sum'})

In [307]:
#Keep only the April 2020 rows. The grouped index is sorted as text, so a label slice from
#'01/04/2020' to '30/04/2020' would also pick up days of other months (e.g. '02/03/2020').
Recovered_data_April=Recovered_data_April[
    Recovered_data_April.index.get_level_values(0).str.endswith('/04/2020')
]

In [308]:
#Getting the maximum recovery cases
Recovered_Country_Max = Recovered_data_April.groupby(['Country/Region']).max()

In [309]:
Recovered_Country_Max.reset_index(inplace=True)

In [310]:
Recovered_Country_Max.head()

In [311]:
#Work on a copy so that columns added to this frame do not also appear on Confirmed_Country_Max
Confirmed_Recovered_data=Confirmed_Country_Max.copy()

In [312]:
#Adding a Recovered_cases column to create a new dataframe
#Values are looked up by country name; a plain column assignment pairs the rows by position and
#attaches the wrong figures whenever the two tables do not list the same countries in the same order
Confirmed_Recovered_data['Recovered_cases']=Confirmed_Recovered_data['Country/Region'].map(
    Recovered_Country_Max.set_index('Country/Region')['Recovered_cases']
)

In [313]:
Confirmed_Recovered_data.head()

In [314]:
#Getting the recovery rates per country column

In [315]:
Confirmed_Recovered_data['Recovery rate']=Confirmed_Recovered_data['Recovered_cases']/Confirmed_Recovered_data['Confirmed_cases']

In [316]:
Confirmed_Recovered_data.head()

In [317]:
Countries_Most_Affected=Confirmed_Recovered_data.groupby("Country/Region").max().sort_values(by="Confirmed_cases",ascending=False).head(8)

In [318]:
Countries_Most_Affected.reset_index(inplace=True)

In [319]:
Countries_Most_Affected.head()

In [320]:
#Recovery rate of the most affected countries, bars sorted from lowest to highest rate.
#The order is taken from the data so that every country in the table gets a bar.
#NOTE(review): the earlier fixed country list looks like an ascending-rate ordering - confirm.
sns.barplot(
    x='Country/Region',
    y='Recovery rate',
    data=Countries_Most_Affected,
    order=Countries_Most_Affected.sort_values('Recovery rate')['Country/Region'].to_list(),
)
plt.xticks(rotation='vertical')
plt.title("Recovery Rates for Countries Most Affected")
plt.show()

In [321]:
#Subsetting the Kenyan data
Kenyan_Confirmed=Confirmed_data[Confirmed_data['Country/Region']=='Kenya']

In [322]:
Kenyan_Confirmed=Kenyan_Confirmed['01/04/2020':'30/04/2020']

In [323]:
Kenyan_Confirmed.head()

In [324]:
Kenyan_Confirmed.reset_index(inplace=True)

In [325]:
Kenyan_Confirmed.head()

In [326]:
#Name the date column 'Day'. reset_index calls it 'index' when the index was unnamed
#and 'Date' when the index carried that name, so both spellings are mapped.
Kenyan_Confirmed.rename(columns={'index':'Day','Date':'Day'},inplace=True)

In [327]:
Kenyan_Confirmed.head()

In [328]:
#Plot of confirmed cases over_time

In [329]:
#Cumulative confirmed cases in Kenya per day, drawn with square markers
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(data=Kenyan_Confirmed,x='Day',y='Confirmed_cases',marker='s',ax=ax)
plt.xticks(rotation="vertical")
#y axis is fixed to 0-450 cases
ax.set_ylim(0,450)
sns.set_style("darkgrid")
ax.set_title("Total Confirmed Cases")
plt.show()

In [330]:
Kenyan_Confirmed['Daily_Confirmed_cases']=Kenyan_Confirmed['Confirmed_cases'].diff()

In [331]:
Kenyan_Confirmed.head()

In [332]:
#Dropping the first day
Kenyan_Daily_Confirmed=Kenyan_Confirmed.dropna(axis=0)

In [333]:
Kenyan_Daily_Confirmed.head()

In [334]:
#New confirmed cases in Kenya per day, drawn with round markers
fig,ax=plt.subplots(figsize=(15,5))
sns.lineplot(data=Kenyan_Daily_Confirmed,x='Day',y='Daily_Confirmed_cases',marker='o',ax=ax)
plt.xticks(rotation="vertical")
#y axis is fixed to 0-30 daily cases
ax.set_ylim(0,30)
sns.set_style("darkgrid")
ax.set_title("April-Kenya Daily Cases")
plt.show()

In [335]:
#Kenya testing data scraped from daily press briefings by the Ministry of Health Kenya

In [336]:
Dates=Kenyan_Confirmed['Day'].to_list()

In [337]:
#Converting the dates to a list
#It is worth noting that some days had missing information of test data thus we shall drop those days

In [338]:
#Dropping days with missing test data

In [339]:
Dates=Dates[1:]

In [340]:
#Days for which no test count is available
Test_Missing=[
    '06/04/2020','08/04/2020','16/04/2020','17/04/2020','20/04/2020',
    '25/04/2020','26/04/2020','27/04/2020','28/04/2020',
]
#Every other day of Dates, kept in the order it appears in Dates
Test_Dates=list(filter(lambda day: day not in Test_Missing,Dates))

In [341]:
Kenyan_tested=[662,362,372,530,696,308,504,491,766,674,694,803,1115,1330,545,707,668,946,508,777]

In [342]:
d={'Test Days':Test_Dates,'Count Sample Tests':Kenyan_tested}

In [343]:
df_Tests=pd.DataFrame(d)

In [344]:
df_Tests

In [345]:
#Visualizing the same

In [346]:
#Number of samples tested on each day that has test data
fig,ax=plt.subplots(figsize=(12,5))
sns.barplot(data=df_Tests,x='Test Days',y='Count Sample Tests',color='b',ax=ax)
plt.xticks(rotation="vertical")
sns.set_style("darkgrid")
ax.set_title("Daily Test Cases")
plt.show()

In [347]:
#Attach the confirmed-case columns to every day that has a test count.
#The inner merge keeps only the days present in df_Tests, so the days without test data
#need no separate hand-written drop list (which also raised if a listed day was absent).
Kenya_x=pd.merge(
    df_Tests,
    Kenyan_Confirmed.rename(columns={'Day':'Test Days'}),
    on='Test Days',
)
Kenya_x.head()

In [348]:
#Daily confirmed cases (left axis, red circles) against samples tested (right axis, crosses)
fig,ax=plt.subplots(figsize=(12,6))
ax2=ax.twinx()
sns.lineplot(data=Kenya_x,x='Test Days',y='Daily_Confirmed_cases',color='red',marker='o',ax=ax)
sns.lineplot(data=Kenya_x,x='Test Days',y='Count Sample Tests',marker='x',ax=ax2)
#Left axis is fixed to 0-30 daily cases
ax.set_ylim(0,30)
sns.set_style("darkgrid")
plt.title("Kenya Daily Cases & Test Conducted")
plt.show()

In [349]:
#Setting up a working directory
import os
#NOTE(review): absolute, user-specific Windows path - this cell fails on any machine where the folder does not exist
os.chdir("C:/Users/Lenovo/Desktop/Data Insight/Analysis")
#Saving the notebook
#%notebook is an IPython line magic; it writes the session's input history to the given .ipynb file
%notebook "C:/Users/Lenovo/Desktop/Data Insight/Analysis/Covid 19 Analysis.ipynb"