# Overview
Using a daily updated COVID-19 data set of US counties, we perform exploratory analysis to find the states most impacted by the pandemic.
#### Understanding structure of data 
 - Identifying Continuous and Categorical data
 - Correct the datatype for date column
 - Identify the top states with the highest number of Covid cases and deaths
 - Plot the top states with the highest death %

In [None]:
# Load Libraries
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
# Read the COVID data set (covid-us-counties.csv) from S3 into a DataFrame.
# The URL must use the full "s3://" scheme: with a single slash pandas does not
# recognise it as a remote URL and tries to open it as a local file path instead.
# NOTE: reading s3:// URLs relies on the optional `s3fs` package and on AWS
# credentials being available in the environment.
df = pd.read_csv("s3://dsba-6190-rfox12/covid-us-counties.csv")

In [None]:
# Preview the first 10 rows of the data set
df.head(n=10)

In [None]:
# Look at all the variables and their types.
# info() prints the column names, dtypes and non-null counts;
# describe() is the last expression, so its summary-statistics table is the cell output.
df.info()
df.describe()

In [None]:
# Parse the date column into pandas datetime values, then re-check the dtypes
parsed_dates = pd.to_datetime(df["date"])
df["date"] = parsed_dates
df.info()

In [None]:
# Count the missing values in each column
df.isnull().sum()
# Note that there are missing values for fips

In [None]:
# Find the 10 states with the highest number of Covid cases.
# The numeric columns are selected explicitly before summing: `date` is a
# datetime64 column by this point, and a blanket groupby `.sum()` raises
# TypeError on datetime columns in pandas >= 2.0. It also avoids adding up
# `fips`, which is an identifier code rather than a quantity.
# NOTE(review): if `cases`/`deaths` are running totals reported per day, summing
# over every date overcounts -- confirm, and if so filter to the latest date first.
(
    df.groupby('state')[['cases', 'deaths']]
    .sum()
    .sort_values(by='cases', ascending=False)
    .head(10)
)

In [None]:
# Find the 10 states with the highest number of Covid deaths.
# The numeric columns are selected explicitly before summing: `date` is a
# datetime64 column by this point, and a blanket groupby `.sum()` raises
# TypeError on datetime columns in pandas >= 2.0. It also avoids adding up
# `fips`, which is an identifier code rather than a quantity.
# NOTE(review): if `cases`/`deaths` are running totals reported per day, summing
# over every date overcounts -- confirm, and if so filter to the latest date first.
(
    df.groupby('state')[['cases', 'deaths']]
    .sum()
    .sort_values(by='deaths', ascending=False)
    .head(10)
)

In [None]:
# Scatter plot of deaths against cases, one faint marker per row of df
x_cases = df['cases']
y_deaths = df['deaths']
plt.scatter(x_cases, y_deaths, alpha=0.1)
plt.ylabel("Total Deaths")
plt.xlabel("Total Number of Cases")
plt.title("Total Number of Cases vs Total Deaths")

In [None]:
# Build one row per state with summed fips, cases and deaths, then derive the
# percentage of cases that ended in death.
# NOTE(review): the `fips` sum is kept for compatibility, but adding up
# identifier codes is not a meaningful quantity.
state_wise = df.groupby("state").agg(
    fips=("fips", "sum"),
    cases=("cases", "sum"),
    deaths=("deaths", "sum"),
)
death_ratio = state_wise["deaths"].div(state_wise["cases"])
state_wise["deaths %"] = death_ratio.mul(100)
state_wise.head(10)

In [None]:
# Rank the states two ways and draw both rankings as side-by-side bar charts.
# Despite its name, top_total_deaths is ranked by the "deaths %" column.
top_total_cases = state_wise.sort_values(by="cases", ascending=False).head(10)
top_total_deaths = state_wise.sort_values(by="deaths %", ascending=False).head(10)

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))

# Left panel: the ten states with the largest case totals
sns.barplot(x=top_total_cases["cases"], y=top_total_cases.index, ax=ax1)
ax1.set_title("Top States with most no of Covid Cases")

# Right panel: the ten states with the largest death percentage
sns.barplot(x=top_total_deaths["deaths %"], y=top_total_deaths.index, ax=ax2)
ax2.set_title("Top States with high % of Covid deaths")

plt.show()

In [None]:
# Print the name of the machine this notebook is running on
import socket
hostname = socket.gethostname()
print(hostname)

In [None]:
# IPython shell escape: run the AWS CLI to list the configuration it is currently using
!aws configure list