<a href="https://colab.research.google.com/github/N-Vasu-Reddy/Exploring-COVID19-Data/blob/main/project_Covid_19_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import folium
from ipywidgets import interact
import warnings
warnings.filterwarnings('ignore')

#Loading datasets

In [None]:
# Read the three input CSVs (cases table, vaccination data, country locations)
# from the Colab /content directory.
# NOTE(review): encoding='latin1' never fails to decode, but it would garble
# non-ASCII text (and any UTF-8 byte-order mark in the header) if the files are
# actually UTF-8 — confirm the source encoding.
cases_df = pd.read_csv('/content/WHO-COVID-19-global-table-data.csv',encoding='latin1')
vacc_df = pd.read_csv('/content/vaccination-data.csv',encoding='latin1')
loc_df = pd.read_csv('/content/location.csv',encoding='latin1')

###Initial Exploration

In [None]:
cases_df.head()

In [None]:
vacc_df.head()

In [None]:
loc_df.head()

In [None]:
cases_df.info()

In [None]:
vacc_df.info()

#Data Cleaning

In [None]:
# Remove the per-capita 7-day deaths column from the cases table (in place).
cases_df.drop(
    columns=["Deaths - newly reported in last 7 days per 100000 population"],
    inplace=True,
)

In [None]:
# Remove the WHO region column from the cases table (in place).
cases_df.drop(columns=["WHO Region"], inplace=True)

# Remove region, provenance and vaccine-product metadata from the vaccination
# table (in place).
vacc_df.drop(
    columns=[
        "WHO_REGION",
        "DATA_SOURCE",
        "DATE_UPDATED",
        "VACCINES_USED",
        "NUMBER_VACCINES_TYPES_USED",
        "FIRST_VACCINE_DATE",
    ],
    inplace=True,
)

In [None]:
# Keep only the vaccination rows that report a TOTAL_VACCINATIONS value.
vacc_df = vacc_df[vacc_df['TOTAL_VACCINATIONS'].notna()]

In [None]:
# Short names for the case and death columns of the cases table.
# NOTE: the "*_per10000" short names map from source columns that say
# "per 100000 population"; the short names are kept as they are because later
# cells refer to them.
cases_df.rename(
    columns={
        # cases
        "Cases - cumulative total": "cases_tot",
        "Cases - cumulative total per 100000 population": "cases_tot_per10000",
        "Cases - newly reported in last 7 days": "new_cases_7d",
        "Cases - newly reported in last 7 days per 100000 population": "new_cases_7d_per10000",
        "Cases - newly reported in last 24 hours": "new_cases_24h",
        # deaths
        "Deaths - cumulative total": "deaths_tot",
        "Deaths - cumulative total per 100000 population": "deaths_tot_per10000",
        "Deaths - newly reported in last 7 days": "new_deaths_7d",
        "Deaths - newly reported in last 24 hours": "new_deaths_24h",
    },
    inplace=True,
)

In [None]:
# Give all three tables a common "country" column; in the location table the
# original "country" column becomes "code" and "name" becomes "country".
for frame, mapping in (
    (cases_df, {"Name": "country"}),
    (vacc_df, {"COUNTRY": "country"}),
    (loc_df, {"country": "code", "name": "country"}),
):
    frame.rename(columns=mapping, inplace=True)

In [None]:
cases_df.head()

In [None]:
vacc_df.head()

##Feature Engineering

In this stage, I build a single COVID-19 dataframe (`covid_df`) from the three available dataframes (`loc_df`, `cases_df` and `vacc_df`). This requires merging the three dataframes on their shared `country` column and then deriving a few additional columns.

In [None]:
# Inner join of locations and cases on the country name; only countries
# present in both tables are kept.
covid_df = pd.merge_ordered(
    left=loc_df,
    right=cases_df,
    how='inner',
    on="country",
)

In [None]:
# Left join: keep every row of covid_df and attach vaccination columns where a
# match exists (unmatched rows get NaN in the vaccination columns).
# NOTE(review): no `on=` is given, so pandas joins on every column name the two
# frames share — presumably only "country"; verify against vacc_df.columns.
covid_df = pd.merge_ordered(covid_df,vacc_df,how='left')

In [None]:
# Force the name of the first column of covid_df to "code".
# NOTE(review): presumably needed because the earlier loc_df rename did not
# match the raw header of that column (e.g. a byte-order mark decoded as
# latin1) — confirm by inspecting loc_df.columns.
covid_df.rename(columns={covid_df.columns[0]:"code"},inplace = True)

In [None]:
covid_df.head()

In [None]:
# People with at least one dose minus people who received the last dose.
first_dose = covid_df['PERSONS_VACCINATED_1PLUS_DOSE']
last_dose = covid_df['PERSONS_LAST_DOSE']
covid_df['VACCINATION_GAP'] = first_dose.sub(last_dose)

In [None]:
# Columns in which a literal 0 is replaced by NaN (i.e. treated as missing).
cols_to_replace = [
    # cases
    'cases_tot', 'cases_tot_per10000', 'new_cases_7d', 'new_cases_7d_per10000', 'new_cases_24h',
    # deaths
    'deaths_tot', 'deaths_tot_per10000', 'new_deaths_7d', 'new_deaths_24h',
    # vaccinations
    'TOTAL_VACCINATIONS', 'PERSONS_VACCINATED_1PLUS_DOSE', 'TOTAL_BOOSTERS', 'PERSONS_BOOSTER_ADD_DOSE',
]
# Only touch the listed columns that actually exist in the merged frame.
for col in (name for name in cols_to_replace if name in covid_df.columns):
    covid_df[col] = covid_df[col].replace(0, np.nan)

covid_df.head()

In [None]:
covid_df.shape

In [None]:
covid_df.info()

In [None]:
# Write the merged frame to covid_df.csv in the working directory, without the
# row index.
covid_df.to_csv("covid_df.csv",index=False)

#EDA

##Profile Reporting

In [None]:
#!pip install ydata-profiling --q

In [None]:
import ydata_profiling

In [None]:
# Build a ydata-profiling report for the merged frame and render it inline in
# the notebook.
profile = ydata_profiling.ProfileReport(covid_df)
profile.to_notebook_iframe()

##Distribution Analysis

In [None]:
import plotly.graph_objects as go

# Donut chart comparing three global totals: cumulative cases, cumulative
# vaccinations and cumulative deaths (each summed over all rows of covid_df).
fig = go.Figure(data=[go.Pie(
    # Labels fixed: "Cummulative" -> "Cumulative".
    labels=['Cumulative sum of cases','Cumulative sum of vaccinations','Cumulative sum of deaths'],
    values=[covid_df['cases_tot'].sum(),covid_df['TOTAL_VACCINATIONS'].sum(),covid_df['deaths_tot'].sum()],
    hole=0.4,
    marker=dict(colors=['#ffc107', '#28a745','#dc3545'])
)])
fig.update_layout(
    # The chart shows cases, vaccinations and deaths, so the title and the
    # centre annotation no longer say "Cases" only.
    title_text='COVID-19 Cases, Vaccinations and Deaths',
    title_x=0.5,
    annotations=[dict(text='Totals', x=0.5, y=0.5, font_size=20, showarrow=False)]
)
fig.show()

##Correlation analysis and Heatmap

In [None]:
# Pairwise correlations between the numeric columns, leaving out the geographic
# coordinates (ignored silently if those columns are absent).
numeric_cols = (
    covid_df
    .select_dtypes(include=np.number)
    .drop(columns=['latitude', 'longitude'], errors='ignore')
)
corr_matrix = numeric_cols.corr()

# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(15,10))
heatmap_ax = sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True, fmt=".2f")
heatmap_ax.set_title('Correlation Matrix of COVID-19 Data')
plt.show()

##Univariate Analysis

In [None]:
# Side-by-side bar charts of the ten countries with the largest total cases,
# total deaths and total vaccinations.
# One entry per panel:
#   (column to rank by, seaborn palette name, palette size, title, y-axis label)
# Palette sizes are larger than the ten bars drawn, so only the leading shades
# of each palette are used.
top10_panels = [
    ('cases_tot', 'Blues_r', 20,
     'Top 10 Countries with Highest Total Cases', 'Total Cases'),
    ('deaths_tot', 'Reds_r', 30,
     'Top 10 Countries with Highest Death Cases', 'Total Death Cases'),
    # Title fixed: the column plotted is total vaccinations, not persons vaccinated.
    ('TOTAL_VACCINATIONS', 'Greens_r', 20,
     'Top 10 Countries with Highest Total Vaccinations', 'Total Vaccinations'),
]

fig,axes=plt.subplots(1,3,figsize=(20,10))
for ax, (column, palette_name, palette_size, title, ylabel) in zip(axes, top10_panels):
    # nlargest already returns the rows in descending order of `column`.
    top_10_countries = covid_df.nlargest(10, column)
    colors = sns.color_palette(palette_name, n_colors=palette_size)
    sns.barplot(x='country', y=column, data=top_10_countries, palette=colors, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Country')
    ax.set_ylabel(ylabel)
    # Rotate only the country labels; the third panel previously rotated the
    # y-axis tick labels as well because `axis='x'` was missing.
    ax.tick_params(axis='x',rotation=90)
plt.show()

In [None]:
covid_df[['TOTAL_VACCINATIONS','TOTAL_VACCINATIONS_PER100']].describe()

In [None]:
# Box plot of total vaccinations with the first two rows of the descending
# sort (the two largest values) left out — presumably to keep the scale readable.
vacc_desc = covid_df.sort_values(by='TOTAL_VACCINATIONS', ascending=False)
vacc_box_ax = sns.boxplot(data=vacc_desc.iloc[2:], x='TOTAL_VACCINATIONS')
vacc_box_ax.set_title('Boxplot of Total Vaccinations')
vacc_box_ax.set_xlabel('Total Vaccinations')
plt.show()

In [None]:
covid_df['TOTAL_VACCINATIONS_PER100'].describe()

In [None]:
# Box plot of vaccinations per 100 people with the first two rows of the
# descending sort (the two largest values) left out.
per100_desc = covid_df.sort_values(by='TOTAL_VACCINATIONS_PER100', ascending=False)
per100_box_ax = sns.boxplot(data=per100_desc.iloc[2:], x='TOTAL_VACCINATIONS_PER100')
per100_box_ax.set_title('Boxplot of Total Vaccinations Per 100')
per100_box_ax.set_xlabel('Total Vaccinations Per 100')
plt.show()

In [None]:
# Histogram (30 bins) of vaccinations per 100 people with a KDE curve overlaid.
per100_hist_ax = sns.histplot(
    data=covid_df,
    x='TOTAL_VACCINATIONS_PER100',
    bins=30,
    kde=True,
)
per100_hist_ax.set_title('Histogram of Total Vaccinations Per 100')
per100_hist_ax.set_xlabel('Total Vaccinations Per 100')
per100_hist_ax.set_ylabel('Frequency')
plt.show()

### Observations:


1.  The highest frequency occurs around 100 vaccinations per 100 people. This indicates that a significant number of countries (or regions) have achieved approximately this level of vaccination.
2.   The distribution appears to be right-skewed, with a long tail extending towards higher vaccination rates. This suggests that while many countries have moderate vaccination levels, a few countries have exceptionally high rates.
3. The density curve illustrates a bimodal distribution, indicating two distinct clusters of countries: one centered around 100 vaccinations per 100 people, and the other around 200 vaccinations per 100 people.

In [None]:
from scipy.stats import iqr

# Count the countries whose TOTAL_VACCINATIONS lies outside the Tukey fences
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
vaccinations = covid_df['TOTAL_VACCINATIONS']

# The column contains NaN (unmatched rows of the left merge, and zeros that
# were replaced by NaN), so NaN-aware statistics are required: plain
# np.quantile / iqr would return NaN and no row could ever be flagged.
Q1 = np.nanquantile(vaccinations, 0.25)
Q3 = np.nanquantile(vaccinations, 0.75)
# Stored under its own name so the imported `iqr` function is not overwritten
# (overwriting it made the cell fail when run a second time).
vacc_iqr = iqr(vaccinations, nan_policy='omit')

lower_bound = Q1 - 1.5*vacc_iqr
upper_bound = Q3 + 1.5*vacc_iqr
# Comparisons against NaN are False, so rows without a value are never outliers.
outliers = covid_df[(vaccinations < lower_bound) | (vaccinations > upper_bound)]
print(len(outliers))

##Bivariate analysis

###TOTAL_VACCINATIONS vs TOTAL_VACCINATIONS_PER100

In [None]:
# Scatter plot: absolute vaccination count against the per-100 rate.
total_vs_rate_ax = sns.scatterplot(
    data=covid_df,
    x='TOTAL_VACCINATIONS',
    y='TOTAL_VACCINATIONS_PER100',
)
total_vs_rate_ax.set_title('Total Vaccinations vs Total Vaccinations Per 100')
total_vs_rate_ax.set_xlabel('Total Vaccinations')
total_vs_rate_ax.set_ylabel('Total Vaccinations Per 100')
plt.show()

In [None]:
# 2x2 correlation matrix between the absolute vaccination count and the
# per-100 rate (DataFrame.corr with its default method).
correlation = covid_df[['TOTAL_VACCINATIONS', 'TOTAL_VACCINATIONS_PER100']].corr()
print(correlation)

###PERSONS_LAST_DOSE_PER100 vs PERSONS_BOOSTER_ADD_DOSE_PER100

In [None]:
# Scatter plot: full-vaccination rate against booster uptake rate (both per 100).
plt.figure(figsize=(8, 6))
booster_ax = sns.scatterplot(
    x='PERSONS_LAST_DOSE_PER100',
    y='PERSONS_BOOSTER_ADD_DOSE_PER100',
    data=covid_df,
)
booster_ax.set_title('Full Vaccination vs. Booster Uptake')
booster_ax.set_xlabel('Full Vaccination Rate (per 100)')
booster_ax.set_ylabel('Booster Uptake Rate (per 100)')
plt.show()

In [None]:
# Correlation between the full-vaccination rate and the booster uptake rate —
# the two variables this section is about. The previous expression correlated
# PERSONS_VACCINATED_1PLUS_DOSE with PERSONS_LAST_DOSE instead.
covid_df['PERSONS_LAST_DOSE_PER100'].corr(covid_df['PERSONS_BOOSTER_ADD_DOSE_PER100'])

There is a high correlation between the full-vaccination (last dose) uptake rate and the booster dose uptake rate. This suggests that regions where people did not take the last dose also tended not to take the booster dose, so awareness about the COVID-19 pandemic and about vaccination needs to be raised in those regions.

In [None]:
covid_df.iloc[:,13:].head()

In [None]:
# Cumulative cases minus cumulative deaths.
# Zero death counts were replaced by NaN earlier in the notebook, so a missing
# deaths_tot is treated as 0 here; otherwise every country with reported cases
# but no deaths would end up with alive = NaN.
# NOTE(review): this also treats deaths that were missing in the source as 0 —
# confirm that is acceptable.
covid_df['alive'] = covid_df['cases_tot'] - covid_df['deaths_tot'].fillna(0)
covid_df.head()

###Choropleth Analysis

In [None]:
# World choropleth of vaccinations per 100 people; countries are located by the
# values in the ISO3 column.
choropleth_options = dict(
    locations='ISO3',
    color='TOTAL_VACCINATIONS_PER100',
    hover_name='country',
    title='Vaccination Rate per 100 People by Country',
    color_continuous_scale='matter',
)
fig = px.choropleth(covid_df, **choropleth_options)
fig.show()


In [None]:
def create_map(column):
    """Build a folium map with one circle marker per located row of ``covid_df``.

    Parameters
    ----------
    column : str
        Name of the ``covid_df`` column whose value is shown in each marker's
        tooltip. It also selects the marker colour: blue for ``"cases_tot"``,
        red for ``"deaths_tot"`` and green for any other column.

    Returns
    -------
    folium.Map
        The map with all markers added.
    """
    base_map = folium.Map(
        location=[0, 0],
        zoom_start=3,
        tiles="cartodbpositron",
        control_scale=True,
        no_wrap=True,
        max_bounds=True,
    )

    # The colour depends only on the selected column, so it is chosen once
    # instead of once per row.
    color = {"cases_tot": "blue", "deaths_tot": "red"}.get(column, "green")

    # folium raises on NaN coordinates, so rows without a position are skipped
    # rather than aborting the whole map.
    located = covid_df.dropna(subset=["latitude", "longitude"])

    for _, row in located.iterrows():
        folium.CircleMarker(
            location=[row["latitude"], row["longitude"]],
            radius=5,
            color=color,
            fill=True,
            fill_color=color,
            fill_opacity=0.6,
            tooltip=f"{column}: {row[column]}<br>Country: {row['country']}"
        ).add_to(base_map)

    return base_map

# Interactive selector: calls create_map with the chosen column and displays
# the resulting map.
interact(create_map, column=["cases_tot", "deaths_tot", "TOTAL_VACCINATIONS", "TOTAL_VACCINATIONS_PER100"])
