# COVID-19 Data Analysis and Insights
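This notebook explores the Our World in Data COVID-19 dataset (`owid-covid-data-old.csv`), focusing on cases, deaths, and vaccination progress in Kenya, the USA, and India, with world choropleth maps for global context.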

In [None]:
# Data Cleaning
#
# Builds 'cleaned_owid.csv': the rows of the raw export that belong to the
# countries of interest, with rows lacking critical values dropped and the
# daily-case, daily-death and vaccination series gap-filled per country.

import pandas as pd

# Load the original dataset
df = pd.read_csv('owid-covid-data-old.csv')

# Filter for countries of interest.
# Each entry may be either a `location` name or an `iso_code`. Matching on
# `location` alone silently returns no rows for an entry such as 'USA' when the
# file spells the country name out and only carries 'USA' as the ISO code
# (NOTE(review): this is presumably the case for the OWID export - confirm).
countries_of_interest = ['Kenya', 'USA', 'India']
is_selected = (
    df['location'].isin(countries_of_interest)
    | df['iso_code'].isin(countries_of_interest)
)

# Drop rows with missing critical values. The .copy() detaches the result from
# `df`, so the column assignments below never write to a slice of the raw frame.
filtered_df = (
    df[is_selected]
    .dropna(subset=['date', 'total_cases', 'total_deaths', 'population', 'iso_code'])
    .copy()
)

# Tell the reader when a requested country matched nothing instead of letting
# it vanish quietly from every downstream plot.
matched_labels = set(filtered_df['location']) | set(filtered_df['iso_code'])
unmatched = [name for name in countries_of_interest if name not in matched_labels]
if unmatched:
    print(f"Warning: no rows found for {unmatched}")

# Convert date to datetime and order each country's rows chronologically so
# that interpolation runs along time.
filtered_df['date'] = pd.to_datetime(filtered_df['date'])
filtered_df = filtered_df.sort_values(['location', 'date'])

# Interpolate missing numeric values *within each country*. Interpolating the
# whole frame at once would treat the last rows of one country and the first
# rows of the next as neighbours and fill one country's gaps from another's data.
numeric_columns = ['new_cases', 'new_deaths', 'total_vaccinations']
filtered_df[numeric_columns] = (
    filtered_df.groupby('location')[numeric_columns]
    .transform(lambda series: series.interpolate())
)

# Display cleaned dataset preview
print("\nCleaned dataset preview:")
print(filtered_df[['location', 'date', 'total_cases', 'total_deaths', 'total_vaccinations', 'population', 'iso_code']].head())
print("\nMissing values:")
print(filtered_df[['total_cases', 'total_deaths', 'total_vaccinations', 'population', 'iso_code']].isna().sum())

# Save to a new CSV file (read back by the later analysis cells)
filtered_df.to_csv('cleaned_owid.csv', index=False)
print("\nCleaned data saved to 'cleaned_owid.csv'")

In [None]:
# Choropleth map
#
# Draws two world maps from the raw dataset: total cases (log-scaled colour)
# and vaccination rate, using each country's most recent complete record.

import pandas as pd
import plotly.express as px
import plotly.io as pio
import numpy as np

# Set Plotly renderer for Jupyter
pio.renderers.default = 'notebook'


def show_choropleth(frame, color_column, title, color_scale, colorbar_title, fit_to_locations=False):
    """Render one world choropleth and return the figure.

    Parameters
    ----------
    frame : pandas.DataFrame
        One row per country; must contain 'iso_code', 'location' and `color_column`.
    color_column : str
        Column that drives the fill colour.
    title : str
        Figure title (centred).
    color_scale : str
        Name of a Plotly continuous colour scale.
    colorbar_title : str
        Label shown above the colour bar.
    fit_to_locations : bool, default False
        When True, zoom the map to the countries present in `frame`.
    """
    fig = px.choropleth(
        frame,
        locations="iso_code",
        color=color_column,
        hover_name="location",
        color_continuous_scale=color_scale,
        projection="natural earth",
    )
    # The colour-bar title is a layout property: px.choropleth() has a fixed
    # signature and rejects a `coloraxis_colorbar` keyword, so it is set here.
    fig.update_layout(
        title={'text': title, 'x': 0.5, 'xanchor': 'center'},
        coloraxis_colorbar=dict(title=colorbar_title),
        geo=dict(showframe=False, showcoastlines=True, coastlinecolor="Black", showland=True, landcolor="LightGrey"),
    )
    if fit_to_locations:
        fig.update_geos(fitbounds="locations", visible=True)
    fig.show()
    return fig


# Load the original dataset
df = pd.read_csv('owid-covid-data-old.csv')

# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'])

# Handle missing data first, then take the latest remaining row per country.
# Filtering on one global "latest date" and dropping incomplete rows afterwards
# would discard every country that did not report all fields on that exact day.
required_columns = ['iso_code', 'total_cases', 'total_vaccinations', 'population']
complete_rows = df.dropna(subset=required_columns)
complete_rows = complete_rows[complete_rows['total_cases'] > 0]

# Plotly's default location mode expects 3-letter ISO codes; rows with any other
# code cannot be drawn but would still stretch the colour range.
complete_rows = complete_rows[complete_rows['iso_code'].str.len() == 3]

latest_data = complete_rows.sort_values('date').groupby('iso_code').tail(1).copy()

# Calculate vaccination rate and log cases
# NOTE(review): total_vaccinations presumably counts doses rather than people,
# which is why the ratio can exceed 100 and is clipped - confirm against the
# dataset codebook before reading this as "% of people vaccinated".
latest_data['vaccination_rate'] = (latest_data['total_vaccinations'] / latest_data['population']) * 100
latest_data['vaccination_rate'] = latest_data['vaccination_rate'].clip(upper=100)
latest_data['log_total_cases'] = np.log1p(latest_data['total_cases'])

# Optional: to focus on Kenya, USA and India only, restrict `latest_data` to
# those countries here and pass fit_to_locations=True to the calls below.

# Choropleth for total cases
fig = show_choropleth(
    latest_data,
    color_column="log_total_cases",
    title="Total COVID-19 Cases by Country (Log Scale)",
    color_scale="Reds",
    colorbar_title="Log(Total Cases)",
)

# Choropleth for vaccination rates
fig = show_choropleth(
    latest_data,
    color_column="vaccination_rate",
    title="Vaccination Rates by Country",
    color_scale="Blues",
    colorbar_title="% Vaccinated",
)

# Debug data
print("\nLatest data preview:")
print(latest_data[['location', 'iso_code', 'total_cases', 'vaccination_rate']].head())
print("\nMissing values:")
print(latest_data[['iso_code', 'total_cases', 'total_vaccinations', 'population']].isna().sum())

In [None]:
# Data Loading and vaccination plots
#
# Reads the cleaned CSV and draws (1) cumulative vaccinations over time and
# (2) the latest vaccination percentage for each country of interest.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Define countries of interest (each entry may be a location name or an ISO code)
countries_of_interest = ['Kenya', 'USA', 'India']

# Load the filtered dataset
filtered_df = pd.read_csv('cleaned_owid.csv')

# Ensure date is datetime
filtered_df['date'] = pd.to_datetime(filtered_df['date'])

# Handle missing data; .copy() so the new columns below are added to an
# independent frame rather than a slice.
filtered_df = filtered_df.dropna(subset=['total_vaccinations', 'population'])
filtered_df = filtered_df[filtered_df['total_vaccinations'] > 0].copy()

# Give every row the label used in `countries_of_interest`. A row whose
# `location` is not listed but whose `iso_code` is (e.g. code 'USA') takes the
# ISO code as its label; comparing on `location` alone would leave that
# country's line and bar empty.
location_names = filtered_df['location']
if 'iso_code' in filtered_df.columns:
    matched_by_code_only = (
        filtered_df['iso_code'].isin(countries_of_interest)
        & ~location_names.isin(countries_of_interest)
    )
    filtered_df['country'] = location_names.mask(matched_by_code_only, filtered_df['iso_code'])
else:
    filtered_df['country'] = location_names

# Calculate percent vaccinated
# NOTE(review): total_vaccinations presumably counts doses, so this ratio can
# exceed 100 - confirm before presenting it as a share of people.
filtered_df['percent_vaccinated'] = (filtered_df['total_vaccinations'] / filtered_df['population']) * 100

# Line plot: Cumulative vaccinations
colors = ['blue', 'green', 'red']
fig, ax = plt.subplots(figsize=(12, 7))
for i, country in enumerate(countries_of_interest):
    # Sort so the line is drawn left-to-right regardless of file order.
    country_data = filtered_df[filtered_df['country'] == country].sort_values('date')
    # Cycle through the palette so a longer country list cannot raise IndexError.
    ax.plot(country_data['date'], country_data['total_vaccinations'],
            label=country, color=colors[i % len(colors)], linewidth=2)
ax.set_title('Cumulative Vaccinations Over Time', fontsize=14)
ax.set_xlabel('Date', fontsize=12)
ax.set_ylabel('Total Vaccinations', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, linestyle='--', alpha=0.7)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# Bar plot: Percent vaccinated (latest date per country)
latest_vax = filtered_df.sort_values('date').groupby('location').tail(1)
fig, ax = plt.subplots(figsize=(12, 7))
sns.barplot(data=latest_vax, x='country', y='percent_vaccinated',
            order=countries_of_interest, palette='Blues_d', ax=ax)
ax.set_title('Percentage of Vaccinated Population (Latest Date)', fontsize=14)
ax.set_xlabel('Country', fontsize=12)
ax.set_ylabel('% Vaccinated', fontsize=12)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# Debug data
print("\nData preview:")
print(filtered_df[['location', 'date', 'total_vaccinations', 'population', 'percent_vaccinated']].head())
print("\nMissing values:")
print(filtered_df[['total_vaccinations', 'population', 'percent_vaccinated']].isna().sum())

In [None]:
# Exploratory Data Analysis (EDA)
#
# Reads the cleaned CSV and plots vaccinations, cases, deaths, daily new cases
# and death rate for the countries of interest.

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


def plot_metric_over_time(frame, column, title, ylabel, countries, colors):
    """Draw one line per country showing `column` against date.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'country', 'date' and `column`.
    column : str
        Metric to plot on the y axis.
    title, ylabel : str
        Figure title and y-axis label.
    countries : list of str
        Labels to plot, matched against the 'country' column.
    colors : list of str
        Line colours; reused cyclically if there are more countries than colours.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    for i, country in enumerate(countries):
        # Sort so the line is drawn left-to-right regardless of file order.
        country_data = frame[frame['country'] == country].sort_values('date')
        ax.plot(country_data['date'], country_data[column],
                label=country, color=colors[i % len(colors)], linewidth=2)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.legend(fontsize=10)
    ax.grid(True, linestyle='--', alpha=0.7)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


def plot_latest_bar(latest, column, title, ylabel, countries, palette):
    """Draw one bar per country for `column`, taken from the per-country latest rows.

    Parameters
    ----------
    latest : pandas.DataFrame
        One row per country; must contain 'country' and `column`.
    column : str
        Metric shown as bar height.
    title, ylabel : str
        Figure title and y-axis label.
    countries : list of str
        Bar order, matched against the 'country' column.
    palette : str
        Seaborn palette name.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    sns.barplot(data=latest, x='country', y=column, order=countries, palette=palette, ax=ax)
    ax.set_title(title, fontsize=14)
    ax.set_xlabel('Country', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.tick_params(axis='x', rotation=45)
    fig.tight_layout()
    plt.show()


# Define countries of interest (each entry may be a location name or an ISO code)
countries_of_interest = ['Kenya', 'USA', 'India']
colors = ['blue', 'green', 'red']

# Load the filtered dataset
filtered_df = pd.read_csv('cleaned_owid.csv')

# Ensure date is datetime
filtered_df['date'] = pd.to_datetime(filtered_df['date'])

# Handle missing data; .copy() so the new columns below are added to an
# independent frame rather than a slice.
filtered_df = filtered_df.dropna(subset=['total_vaccinations', 'population', 'total_cases', 'total_deaths', 'new_cases'])
filtered_df = filtered_df[filtered_df['total_vaccinations'] > 0].copy()

# Give every row the label used in `countries_of_interest`. A row whose
# `location` is not listed but whose `iso_code` is (e.g. code 'USA') takes the
# ISO code as its label; comparing on `location` alone would leave that
# country's lines and bars empty.
location_names = filtered_df['location']
if 'iso_code' in filtered_df.columns:
    matched_by_code_only = (
        filtered_df['iso_code'].isin(countries_of_interest)
        & ~location_names.isin(countries_of_interest)
    )
    filtered_df['country'] = location_names.mask(matched_by_code_only, filtered_df['iso_code'])
else:
    filtered_df['country'] = location_names

# Calculate percent vaccinated and death rate
# NOTE(review): total_vaccinations presumably counts doses, so this ratio can
# exceed 100 - confirm before presenting it as a share of people.
filtered_df['percent_vaccinated'] = (filtered_df['total_vaccinations'] / filtered_df['population']) * 100
# Deaths per confirmed case (a fraction, not a percentage). Where there are no
# cases the rate is undefined, so it is left as NaN instead of dividing by a
# made-up denominator.
filtered_df['death_rate'] = filtered_df['total_deaths'] / filtered_df['total_cases'].where(filtered_df['total_cases'] > 0)

# Line plot: Cumulative vaccinations
plot_metric_over_time(filtered_df, 'total_vaccinations', 'Cumulative Vaccinations Over Time',
                      'Total Vaccinations', countries_of_interest, colors)

# Bar plot: Percent vaccinated (latest date)
latest_data = filtered_df.sort_values('date').groupby('location').tail(1)
plot_latest_bar(latest_data, 'percent_vaccinated', 'Percentage of Population Vaccinated (Latest Date)',
                '% Vaccinated', countries_of_interest, 'Blues_d')

# Line plot: Total Cases Over Time
plot_metric_over_time(filtered_df, 'total_cases', 'Total COVID-19 Cases Over Time',
                      'Total Cases', countries_of_interest, colors)

# Line plot: Total Deaths Over Time
plot_metric_over_time(filtered_df, 'total_deaths', 'Total Deaths Over Time',
                      'Total Deaths', countries_of_interest, colors)

# Bar plot: Daily new cases (latest date)
plot_latest_bar(latest_data, 'new_cases', 'Daily New Cases (Latest Date)',
                'New Cases', countries_of_interest, 'Reds_d')

# Line plot: Death Rate Over Time
plot_metric_over_time(filtered_df, 'death_rate', 'Death Rate Over Time',
                      'Death Rate', countries_of_interest, colors)

# Debug data
print("\nData preview:")
print(filtered_df[['location', 'date', 'total_vaccinations', 'population', 'total_cases', 'total_deaths', 'new_cases']].head())
print("\nMissing values:")
print(filtered_df[['total_vaccinations', 'population', 'total_cases', 'total_deaths', 'new_cases']].isna().sum())

## Key Insights
1. The USA has the highest number of total vaccinations, followed by India.
2. Kenya has a significantly lower vaccination rate compared to the USA and India.
3. The vaccination rollout in the USA was faster compared to other countries.
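4. Anomalies: Some countries have missing data for certain dates. Rows missing critical values are dropped, and gaps in daily cases, daily deaths, and total vaccinations are interpolated, so some plotted points are estimates rather than reported figures.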
5. The choropleth map highlights countries with high total case counts (shaded on a log scale), such as the USA and India; it shows absolute totals, not cases per capita.