# COVID-19 Global Data Tracker
This notebook analyzes COVID-19 global data including trends in cases, deaths, and vaccinations.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime

# Silence library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Plot appearance: start from matplotlib's 'ggplot' style, then apply the
# seaborn "notebook" scaling context and the "deep" colour palette.
plt.style.use('ggplot')
sns.set_context("notebook")
sns.set_palette("deep")

# Banner shown once at the top of the notebook output.
for banner_line in ("COVID-19 Global Data Analysis", "=" * 30):
    print(banner_line)


In [None]:
# Load the COVID-19 CSV export from the working directory. If the file is
# missing, print a download hint and re-raise the original error so the
# notebook stops here instead of failing later on an undefined `df`.
try:
    df = pd.read_csv("owid-covid-data.csv")
except FileNotFoundError:
    print("Error: The file 'owid-covid-data.csv' was not found.")
    print("Please download the dataset from: https://github.com/owid/covid-19-data/tree/master/public/data")
    raise
else:
    # Convert the 'date' column to datetimes so min/max and plotting work
    # on real dates rather than strings.
    df['date'] = pd.to_datetime(df['date'])

    first_day = df['date'].min()
    last_day = df['date'].max()
    print(f"Dataset shape: {df.shape}")
    print(f"Date range: {first_day} to {last_day}")


In [None]:
# Countries to compare. Names that do not occur in the dataset are reported
# and removed so later cells only iterate over countries that have rows.
default_countries = ['Kenya', 'United States', 'India', 'Germany', 'Brazil',
                     'South Africa', 'Australia', 'United Kingdom', 'China']

# Build the lookup set once instead of recomputing unique() per country.
available_locations = set(df['location'].unique())
existing_countries = [country for country in default_countries if country in available_locations]
missing_countries = set(default_countries) - set(existing_countries)

if missing_countries:
    print(f"Warning: The following countries are not in the dataset: {missing_countries}")
    print(f"Using available countries: {existing_countries}")
    default_countries = existing_countries

countries_df = df[df['location'].isin(default_countries)].copy()
# Keep only rows where both cumulative totals are present.
countries_df = countries_df.dropna(subset=['total_cases', 'total_deaths'])
# A missing daily figure is treated as "no new cases/deaths that day".
countries_df[['new_cases', 'new_deaths']] = countries_df[['new_cases', 'new_deaths']].fillna(0)

# Make the chronological order explicit: the forward fill below (and any
# row-based rolling window computed later) is only meaningful when each
# country's rows are in date order.
countries_df = countries_df.sort_values(['location', 'date'])

if 'total_vaccinations' in countries_df.columns:
    # Carry the last reported cumulative total forward within each country.
    # GroupBy.ffill() replaces fillna(method='ffill'), which is deprecated
    # in recent pandas releases.
    countries_df['total_vaccinations'] = countries_df.groupby('location')['total_vaccinations'].ffill()
else:
    print("Warning: 'total_vaccinations' column not found in the dataset.")

print(f"Processed data shape: {countries_df.shape}")


In [None]:
# Cumulative case counts over time, one line per selected country, drawn on
# a shared log-scaled y axis.
plt.figure(figsize=(14, 8))

for name in default_countries:
    rows = countries_df.loc[countries_df['location'] == name]
    if rows.empty:
        continue  # no rows for this country: nothing to draw
    plt.plot(rows['date'], rows['total_cases'], label=name, linewidth=2)

plt.yscale('log')
plt.title("Total COVID-19 Cases Over Time", fontsize=16)
plt.xlabel("Date", fontsize=12)
plt.ylabel("Total Cases (log scale)", fontsize=12)
plt.legend(fontsize=10)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Columns to audit for missing values. The optional ones are only included
# when the loaded dataset actually has them.
optional_cols = ['total_vaccinations', 'hosp_patients']
cols_to_check = ['location', 'total_cases', 'total_deaths', 'new_cases', 'new_deaths']
cols_to_check += [col for col in optional_cols if col in df.columns]

# Boolean mask of missing cells; reused for the counts, the percentages and
# the heatmap below.
null_mask = df[cols_to_check].isnull()

missing_data = null_mask.sum()
missing_percentage = (missing_data / len(df)) * 100

missing_info = pd.DataFrame({
    'Missing Values': missing_data,
    'Percentage (%)': missing_percentage.round(2),
})

print("Missing Data Analysis:")
print(missing_info)

# One heatmap row per dataset row, one column per audited field.
plt.figure(figsize=(10, 6))
sns.heatmap(null_mask, cmap='viridis', yticklabels=False, cbar=False)
plt.title('Missing Data Visualization')
plt.tight_layout()
plt.show()


In [None]:
# Bar chart of total_vaccinations relative to population, using each
# country's most recent values. The chart is skipped with a warning when a
# required column is absent or when no country has a usable value.
if 'total_vaccinations' in countries_df.columns:
    # GroupBy.last() returns the last non-null value of every column per
    # country, after ordering the rows by date.
    latest_data = countries_df.sort_values('date').groupby('location').last().reset_index()

    if 'population' in latest_data.columns:
        # NOTE(review): if total_vaccinations counts administered doses (as
        # the OWID codebook describes it), this value is "doses per 100
        # people" and can exceed 100 - confirm the axis label is intended.
        latest_data['vaccination_rate'] = (latest_data['total_vaccinations'] / latest_data['population']) * 100
        latest_data = latest_data.sort_values('vaccination_rate', ascending=False)

        # Countries without any vaccination figure would otherwise be drawn
        # as a bar of undefined height with a "nan%" label.
        plot_data = latest_data.dropna(subset=['vaccination_rate'])
        skipped = sorted(set(latest_data['location']) - set(plot_data['location']))
        if skipped:
            print(f"Warning: no vaccination data for: {skipped}")

        if plot_data.empty:
            print("Warning: no vaccination data available to plot.")
        else:
            plt.figure(figsize=(12, 8))
            bars = sns.barplot(x='location', y='vaccination_rate', data=plot_data)
            plt.title('COVID-19 Vaccination Rate by Country (%)')
            plt.xlabel('Country')
            plt.ylabel('Vaccination Rate (%)')
            plt.xticks(rotation=45)
            plt.grid(True, alpha=0.3)

            # Print the value just above each bar.
            for bar in bars.patches:
                height = bar.get_height()
                bars.text(bar.get_x() + bar.get_width() / 2., height + 1,
                          f"{height:.1f}%", ha='center', va='bottom')

            plt.tight_layout()
            plt.show()
    else:
        print("Warning: 'population' column not found in the dataset.")
else:
    print("Warning: 'total_vaccinations' column not found in the dataset.")


In [None]:
# Deaths as a percentage of confirmed cases, per row. Rows with zero
# confirmed cases get NaN instead of a division-by-zero result (inf), which
# would otherwise distort the sort order, the chart and the saved data.
nonzero_cases = countries_df['total_cases'].replace(0, np.nan)
countries_df['death_rate'] = (countries_df['total_deaths'] / nonzero_cases) * 100

# Most recent non-null value of every column per country, highest rate first.
latest_data = countries_df.sort_values('date').groupby('location').last().reset_index()
latest_data = latest_data.sort_values('death_rate', ascending=False)

plt.figure(figsize=(12, 6))
bars = sns.barplot(x='location', y='death_rate', data=latest_data)
plt.title("Latest COVID-19 Death Rates (%)")
plt.xlabel("Country")
plt.ylabel("Death Rate (%)")
plt.xticks(rotation=45)
plt.grid(True, alpha=0.3)

# Print the value just above each bar; a bar without a finite height has
# no meaningful position or label, so it is left unannotated.
for bar in bars.patches:
    height = bar.get_height()
    if not np.isfinite(height):
        continue
    bars.text(bar.get_x() + bar.get_width() / 2., height + 0.1,
              f"{height:.2f}%", ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [None]:
# 7-row rolling mean of new_cases for each selected country.
print("Calculating 7-day rolling averages...")
plt.figure(figsize=(14, 8))

for country in default_countries:
    # Sort by date first: the rolling window runs over consecutive rows, so
    # it only represents consecutive days when the rows are chronological.
    # sort_values returns a new frame, so countries_df is not modified.
    country_data = countries_df[countries_df['location'] == country].sort_values('date')
    if not country_data.empty:
        # The first six rows have no full window and therefore stay NaN.
        # NOTE(review): the window counts rows, not calendar days; a gap in
        # the dates would stretch it over more than 7 days - confirm the
        # data has one row per day.
        country_data['rolling_new_cases'] = country_data['new_cases'].rolling(window=7).mean()
        plt.plot(country_data['date'], country_data['rolling_new_cases'], label=country, linewidth=2)

plt.title("7-Day Rolling Average of New COVID-19 Cases")
plt.xlabel("Date")
plt.ylabel("7-Day Avg New Cases")
plt.grid(True, alpha=0.3)
plt.legend(fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Text summary: for each metric, list the selected countries from highest
# to lowest using each country's most recent values. Expects the
# 'death_rate' column to already exist on countries_df.
latest_date = countries_df['date'].max().strftime('%Y-%m-%d')
print(f"Summary statistics as of {latest_date}:")

latest_data = countries_df.sort_values('date').groupby('location').last().reset_index()

# (section heading, column to rank by, number format, unit suffix)
report_sections = (
    ("\nTotal Cases by Country:", 'total_cases', ",.0f", ""),
    ("\nTotal Deaths by Country:", 'total_deaths', ",.0f", ""),
    ("\nDeath Rate by Country:", 'death_rate', ".2f", "%"),
)

for heading, column, number_format, suffix in report_sections:
    print(heading)
    ranked = latest_data.sort_values(column, ascending=False)
    for place, value in zip(ranked['location'], ranked[column]):
        print(f"{place}: {value:{number_format}}{suffix}")


In [None]:
# Write the cleaned per-country data to the working directory. This is
# best-effort: a failure is reported on stdout and the notebook continues.
try:
    countries_df.to_csv("cleaned_covid_data.csv", index=False)
except Exception as save_error:
    print(f"Error saving cleaned data: {save_error!s}")
else:
    print("Cleaned dataset saved as 'cleaned_covid_data.csv'")
