In [13]:
import pandas as pd

# Download the complete COVID-19 CSV from the ourworldindata.org host and
# parse it into a single DataFrame. This is a network read of the whole file,
# so the cell can take a while to finish.
url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
df = pd.read_csv(filepath_or_buffer=url)


KeyboardInterrupt: 

In [None]:
# Quick orientation: which columns exist and what the first rows look like.
print("Columns:", list(df.columns))
print("\nFirst 5 rows:")
display(df.head(5))

# Count nulls in the handful of columns the rest of the notebook relies on.
key_columns = ['date', 'location', 'total_cases', 'total_deaths', 'total_vaccinations']
missing_per_column = df[key_columns].isna().sum()
print("\nMissing values:")
print(missing_per_column)

In [None]:
# Filter countries
countries = ['Kenya', 'United States', 'India']
df_filtered = df[df['location'].isin(countries)].copy()

# Convert date to datetime
df_filtered['date'] = pd.to_datetime(df_filtered['date'])

# Handle missing values.
# These three columns are running totals, so a gap in reporting should carry
# the previous day's value forward instead of dropping to 0 (a 0 would show up
# as a cliff in the line plots and as a zero denominator in any ratio).
# The forward-fill is done per country on date-sorted rows so one country's
# totals never leak into the next; only rows *before* a country's first
# reported value are left without a predecessor and are set to 0.
cols_to_fill = ['total_cases', 'total_deaths', 'total_vaccinations']
df_filtered = df_filtered.sort_values(['location', 'date'])
df_filtered[cols_to_fill] = (
    df_filtered.groupby('location')[cols_to_fill].ffill().fillna(0)
)

# Drop rows with missing critical data.
# .copy() makes df_clean an independent frame so later cells can add columns
# to it without pandas warning about writing to a slice of df_filtered.
df_clean = df_filtered.dropna(subset=['date', 'location']).copy()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot total cases over time
plt.figure(figsize=(12, 6))
sns.lineplot(data=df_clean, x='date', y='total_cases', hue='location')
plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Cases')
plt.show()

# Calculate death rate (deaths as a percentage of confirmed cases).
# Rows with zero cases have no meaningful rate, so the denominator is masked
# to NaN there instead of letting the division produce inf / NaN.
# .assign returns a new frame, so nothing is written into a possible slice of
# df_filtered (no SettingWithCopyWarning) and re-running the cell is harmless.
cases_nonzero = df_clean['total_cases'].where(df_clean['total_cases'] > 0)
df_clean = df_clean.assign(
    death_rate=(df_clean['total_deaths'] / cases_nonzero) * 100
)

# Compare death rates.
# death_rate is a cumulative ratio, so each country's current figure is its
# most recent defined value. Passing every daily row to barplot would instead
# chart the average of that ratio over the whole time span.
latest_death_rate = (
    df_clean.dropna(subset=['death_rate'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
)
plt.figure(figsize=(10, 5))
sns.barplot(data=latest_death_rate, x='location', y='death_rate')
plt.title('Death Rate by Country (%)')
plt.show()

In [None]:
# Cumulative vaccinations per country, one line per location.
vacc_fig, vacc_ax = plt.subplots(figsize=(12, 6))
sns.lineplot(
    data=df_clean,
    x='date',
    y='total_vaccinations',
    hue='location',
    ax=vacc_ax,
)
vacc_ax.set_title('Cumulative Vaccinations Over Time')
vacc_ax.set_xlabel('Date')
vacc_ax.set_ylabel('Vaccinations')
plt.show()

In [None]:
# %% [markdown]
# # 🌍 Global COVID-19 Data Analysis
# **Author**: Your Name  
# **Last Updated**: [Date]

# %% [code]
# ======================
# 1. Data Collection & Setup
# ======================
# All libraries this script uses are imported here so it runs on a fresh
# kernel; the visualization sections below call plt.figure / plt.show.
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Load data with error handling
try:
    df = pd.read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")
    print("Data loaded successfully!")
except Exception as e:
    # Deliberately broad: any failure of the remote read (network, HTTP,
    # parsing) falls back to the local copy. If that file is missing too,
    # the second read_csv raises and the error is not hidden.
    print(f"Error: {e}\nUsing local backup...")
    df = pd.read_csv("./data/owid-covid-data.csv")

# %% [code]
# ======================
# 2. Data Cleaning
# ======================
# Filter countries & handle missing values.
# .copy() detaches the filtered rows from the full frame so the column
# assignments below do not write into a slice (SettingWithCopyWarning).
countries = ['Kenya', 'United States', 'India', 'Brazil', 'Germany']
df = df[df['location'].isin(countries)].copy()
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['location', 'date'])

# Fill missing values.
# The three columns are running totals, so a missing day carries the last
# reported value forward within the same country. Zero-filling gaps would make
# a country's "latest" figure read 0 whenever its final rows are unreported.
# Only rows before a country's first report are set to 0.
cols = ['total_cases', 'total_deaths', 'people_vaccinated']
df[cols] = df.groupby('location')[cols].ffill().fillna(0)

# Calculate death rate (deaths as a percentage of confirmed cases).
# The zero-case rows are masked in the denominator only, giving NaN for the
# rate; total_cases itself is left untouched so tables and plots do not show
# a phantom single case.
cases_nonzero = df['total_cases'].where(df['total_cases'] > 0)
df['death_rate'] = (df['total_deaths'] / cases_nonzero) * 100

# %% [code]
# ======================
# 3. Core Analysis
# ======================
# Aggregate latest stats.
# Rows are ordered by date first so "last" really means most recent and does
# not depend on the order the rows happened to arrive in. GroupBy.last() takes
# the last non-null entry of each column within each location.
latest = df.sort_values('date').groupby('location').last()
display(latest[['total_cases', 'death_rate', 'people_vaccinated']])

# %% [code]
# ======================
# 4. Visualizations
# ======================
# Cumulative cases over time, one line per country.
cases_fig, cases_ax = plt.subplots(figsize=(14, 6))
sns.lineplot(data=df, x='date', y='total_cases', hue='location', ax=cases_ax)
cases_ax.set_title("Total COVID-19 Cases (2020-2023)")
plt.show()

# Latest vaccinated-people count per country as bars.
latest_by_country = latest.reset_index()
vacc_fig, vacc_ax = plt.subplots(figsize=(14, 6))
sns.barplot(data=latest_by_country, x='location', y='people_vaccinated', ax=vacc_ax)
vacc_ax.set_title("Total Vaccinated Population")
plt.show()

# Input table for a choropleth map: the last row per location, keeping only
# rows that have an ISO code.
# NOTE(review): nothing in this section draws the map, and df was already
# narrowed to `countries` in the cleaning section, so despite its name this
# table is not global - confirm whether the map step is still planned.
last_row_per_location = df.drop_duplicates(subset='location', keep='last')
latest_global = last_row_per_location.dropna(subset=['iso_code'])

# %% [code]
# ======================
# 5. Stretch Goals (Revised)
# ======================
# User input handling with validation.
# Only reading and validating the input sits inside the try block, so a
# failure while filtering or plotting surfaces with its real traceback
# instead of being reported as an "Input Error".
try:
    countries_input = input("Enter countries (comma-separated): ").strip().split(',')
    countries_input = [c.strip() for c in countries_input]

    # Build the lookup once rather than recomputing unique() per entered name.
    known_locations = set(df['location'].unique())
    valid_countries = [c for c in countries_input if c in known_locations]

    if not valid_countries:
        valid_countries = ['Kenya', 'United States']
        print("Defaulting to Kenya and United States.")

    start_date = pd.to_datetime(input("Start date (YYYY-MM-DD): ")).date()
    end_date = pd.to_datetime(input("End date (YYYY-MM-DD): ")).date()

    # A reversed range can never match any row; say so explicitly rather than
    # falling through to the generic "No data found" message.
    if start_date > end_date:
        raise ValueError(
            f"start date {start_date} is after end date {end_date}"
        )

except Exception as e:
    # Broad on purpose: covers unparseable dates as well as input() being
    # unavailable in a non-interactive run.
    print(f"Input Error: {e}")

else:
    # Compare at day granularity with vectorised Timestamp comparisons;
    # both bounds are inclusive.
    day = df['date'].dt.normalize()
    filtered = df[
        (df['location'].isin(valid_countries)) &
        (day >= pd.Timestamp(start_date)) &
        (day <= pd.Timestamp(end_date))
    ]

    if not filtered.empty:
        plt.figure(figsize=(14, 6))
        sns.lineplot(data=filtered, x='date', y='total_cases', hue='location')
        plt.title(f"Cases from {start_date} to {end_date}")
        plt.show()
    else:
        print("No data found for the selected range.")

# %% [code]
# ======================
# 6. Hospitalization Data
# ======================
# The column existing is not enough: if every value in it is missing for the
# selected countries, plotting would produce an empty chart. Treat that case
# the same as the column being absent.
has_hosp_data = (
    'hosp_patients' in df.columns
    and bool(df['hosp_patients'].notna().any())
)

if has_hosp_data:
    plt.figure(figsize=(14, 6))
    sns.lineplot(data=df, x='date', y='hosp_patients', hue='location')
    plt.title("Hospitalized Patients")
    plt.show()
else:
    print("Hospitalization data unavailable in this dataset.")