In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Optional (for maps later)
#import plotly.express as px

# Settings for visuals
# Apply seaborn's "whitegrid" style to the figures drawn in the cells below.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline


In [None]:
# Read the raw CSV (path is relative to the notebook's working directory),
# report its dimensions, and preview the first rows.
DATA_PATH = "owid-covid-data.csv"
df = pd.read_csv(DATA_PATH)
print(f"Dataset shape: {df.shape}")
df.head()


In [None]:
# Quick structural check: which columns exist and how many values each one lacks.
column_index = df.columns
missing_per_column = df.isnull().sum()

print("Columns in dataset:")
print(column_index)

print("\nMissing values per column:")
print(missing_per_column)


In [None]:
# Select countries
countries = ["Kenya", "United States", "India"]

# Metrics that get gap-filled below for smooth plotting
fill_cols = ['total_cases', 'total_deaths', 'new_cases', 'new_deaths', 'total_vaccinations']

# Filter rows by country; .copy() so the assignments below never write into `df`
df_countries = df[df['location'].isin(countries)].copy()

# Convert 'date' column to datetime
df_countries['date'] = pd.to_datetime(df_countries['date'])

# Order rows chronologically inside each country so "forward" fill means
# "carry the most recent earlier observation"
df_countries = df_countries.sort_values(['location', 'date'])

# Forward-fill missing values *per country*. Filling the whole frame at once
# would carry the last value of one country into the leading gaps of the next
# country in the frame. The groupby result keeps the original index, so the
# assignment aligns row by row. (.ffill() also replaces the deprecated
# fillna(method='ffill') spelling.)
df_countries[fill_cols] = df_countries.groupby('location')[fill_cols].ffill()

df_countries.head()


In [None]:
# Cumulative confirmed cases over time, one line per selected country.
fig, ax = plt.subplots(figsize=(12, 6))
for name in countries:
    rows = df_countries.loc[df_countries['location'] == name]
    ax.plot(rows['date'], rows['total_cases'], label=name)

ax.set_title('Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.legend()
plt.show()


In [None]:
# Cumulative deaths over time, one line per selected country.
fig, ax = plt.subplots(figsize=(12, 6))
for name in countries:
    in_country = df_countries['location'] == name
    ax.plot(
        df_countries.loc[in_country, 'date'],
        df_countries.loc[in_country, 'total_deaths'],
        label=name,
    )

ax.set(
    title='Total COVID-19 Deaths Over Time',
    xlabel='Date',
    ylabel='Total Deaths',
)
ax.legend()
plt.show()


In [None]:
# Death rate = cumulative deaths / cumulative cases, computed row by row.
# Rows with zero reported cases are masked to NaN first, so the ratio is left
# undefined there instead of becoming inf.
cases_nonzero = df_countries['total_cases'].where(df_countries['total_cases'] > 0)
df_countries['death_rate'] = df_countries['total_deaths'] / cases_nonzero

plt.figure(figsize=(12,6))
for country in countries:
    subset = df_countries[df_countries['location'] == country]
    plt.plot(subset['date'], subset['death_rate'], label=country)

plt.title('COVID-19 Death Rate Over Time')
plt.xlabel('Date')
plt.ylabel('Death Rate')
plt.legend()
plt.show()


# Insights from the COVID-19 Data

- The United States has had the highest number of total cases and vaccinations among the selected countries.
- India experienced rapid increases in cases and deaths during peaks in the pandemic.
- Kenya shows slower vaccination progress compared to the United States and India.
- The death rate (cumulative deaths divided by cumulative cases) has generally decreased over time, possibly reflecting better treatments and vaccine effects.



In [None]:
# Get the latest date in the dataset
latest_date = df['date'].max()

# Filter data for the latest date (kept unfiltered: later cells reuse it)
latest_data = df[df['date'] == latest_date]

# Exclude non-country aggregate rows so the ranking contains countries only.
# NOTE(review): assumes aggregates (World, continents, income groups, ...) are
# the rows whose iso_code starts with "OWID_" -- confirm against the dataset
# codebook. Rows with a missing iso_code are treated as countries (na=False).
is_aggregate = latest_data['iso_code'].str.startswith('OWID_', na=False)

# Select top 10 countries by total cases, ignoring rows with no reported total
top_countries = (
    latest_data.loc[~is_aggregate, ['location', 'total_cases']]
    .dropna(subset=['total_cases'])
    .nlargest(10, 'total_cases')
)

plt.figure(figsize=(12,6))
sns.barplot(x='total_cases', y='location', data=top_countries, palette='Reds_r')
plt.title('Top 10 Countries by Total COVID-19 Cases (Latest Date)')
plt.xlabel('Total Cases')
plt.ylabel('Country')
plt.show()


In [None]:
# Metrics whose pairwise correlation is shown in the heatmap
numeric_cols = [
    'total_cases',
    'total_deaths',
    'new_cases',
    'new_deaths',
    'total_vaccinations',
]

# Pairwise correlation (pandas default method) over the selected-country rows
corr = df_countries.loc[:, numeric_cols].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(data=corr, cmap='coolwarm', annot=True, fmt='.2f')
plt.title('Correlation Heatmap of COVID-19 Metrics')
plt.show()


In [None]:
import plotly.express as px

# Prepare data for map - latest total cases per country
map_data = latest_data[['iso_code', 'location', 'total_cases']].dropna()

# px.choropleth matches `locations` against ISO-3 country codes by default.
# "OWID_"-prefixed codes cannot be matched to a country, yet their very large
# totals would still take part in the continuous colour range and wash out
# the real countries -- drop them before plotting.
# NOTE(review): assumes the "OWID_" prefix marks non-ISO rows -- confirm
# against the dataset codebook.
map_data = map_data[~map_data['iso_code'].str.startswith('OWID_')]

fig = px.choropleth(map_data,
                    locations='iso_code',
                    color='total_cases',
                    hover_name='location',
                    color_continuous_scale='Reds',
                    title='Global COVID-19 Total Cases (Latest Date)')

fig.show()


# Summary and Conclusions

- COVID-19 cases and deaths peaked at different times across countries.
- Vaccination rollout has varied widely, impacting case and death rates.
- Correlation analysis shows strong relationships between cases, deaths, and new cases.
- The choropleth map highlights global hotspots in terms of total cases.
- Further analysis could include hospitalization rates, ICU usage, or socio-economic factors.


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Country picker listing every distinct location, pre-selected to Kenya
location_options = df['location'].unique()
country_selector = widgets.Dropdown(
    description='Country:',
    options=location_options,
    value='Kenya',
    disabled=False,
)

# Range slider over every distinct date (sorted), initially spanning the
# first through the last available date
date_options = sorted(df['date'].unique())
date_slider = widgets.SelectionRangeSlider(
    description='Date Range',
    options=date_options,
    index=(0, len(date_options) - 1),
    orientation='horizontal',
    layout={'width': '600px'}
)

def plot_interactive(country, date_range):
    """Plot total cases, deaths and vaccinations for one country and date window.

    Parameters
    ----------
    country : str
        Value compared for equality against ``df['location']``.
    date_range : tuple
        ``(start_date, end_date)`` pair; both bounds are inclusive and are
        compared directly against ``df['date']``.

    Reads the notebook-level ``df`` and draws a matplotlib figure; returns None.
    """
    start_date, end_date = date_range
    mask = (df['location'] == country) & (df['date'] >= start_date) & (df['date'] <= end_date)
    data = df.loc[mask]

    # `df['date']` is never converted to datetime (only the df_countries copy
    # is), so parse it here. Plotting raw date strings would make matplotlib
    # treat every distinct date as its own categorical tick.
    dates = pd.to_datetime(data['date'])

    plt.figure(figsize=(12,5))
    plt.plot(dates, data['total_cases'], label='Total Cases')
    plt.plot(dates, data['total_deaths'], label='Total Deaths')
    plt.plot(dates, data['total_vaccinations'], label='Total Vaccinations')
    plt.title(f'COVID-19 Data for {country}')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.legend()
    plt.show()

# Wire both widgets to plot_interactive; interact_manual adds a button so the
# figure is only redrawn when the user clicks it, not on every widget change.
widgets.interact_manual(plot_interactive, country=country_selector, date_range=date_slider)
