In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load data from the CSV file in the working directory.
df = pd.read_csv('owid-covid-data.csv')

# A bare expression is only rendered when it is the last statement of a
# notebook cell (and never in a plain script), so each inspection result is
# printed explicitly.

# Check columns
print(df.columns)

# Preview the first rows
print(df.head())

# Count missing values per column
print(df.isnull().sum())

# Filter countries of interest
# NOTE(review): 'Antigua' is only kept if it matches a `location` value
# exactly — confirm the dataset's spelling for that country.
countries = ['Zimbabwe', 'Zambia', 'Angola', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Antigua', 'Austria', 'Botswana', 'Brazil']
# .copy() yields an independent frame, so the column assignments below do not
# write into a slice of the full dataset (avoids SettingWithCopyWarning).
df = df[df['location'].isin(countries)].copy()

# Rows where every core case/death field is present. This is kept as a
# separate frame; the plots further down read the interpolated `df`.
df_cleaned = df.dropna(subset=['date', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths'])

# Convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

# Put each country's rows in chronological order so interpolation and the
# line plots follow time.
df = df.sort_values(['location', 'date'])

# Fill gaps in the numeric columns by linear interpolation, one country at a
# time. Interpolating the whole frame in one pass would carry values from the
# last rows of one country into the first rows of the next, and would also be
# applied to the text columns, which pandas deprecates.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
df[numeric_cols] = df.groupby('location')[numeric_cols].transform(
    lambda block: block.interpolate(method='linear', limit_direction='forward')
)

# Line chart of cumulative cases: one line per country, in the order the
# countries first appear in df.
plt.figure(figsize=(12, 6))
for name, rows in df.groupby('location', sort=False):
    plt.plot(rows['date'], rows['total_cases'], label=name)

plt.xlabel("Date")
plt.ylabel("Total Cases")
plt.title("Total COVID-19 Cases Over Time")
plt.legend()
plt.show()

# Line chart of cumulative deaths: one line per country, in the order the
# countries first appear in df.
plt.figure(figsize=(12, 6))
for name, rows in df.groupby('location', sort=False):
    plt.plot(rows['date'], rows['total_deaths'], label=name)

plt.xlabel("Date")
plt.ylabel("Total Deaths")
plt.title("Total COVID-19 Deaths Over Time")
plt.legend()
plt.show()

# Line chart of daily new cases, so the countries can be compared on one
# set of axes. One line per country, in order of first appearance in df.
plt.figure(figsize=(12, 6))
for name, rows in df.groupby('location', sort=False):
    plt.plot(rows['date'], rows['new_cases'], label=name)

plt.xlabel("Date")
plt.ylabel("New Cases")
plt.title("Daily New COVID-19 Cases")
plt.legend()
plt.show()

# Calculate the death rate: total_deaths / total_cases
# Rows with zero total cases would produce inf (or NaN for 0/0) and distort
# the y-axis, so the zero denominator is masked to NaN first; those rows then
# get a NaN death rate and are simply left out of the line.
nonzero_cases = df['total_cases'].where(df['total_cases'] != 0)
df['death_rate'] = df['total_deaths'] / nonzero_cases

# Plot death rate over time, one line per country
plt.figure(figsize=(12, 6))
for country in df['location'].unique():
    country_data = df[df['location'] == country]
    plt.plot(country_data['date'], country_data['death_rate'], label=country)

plt.title("COVID-19 Death Rate Over Time")
plt.xlabel("Date")
plt.ylabel("Death Rate")
plt.legend()
plt.show()

# Bar chart of countries ranked by total cases (latest data).
# Feeding every date row to barplot makes seaborn draw the per-country mean
# with an error bar, not the latest figure. Reduce to one row per country
# first: the most recent row that has a total_cases value.
latest_totals = (
    df.dropna(subset=['total_cases'])
    .sort_values('date')
    .groupby('location')
    .tail(1)
    .sort_values('total_cases', ascending=False)
)

plt.figure(figsize=(10, 6))
sns.barplot(x='location', y='total_cases', data=latest_totals)
plt.title("Total COVID-19 Cases by Country (Latest Data)")
plt.xlabel("Country")
plt.ylabel("Total Cases")
# Tilt the country names so neighbouring labels do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# Correlation heatmap.
# Restrict to the numeric columns, then compute their pairwise correlations.
numeric_only = df.select_dtypes(include='number')
corr_matrix = numeric_only.corr()

# Draw the matrix with each cell annotated to two decimals.
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.tight_layout()
plt.show()

# Line chart of cumulative vaccinations: one line per country, in the order
# the countries first appear in df.
plt.figure(figsize=(8, 6))
for name, rows in df.groupby('location', sort=False):
    plt.plot(rows['date'], rows['total_vaccinations'], label=name)

plt.xlabel("Date")
plt.ylabel("Total Vaccinations")
plt.title("Cumulative COVID-19 Vaccinations Over Time")
plt.legend()
plt.show()

# Compare % vaccinated population.
# Select the countries and columns needed for this chart into a separate
# frame, so the working `df` keeps all of its columns for any later use.
countries = ['Zimbabwe', 'Zambia', 'Angola', 'Afghanistan', 'Albania', 'Algeria', 'Andorra', 'Antigua', 'Austria', 'Botswana', 'Brazil']
vax_df = df.loc[
    df['location'].isin(countries),
    ['location', 'date', 'people_fully_vaccinated_per_hundred'],
]

# Plot line chart
plt.figure(figsize=(12, 6))
for country in countries:
    country_data = vax_df[vax_df['location'] == country]
    # A country with no rows, or with no vaccination figures at all, would
    # only add an empty legend entry, so it is skipped.
    if not country_data['people_fully_vaccinated_per_hundred'].notna().any():
        continue
    plt.plot(country_data['date'], country_data['people_fully_vaccinated_per_hundred'], label=country)

plt.title("Fully Vaccinated Population Over Time (% of Population)")
plt.xlabel("Date")
plt.ylabel("% Fully Vaccinated")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Pie chart for vaccinated vs. unvaccinated
# Focus on Brazil
brazil = df[df['location'] == 'Brazil'].sort_values('date')

# Use the most recent row that has a people_fully_vaccinated_per_hundred value.
has_share = brazil['people_fully_vaccinated_per_hundred'].notna()
if has_share.any():
    latest = brazil[has_share].iloc[-1]
    vaccinated = float(latest['people_fully_vaccinated_per_hundred'])
    # Keep the share within 0-100: a value above 100 would make the
    # remainder negative, and plt.pie rejects negative wedge sizes.
    vaccinated = min(max(vaccinated, 0.0), 100.0)
else:
    # No usable data: fall back to a hard-coded placeholder and say so,
    # so the chart is not mistaken for a figure taken from the dataset.
    print("No vaccination data found for Brazil; plotting a placeholder value.")
    vaccinated = 35.56

# The two wedges are shares of the same 100%, so derive the remainder in
# both branches instead of hard-coding it.
unvaccinated = 100 - vaccinated

# Plot pie chart
plt.figure(figsize=(6, 6))
plt.pie(
    [vaccinated, unvaccinated],
    labels=['Vaccinated', 'Unvaccinated'],
    colors=['lightblue', 'gray'],
    autopct='%1.1f%%',
    startangle=140
)
plt.title("Brazil: Vaccinated vs. Unvaccinated Population")
plt.tight_layout()
plt.show()

In [None]:
# Key Insights from the COVID-19 Data
1. Portugal Achieved One of the Highest Vaccination Rates
Portugal consistently ranked among the top countries for vaccination coverage, with over 90% of its population fully vaccinated, thanks to early procurement and strong public trust in vaccines.

2. India and Brazil Led in Total Vaccinations Administered
Despite moderate per capita rates, India and Brazil had some of the highest total vaccination numbers due to their large populations.

3. Low Vaccination Rates in Several African Countries
Countries in Sub-Saharan Africa, such as Chad and Burundi, showed vaccination coverage below 10%, reflecting global vaccine access disparities and distribution challenges.

4. Cases Surged Even in Highly Vaccinated Countries
Several European countries, including Germany and France, experienced major COVID-19 waves even after achieving high vaccination rates—often due to new variants and relaxed public health measures.

5. Choropleth Map Shows Clear Regional Trends
The choropleth map visualization highlighted a strong contrast: high vaccination in Europe and North America, and low coverage in parts of Africa and Asia, revealing stark inequalities in vaccine access and delivery.



# Anomalies & Interesting Patterns
1. High Case Rates in Highly Vaccinated Countries
Countries like the United Kingdom and Israel, despite early and widespread vaccination rollouts, still experienced significant spikes in new COVID-19 cases. This anomaly points to the role of new variants (e.g., Delta, Omicron), waning immunity over time, and relaxed restrictions after vaccination.

2. Low Reported Cases in Countries with Limited Testing
Several countries, especially in parts of Africa and Southeast Asia, reported surprisingly low case numbers—often due to limited testing infrastructure and underreporting, rather than actual containment.

3. Vaccination Gaps Despite Availability
Some countries with vaccine availability (e.g., Russia and parts of Eastern Europe) showed lower vaccination uptake, likely influenced by vaccine hesitancy, misinformation, or distrust in government messaging.

4. Sudden Spikes in Data
In some datasets, certain countries showed abrupt jumps in case or vaccination numbers, possibly due to data backlog clearing, policy changes (mass reporting), or incorrect data entries (which can be cleaned or flagged in preprocessing).