# ============================================
# COVID-19 Exploratory Data Analysis (OWID Dataset)
# ============================================

In [None]:
# 1. Libraries & Data Loading
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [None]:
# Visualization settings
# Matplotlib 3.6 renamed the bundled "seaborn" style sheet to "seaborn-v0_8"
# and later releases reject the old name, so use whichever one this install
# actually provides (and keep Matplotlib's default style if neither exists).
for _style_name in ("seaborn-v0_8", "seaborn"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# Default figure size (width, height) in inches for every Matplotlib figure.
plt.rcParams["figure.figsize"] = (12, 6)


In [None]:
# Read the OWID COVID-19 CSV into `df`.
# The path is relative to the working directory; point it at your own copy.
df = pd.read_csv(filepath_or_buffer="owid-covid-data.csv")

In [None]:
# Quick look at the raw frame: dimensions, column names, first rows.
print("Shape:", df.shape)
print("Columns:", list(df.columns))
display(df.head(5))

In [None]:
# Count missing entries per column and show the 20 columns with the most gaps.
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False).head(20)

# ============================================
# 2. Data Cleaning
# ============================================

In [None]:
# Locations compared throughout the per-country analysis.
# Values are matched against the dataset's "location" column.
countries = [
    "Kenya",
    "United States",
    "India",
]

In [None]:
# Keep only the rows for the countries of interest.
# The explicit copy lets later cells add/overwrite columns without touching `df`.
in_selected_countries = df["location"].isin(countries)
df_countries = df.loc[in_selected_countries].copy()

In [None]:
# Parse the "date" column of the filtered frame into pandas datetimes.
# Only `df_countries` is converted; the "date" column of `df` is left as loaded.
parsed_dates = pd.to_datetime(df_countries["date"])
df_countries["date"] = parsed_dates

In [None]:
# Discard rows that lack either a date or a cumulative case count.
required_columns = ["date", "total_cases"]
has_required = df_countries[required_columns].notna().all(axis=1)
df_countries = df_countries[has_required]


In [None]:
# Fill missing values forward by country.
# Sort chronologically within each country first, so "forward" means forward
# in time regardless of the row order in the CSV.
df_countries = df_countries.sort_values(["location", "date"])

# GroupBy.ffill replaces `groupby(...).apply(lambda g: g.fillna(method="ffill"))`:
# `fillna(method=...)` is deprecated, and on recent pandas `apply` no longer
# passes the grouping column to the lambda, so "location" would be lost once
# the index is reset. GroupBy.ffill returns only the non-key columns, aligned
# on the original index, so they are assigned back and "location" is untouched.
value_columns = [col for col in df_countries.columns if col != "location"]
df_countries[value_columns] = df_countries.groupby("location")[value_columns].ffill()
df_countries.reset_index(drop=True, inplace=True)

df_countries.head()

# ============================================
# 3. Exploratory Data Analysis (EDA)
# ============================================


In [None]:
# Cumulative cases: one line per country, drawn in the order of `countries`.
for name in countries:
    subset = df_countries.loc[df_countries["location"].eq(name)]
    plt.plot(subset["date"], subset["total_cases"], label=name)

plt.title("Total COVID-19 Cases Over Time")
plt.legend()
plt.show()

In [None]:
# Cumulative deaths: one line per country, drawn in the order of `countries`.
for name in countries:
    subset = df_countries.loc[df_countries["location"].eq(name)]
    plt.plot(subset["date"], subset["total_deaths"], label=name)

plt.title("Total COVID-19 Deaths Over Time")
plt.legend()
plt.show()

In [None]:
# Daily new cases, one coloured line per location.
sns.lineplot(
    data=df_countries,
    x="date",
    y="new_cases",
    hue="location",
)
plt.title("Daily New Cases (Kenya, USA, India)")
plt.show()

In [None]:
# Death rate: cumulative deaths as a fraction of cumulative cases, per row.
cumulative_deaths = df_countries["total_deaths"]
cumulative_cases = df_countries["total_cases"]
df_countries["death_rate"] = cumulative_deaths.div(cumulative_cases)

sns.lineplot(
    data=df_countries,
    x="date",
    y="death_rate",
    hue="location",
)
plt.title("COVID-19 Death Rate Over Time")
plt.show()

# ============================================
# 4. Vaccination Progress
# ============================================

In [None]:
# Cumulative vaccination doses over time, one coloured line per location.
sns.lineplot(
    data=df_countries,
    x="date",
    y="total_vaccinations",
    hue="location",
)
plt.title("Total Vaccinations Over Time")
plt.show()

In [None]:
# Share of the population that is fully vaccinated, expressed in percent.
fully_vaccinated = df_countries["people_fully_vaccinated"]
df_countries["vaccinated_pct"] = fully_vaccinated.div(df_countries["population"]).mul(100)

sns.lineplot(
    data=df_countries,
    x="date",
    y="vaccinated_pct",
    hue="location",
)
plt.title("% Population Fully Vaccinated Over Time")
plt.show()

# ============================================
# 5. Global Overview
# ============================================

In [None]:
# Global snapshot: every row of the full dataset that carries the maximum
# value of the "date" column.
# NOTE(review): `df["date"]` was not converted to datetime above (only
# `df_countries` was), so this is the max of the values as loaded —
# presumably ISO "YYYY-MM-DD" strings, for which that is the latest date; confirm.
latest_date = df["date"].max()
latest = df.loc[df["date"].eq(latest_date)]

In [None]:
# Top 10 by total cases
# The OWID file mixes aggregate rows (World, continents, income groups, ...)
# in with real countries and marks them with an "OWID_"-prefixed iso_code.
# Left in, they crowd out the actual countries in a "Top 10 Countries" chart,
# so drop them before ranking. Rows with a missing iso_code are kept (na=False).
# NOTE(review): this also excludes any country OWID codes with that prefix
# (e.g. "OWID_KOS"); irrelevant for a top-10 by cases, but confirm if reused.
is_aggregate = latest["iso_code"].str.startswith("OWID_", na=False)
top_cases = latest[~is_aggregate].nlargest(10, "total_cases")[["location", "total_cases"]]
sns.barplot(x="total_cases", y="location", data=top_cases)
plt.title("Top 10 Countries by Total Cases")
plt.show()

In [None]:
# Pairwise correlation of the headline metrics for the selected countries.
# Rows with a gap in any of these columns are excluded before correlating.
metric_columns = [
    "total_cases",
    "total_deaths",
    "new_cases",
    "new_deaths",
    "total_vaccinations",
]
numeric = df_countries[metric_columns].dropna()
correlations = numeric.corr()
sns.heatmap(correlations, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap (Selected Countries)")
plt.show()

# ============================================
# 6. Choropleth Maps
# ============================================

In [None]:
# Prepare mapping dataset
map_columns = ["iso_code", "location", "total_cases", "population", "people_fully_vaccinated"]
# Require only the map key (iso_code) and the denominator (population).
# A blanket dropna() would remove a country from the cases map merely because
# its vaccination figure is missing on the snapshot date (and vice versa); with
# this subset the missing metric simply stays NaN in the derived column.
# The explicit copy makes `map_data` an independent frame before new columns
# are added to it.
map_data = latest[map_columns].dropna(subset=["iso_code", "population"]).copy()
# Cases per one million inhabitants.
map_data["cases_per_million"] = (map_data["total_cases"] / map_data["population"]) * 1e6
# Fully vaccinated people as a percentage of the population.
map_data["vaccination_rate"] = (map_data["people_fully_vaccinated"] / map_data["population"]) * 100

In [None]:
# World map shaded by cases per million, keyed on each row's iso_code.
cases_map_title = f"COVID-19 Cases per Million (as of {latest_date})"
fig_cases = px.choropleth(
    data_frame=map_data,
    locations="iso_code",
    color="cases_per_million",
    color_continuous_scale="Reds",
    hover_name="location",
    hover_data=["total_cases", "population"],
    title=cases_map_title,
)
fig_cases.show()

In [None]:
# World map shaded by the share of the population that is fully vaccinated.
vax_map_title = f"COVID-19 Vaccination Rate (% Fully Vaccinated) (as of {latest_date})"
fig_vax = px.choropleth(
    data_frame=map_data,
    locations="iso_code",
    color="vaccination_rate",
    color_continuous_scale="Greens",
    hover_name="location",
    hover_data=["people_fully_vaccinated", "population"],
    title=vax_map_title,
)
fig_vax.show()

# ============================================
# 7. Insights & Reporting
# ============================================

## Key Insights (example placeholders — verify each against the charts above before reporting):
1. **India** recorded the highest spike in daily new cases of the three countries compared (Kenya, United States, India).  
2. **USA** recorded the highest cumulative deaths.  
3. **Kenya** showed a delayed vaccination rollout compared to the USA and India.  
4. Death rates dropped over time, stabilizing after major vaccination campaigns.  
5. Global choropleth maps highlight uneven vaccine distribution — high in developed nations, low in some African regions.