# COVID-19 Global Impact Dashboard

This notebook performs data preprocessing and exploratory data analysis (EDA) on COVID-19 global data.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style.
# set_theme() is seaborn's preferred entry point; sns.set() is only a legacy
# alias for it that the seaborn docs say may be removed in the future.
sns.set_theme(style='whitegrid')


In [None]:
# Load the dataset into `df` and preview the first rows.
# DATA_PATH points at the sample file; swap in the actual file path
# (for example data/covid_data.csv) when it is available.
DATA_PATH = 'data/sample_covid_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# Data info and summary
# info() prints its report directly (column names, dtypes, non-null counts),
# so it does not need to be the last line of the cell.
df.info()
# describe() is the cell's final expression, so the notebook renders the
# summary-statistics table as the cell output.
df.describe()

In [None]:
# Handle missing values
# Forward-fill within each location so that the last value of one location is
# never carried into the leading gaps of the next one. Rows are filled in
# their current order, so the file is assumed to be chronological within each
# location -- TODO confirm against the dataset.
fill_cols = [col for col in df.columns if col != 'location']
if 'location' in df.columns and fill_cols:
    df = df.copy()  # work on a copy so the originally loaded frame is not mutated
    df[fill_cols] = df.groupby('location')[fill_cols].ffill()
else:
    # No grouping key available: plain forward fill. DataFrame.ffill() replaces
    # the deprecated fillna(method='ffill') spelling.
    df = df.ffill()
# Drop rows that still have gaps (e.g. values missing at the start of a series).
df = df.dropna()

In [None]:
# Parse the 'date' column into pandas datetimes; the plots below use it as the x axis.
df = df.assign(date=pd.to_datetime(df['date']))

In [None]:
# Descriptive statistics for the new/total case and death columns.
key_columns = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths']
summary_stats = df[key_columns].describe()
print(summary_stats)

In [None]:
# Visualizations: daily new cases and new deaths over time.
# The frame can hold several locations per date (it is grouped by 'location'
# further down), and plotting it directly makes seaborn silently average across
# them and bootstrap a confidence band. Aggregate explicitly instead so the
# chart shows one total per date.
# NOTE(review): if the dataset also contains aggregate rows (e.g. a "World"
# location), filter them out first to avoid double counting -- confirm.
daily_totals = df.groupby('date', as_index=False)[['new_cases', 'new_deaths']].sum()

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=daily_totals, x='date', y='new_cases', label='New Cases', ax=ax)
sns.lineplot(data=daily_totals, x='date', y='new_deaths', label='New Deaths', ax=ax)
ax.set_title('Daily COVID-19 Cases and Deaths')
ax.set_xlabel('Date')
ax.set_ylabel('Count')
ax.legend()
fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap of the case and death columns.
corr_columns = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths']
corr_matrix = df[corr_columns].corr()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation between Key Variables')
plt.show()

## Country-wise Analysis

Compare total cases over time across the ten most affected countries, ranked by their peak total case count. Deaths are not yet plotted in this section.

In [None]:
# Select the ten locations with the highest peak total_cases and keep only their rows.
peak_cases = df.groupby('location')['total_cases'].max()
ranked_cases = peak_cases.sort_values(ascending=False)
top_countries = ranked_cases.head(10).index
is_top_country = df['location'].isin(top_countries)
df_top = df[is_top_country]

# One line per location showing total_cases over time.
fig, ax = plt.subplots(figsize=(14, 7))
sns.lineplot(data=df_top, x='date', y='total_cases', hue='location', ax=ax)
ax.set_title('Top 10 Countries by Total COVID-19 Cases Over Time')
ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.legend(title='Country')
fig.tight_layout()
plt.show()

## Vaccination Analysis

Explore vaccination progress over time in the top affected countries selected above.

In [None]:
# Plot total vaccinations over time for the top countries (df_top), but only
# when the dataset actually provides a 'total_vaccinations' column.
if 'total_vaccinations' in df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.lineplot(data=df_top, x='date', y='total_vaccinations', hue='location', ax=ax)
    ax.set_title('COVID-19 Vaccinations Over Time in Top Countries')
    ax.set_xlabel('Date')
    ax.set_ylabel('Total Vaccinations')
    # Same legend title as the total-cases chart instead of the raw column name.
    ax.legend(title='Country')
    fig.tight_layout()
    plt.show()
else:
    print("Vaccination data not available in the dataset.")

## Conclusion

- The analysis highlights the variation in COVID-19 impact across countries.
- Effective data cleaning and transformation steps are crucial.
- Insights from this analysis will guide the Power BI dashboard design.