### Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns  # Statistical data visualization
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import warnings
# Silence every warning for cleaner notebook output (this also hides deprecation notices)
warnings.filterwarnings('ignore')

### Load the Datasets

In [None]:
# Source files for the two raw datasets
deaths_csv = "COVID DEATHS.csv"
vaccinations_csv = "COVID_VACCINATIONS.csv"

# Parse each CSV into its own DataFrame
death = pd.read_csv(deaths_csv)
vaccine = pd.read_csv(vaccinations_csv)

### Exploring dataset

In [None]:
# Print the (rows, columns) shape of each raw dataset, deaths first
for frame in (death, vaccine):
    print(frame.shape)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appended a stray "None" line after each summary.
death.info()
vaccine.info()

In [None]:
## Lift pandas' display limits so DataFrames are shown in full, not truncated
from IPython.display import display
for option_name in ('display.max_columns', 'display.max_rows'):
    pd.set_option(option_name, None)

In [None]:
# Preview the first rows of the deaths dataset
death.head()

In [None]:

# Preview the first rows of the vaccinations dataset
vaccine.head()

## Observation: We have a lot of missing values

### Missing Values

In [None]:
### Checking Missing Values
# Heatmap of the null mask: one cell per (row, column) of the deaths data,
# with missing entries drawn in the contrasting colour.
plt.figure(figsize=(12, 6))
deaths_null_mask = death.isnull()
sns.heatmap(deaths_null_mask, cmap='viridis', cbar=False)
plt.title("Missing Values in COVID Deaths Dataset")
plt.show()


In [None]:

# Same null-mask heatmap for the vaccinations data
plt.figure(figsize=(12, 6))
vaccine_null_mask = vaccine.isnull()
sns.heatmap(vaccine_null_mask, cmap='plasma', cbar=False)
plt.title("Missing Values in COVID Vaccinations Dataset")
plt.show()


In [None]:

# Per-column share of missing values, in percent
missing_deaths = death.isnull().sum() / len(death) * 100
missing_vaccinations = vaccine.isnull().sum() / len(vaccine) * 100

# Keep only the columns that are more than 10% empty, worst first
high_missing_deaths = missing_deaths[missing_deaths > 10].sort_values(ascending=False)
high_missing_vaccinations = missing_vaccinations[missing_vaccinations > 10].sort_values(ascending=False)
high_missing_deaths, high_missing_vaccinations


In [None]:
# Outer-join the two datasets. No `on=` is given, so pandas joins on every
# column name the two frames have in common; rows found in only one frame
# are kept and padded with NaN.
combined = death.merge(vaccine, how='outer')

In [None]:
# Number of missing values in each column of the merged frame
combined.isnull().sum()

### Data Cleaning

In [None]:
# Percentage of missing values per column of the merged frame
null_perc = combined.isna().sum() / len(combined) * 100

In [None]:
# Show the missing-value percentages, most incomplete columns first
null_perc.sort_values(ascending=False)

In [None]:
# Absolute null count per column, largest first.
# (Columns that are at least 45% empty are dropped by remove_null_columns below.)
null_columns=combined.isnull().sum().sort_values(ascending=False)
null_columns

In [None]:
def remove_null_columns(data, perc=0.45):
    """
    Return a copy of ``data`` without its mostly-empty columns.

    Parameters:
    data (DataFrame): Input frame; it is left unmodified
    perc (float): Null fraction (between 0 and 1) at or above which a column
        is dropped. Defaults to 0.45, the threshold used by this notebook.

    Returns:
    DataFrame: A new frame containing only the columns whose null fraction
        is below ``perc``.

    Raises:
    ValueError: If ``perc`` is not within [0, 1] (e.g. 45 passed instead of 0.45).
    """
    if not 0 <= perc <= 1:
        raise ValueError(f"perc must be a fraction between 0 and 1, got {perc!r}")

    # Fraction of null entries in each column
    null_fraction = data.isnull().sum() / len(data)
    sparse_cols = list(null_fraction[null_fraction >= perc].index)

    # Explicit copy so the result never shares data with the caller's frame
    return data.drop(columns=sparse_cols).copy()

In [None]:
# Working copy of the merged data without the mostly-empty columns
combined1=remove_null_columns(combined)

In [None]:
# Percentage of missing values per remaining column. The denominator is the
# row count of combined1 itself (it used to be len(combined), a different
# frame), so the figure stays right even if the two frames ever differ in length.
null_perc_1 = combined1.isnull().sum() / len(combined1) * 100

In [None]:
# Remaining missing-value percentages, most incomplete columns first
null_perc_1.sort_values(ascending=False)

In [None]:
# Discard rows that have no continent value
# NOTE(review): presumably these are aggregate rows (e.g. world/region totals) — confirm against the data
combined1 = combined1.dropna(subset=['continent'])


In [None]:
# (rows, columns) left after dropping sparse columns and continent-less rows
combined1.shape

In [None]:
# Print every fully duplicated row of each raw dataset (empty frame = no duplicates)
for frame in (death, vaccine):
    duplicate_rows = frame[frame.duplicated()]
    print(duplicate_rows)

### Data Transformation

In [None]:
# Preview the cleaned, merged data
combined1.head()

In [None]:
# Same preview as the previous cell (repeated)
combined1.head()

In [None]:
# List the columns that survived cleaning
combined1.columns

### Columns to keep

In [None]:
# List of columns to retain
columns_to_keep = [ 'continent', 'location', 'date', 'population',
                   'total_cases', 'new_cases', 'total_deaths', 'new_deaths',
                   'reproduction_rate', 'stringency_index', 'population_density',
                   'median_age', 'gdp_per_capita', 'diabetes_prevalence',
                   'hospital_beds_per_thousand', 'life_expectancy']

# Dropping redundant columns.
# .copy() makes the selection an independent frame, so the column assignments
# done later on combined1 do not raise SettingWithCopyWarning.
combined1 = combined1[columns_to_keep].copy()

In [None]:
# Share of missing values per retained column, in percent
null_share = combined1.isnull().sum() / len(combined1) * 100

# Keep only the columns that actually have gaps, worst first
missing_percentage = null_share[null_share > 0].sort_values(ascending=False)

# Report the result
print("Percentage of Missing Values in Retained Columns:")
print(missing_percentage)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Columns whose distribution is inspected before choosing an imputation strategy
missing_cols = ['reproduction_rate', 'stringency_index', 'hospital_beds_per_thousand',
                'total_deaths', 'gdp_per_capita', 'median_age', 'diabetes_prevalence',
                'total_cases', 'population_density', 'life_expectancy', 'new_cases', 'new_deaths']

# Two 4x3 grids, one panel per column: first histograms with a KDE curve,
# then boxplots for spotting skew and outliers.
panel_kinds = (
    (lambda values: sns.histplot(values, bins=30, kde=True), 'Distribution of'),
    (lambda values: sns.boxplot(y=values), 'Boxplot of'),
)
for draw_panel, title_prefix in panel_kinds:
    plt.figure(figsize=(15, 10))
    for position, col in enumerate(missing_cols, start=1):
        plt.subplot(4, 3, position)
        draw_panel(combined1[col])
        plt.title(f'{title_prefix} {col}')
    plt.tight_layout()
    plt.show()


In [None]:
# Handling missing values based on distribution

# 1️⃣ Mean Imputation for Normally Distributed Features
mean_impute_cols = ['reproduction_rate', 'stringency_index', 'median_age', 'life_expectancy']
combined1[mean_impute_cols] = combined1[mean_impute_cols].fillna(combined1[mean_impute_cols].mean())

# 2️⃣ Median Imputation for Skewed Features
median_impute_cols = ['hospital_beds_per_thousand', 'gdp_per_capita', 'diabetes_prevalence', 'population_density']
combined1[median_impute_cols] = combined1[median_impute_cols].fillna(combined1[median_impute_cols].median())

# 3️⃣ Forward/Backward Fill for Time-Series Features
# The fill is done per location so one country's counts never spill into the
# rows of the next country. ffill()/bfill() replace the deprecated
# fillna(method=...) spelling.
# NOTE(review): assumes rows are in chronological order within each location — confirm.
time_series_fill_cols = ['total_cases', 'total_deaths', 'new_cases', 'new_deaths']
combined1[time_series_fill_cols] = (
    combined1.groupby('location', dropna=False)[time_series_fill_cols]
    .transform(lambda series: series.ffill().bfill())
)
# A location with no value at all in a column cannot be filled from its own
# rows; fall back to the previous frame-wide fill so no gaps remain.
combined1[time_series_fill_cols] = combined1[time_series_fill_cols].ffill().bfill()

# Verify that missing values are handled
remaining_nulls = combined1.isnull().sum()
print("Remaining Missing Values After Imputation:")
print(remaining_nulls[remaining_nulls > 0])


In [None]:
# Full per-column null counts after imputation
combined1.isnull().sum()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def univariate_analysis(df, column):
    """
    Draw a histogram (with KDE curve) and a boxplot of one column, side by side.

    Parameters:
    df (DataFrame): Data to plot
    column (str): Name of the column to visualise
    """
    values = df[column]
    _, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: distribution
    sns.histplot(values, bins=30, kde=True, ax=hist_ax)
    hist_ax.set_title(f"Distribution of {column}")

    # Right panel: spread and outliers
    sns.boxplot(y=values, ax=box_ax)
    box_ax.set_title(f"Boxplot of {column}")

    plt.show()

# Example usage: point column_name at another feature to inspect it
column_name = 'gdp_per_capita'
univariate_analysis(combined1, column_name)


### What are the total cases, deaths, tests, vaccinations based on the year

In [None]:
import pandas as pd
import plotly.express as px

def plot_yearly_univariate(df, column, title, cumulative=None):
    """
    Function to generate a line plot for a single column over the years.

    Parameters:
    df (DataFrame): The dataset containing COVID-19 data. Its 'date' column is
        converted to datetime and a 'year' column is added IN PLACE.
    column (str): Column to plot
    title (str): Title of the plot
    cumulative (bool | None): Whether ``column`` holds a running total. Running
        totals are reduced to each location's yearly maximum before being added
        up; plain daily values are summed directly. ``None`` (default) treats
        columns named ``total_*`` as running totals.
    """
    # Convert date to datetime format and extract year.
    # These writes deliberately land on the caller's frame: later cells of this
    # notebook read the 'year' column created here.
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df['date'].dt.year

    if cumulative is None:
        # NOTE(review): assumes total_* columns are cumulative counts, matching
        # the per-location max() used by the notebook's other charts — confirm.
        cumulative = column.startswith('total_')

    if cumulative:
        # Adding a running total over every daily row would count the same
        # cases many times; take each location's highest value per year first.
        per_location_peak = df.groupby(['year', 'location'])[column].max()
        yearly_data = per_location_peak.groupby(level='year').sum().reset_index()
    else:
        yearly_data = df.groupby('year')[column].sum().reset_index()

    # Create a line plot for a single column
    fig = px.line(yearly_data, x='year', y=column,
                  markers=True, title=title, template="plotly_dark")

    fig.update_layout(yaxis_title="Count", xaxis_title="Year")
    fig.show()

# Call function separately for each metric
plot_yearly_univariate(combined1, 'total_cases', "Yearly Trend of Total Cases")


In [None]:
# Yearly line plot for the new_cases column
plot_yearly_univariate(combined1, 'new_cases', "Yearly Trend of New Cases")


In [None]:
# Yearly line plot for the total_deaths column
plot_yearly_univariate(combined1, 'total_deaths', "Yearly Trend of Total Deaths")


In [None]:
# Yearly line plot for the new_deaths column
plot_yearly_univariate(combined1, 'new_deaths', "Yearly Trend of New Deaths")


### Visualization of Top 10 Countries by COVID-19 Cases Over Years

In [None]:
import pandas as pd
import plotly.express as px

def plot_top_countries_cases(df, year, top_n=10):
    """
    Function to visualize the top countries by total COVID-19 cases for a given year.

    Parameters:
    df (DataFrame): The dataset containing COVID-19 data. Needs 'location' and
        'total_cases', plus either a 'year' column or a 'date' column.
    year (int): The year for which to plot the top countries
    top_n (int): How many countries to show. Defaults to 10.
    """
    # Use an existing 'year' column when there is one; otherwise derive the
    # year from 'date' so the function also works on a frame that has not
    # been given a 'year' column yet.
    if 'year' in df.columns:
        row_years = df['year']
    else:
        row_years = pd.to_datetime(df['date']).dt.year

    # Filter data for the selected year
    df_year = df[row_years == year]

    # Highest total_cases value each country (location) reached in that year
    country_cases = df_year.groupby('location')['total_cases'].max().reset_index()

    # Select the countries with the highest cases
    top_countries = country_cases.nlargest(top_n, 'total_cases')

    # Create a bar chart using Plotly
    fig = px.bar(top_countries, x='location', y='total_cases',
                 text='total_cases', title=f"Top {top_n} Countries by COVID-19 Cases in {year}",
                 labels={'total_cases': 'Total Cases', 'location': 'Country'},
                 template="plotly_dark", color='total_cases')

    fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
    fig.update_layout(xaxis_title="Country", yaxis_title="Total Cases")
    fig.show()

# Example: Plot for different years
plot_top_countries_cases(combined1, 2020)



In [None]:
# Top-10 bar chart for 2021
plot_top_countries_cases(combined1, 2021)


In [None]:
# Top-10 bar chart for 2022
plot_top_countries_cases(combined1, 2022)


In [None]:
# Top-10 bar chart for 2023
# NOTE(review): the chart is empty if the dataset holds no 2023 rows — confirm coverage
plot_top_countries_cases(combined1, 2023)

### Relationship between GDP and cases

In [None]:
def plot_gdp_vs_cases(df):
    """
    Scatter total cases against GDP per capita (log-scaled x axis).

    One marker is drawn per row of ``df``, coloured by continent, sized by
    population and labelled with the location on hover.
    """
    scatter_options = dict(
        x='gdp_per_capita',
        y='total_cases',
        color='continent',
        size='population',
        title="GDP per Capita vs Total Cases",
        hover_name='location',
        log_x=True,
        template="plotly_dark",
    )
    px.scatter(df, **scatter_options).show()

plot_gdp_vs_cases(combined1)


 ### Impact of Population Density on COVID-19 Spread

In [None]:
def plot_population_density_vs_cases(df):
    """
    Scatter total cases against population density (log-scaled x axis).

    One marker per row of ``df``, coloured by continent; hovering shows the location.
    """
    density_chart = px.scatter(
        df,
        x='population_density',
        y='total_cases',
        color='continent',
        hover_name='location',
        log_x=True,
        title="Population Density vs Total Cases",
        template="plotly_dark",
    )
    density_chart.show()

plot_population_density_vs_cases(combined1)

# Hypothesis: Densely populated areas may have higher COVID-19 cases.

### Correlation Between Stringency Index & Cases Over Time
🔍 Hypothesis: Stricter lockdown policies may reduce the number of cases.

In [None]:
def plot_stringency_vs_cases(df):
    """
    Scatter new cases against the stringency index.

    One marker per row of ``df``, coloured by continent; hovering shows the location.
    """
    axes = {'x': 'stringency_index', 'y': 'new_cases'}
    styling = {
        'color': 'continent',
        'title': "Stringency Index vs New Cases",
        'hover_name': 'location',
        'template': "plotly_dark",
    }
    chart = px.scatter(df, **axes, **styling)
    chart.show()

plot_stringency_vs_cases(combined1)


# Treemap – Total Cases by Continent & Country
🔍 Hypothesis: Visualize which countries had the most cases within each continent.


In [None]:
def plot_treemap_cases(df):
    """
    Draw a continent -> country treemap sized and coloured by total cases.

    Parameters:
    df (DataFrame): Data with 'continent', 'location' and 'total_cases'
        columns; it may hold many rows per country.
    """
    # A treemap adds up `values` within each sector. Feeding it every daily row
    # would add the same running total over and over, so reduce the data to a
    # single value per country first (the per-location max, as the deaths bar
    # chart does).
    # NOTE(review): assumes total_cases is a cumulative count — confirm.
    country_totals = df.groupby(['continent', 'location'], as_index=False)['total_cases'].max()

    fig = px.treemap(country_totals, path=['continent', 'location'], values='total_cases',
                     title="Treemap of COVID-19 Cases by Continent & Country",
                     color='total_cases', template="plotly_dark")
    fig.show()

plot_treemap_cases(combined1)


## Bar Chart – Top 10 Countries by Total Deaths
🔍 Hypothesis: Identify which countries had the highest death tolls

In [None]:
def plot_top_10_countries_deaths(df):
    """
    Bar chart of the ten locations with the highest total_deaths value.

    Each location is represented by the maximum total_deaths found in its rows.
    """
    peak_deaths = df.groupby('location')['total_deaths'].max()
    top_countries = peak_deaths.reset_index().nlargest(10, 'total_deaths')

    chart = px.bar(
        top_countries,
        x='location',
        y='total_deaths',
        color='total_deaths',
        title="Top 10 Countries by COVID-19 Deaths",
        template="plotly_dark",
    )
    chart.show()

plot_top_10_countries_deaths(combined1)


 ###  Hospital Beds per Thousand vs Death Rate
🔍 Hypothesis: Countries with more hospital beds per capita may have lower fatality rates.

In [None]:
def plot_hospital_beds_vs_deaths(df):
    """
    Scatter total deaths against hospital beds per thousand people.

    One marker per row of ``df``, coloured by continent; hovering shows the
    location. The y axis is the absolute total_deaths value, not a rate.
    """
    chart_options = {
        'x': 'hospital_beds_per_thousand',
        'y': 'total_deaths',
        'color': 'continent',
        'title': "Hospital Beds per Thousand vs Deaths",
        'hover_name': 'location',
        'template': "plotly_dark",
    }
    px.scatter(df, **chart_options).show()

plot_hospital_beds_vs_deaths(combined1)
