# Project 3: COVID-19 Pandemic Data Analysis

In [17]:
#EDA
import pandas as pd
import statsmodels.api as sm
import numpy as np
from sklearn.impute import SimpleImputer
import plotly.express as px

# ---------------------------------------------------------------------------
# Load the dataset, inspect it, fill missing values and report duplicates.
# ---------------------------------------------------------------------------

# Single place to change the input location when running outside Colab.
DATA_PATH = "/content/owid-covid-data.csv"

# Numeric columns whose missing values are replaced by the column mean.
NUMERIC_IMPUTE_COLS = [
    "total_cases",
    "total_vaccinations",
    "population",
    "excess_mortality_cumulative_absolute",
    "excess_mortality_cumulative",
    "excess_mortality",
    "excess_mortality_cumulative_per_million",
]

# on_bad_lines='warn' makes pandas warn about and skip malformed rows instead
# of aborting the whole load.
df = pd.read_csv(DATA_PATH, encoding='latin-1', engine='python', on_bad_lines='warn')

pd.set_option('display.max_columns', None)
print(df.head(5))

# Getting The Shape Of The DataSet
print(df.shape)
print(df.dtypes)

for col in df.columns:
  print(col)

# Replacing Missing Values Using Simple Imputer
# NOTE(review): the mean is taken over every row of the file, so a gap in one
# location is filled with a figure pooled across all locations and dates.
# It is kept because the regression cell below uses total_vaccinations and
# total_cases, but a per-location forward fill would be more defensible.
imputer = SimpleImputer(strategy='mean')
df[NUMERIC_IMPUTE_COLS] = imputer.fit_transform(df[NUMERIC_IMPUTE_COLS])

# A continent is a property of the location and cannot be estimated from the
# most common value, so gaps are labelled explicitly rather than filled with
# the mode (which would assign those rows to a continent they do not have).
# Presumably the affected rows are OWID's aggregate rows (World, income
# groups, ...) - TODO confirm against the raw file.
df["continent"] = df["continent"].fillna("Unknown")
print(df.isnull().sum())

#Checking For Duplicates
# df1 stays the per-row boolean mask; the count is what answers the question.
df1 = df.duplicated()
print(f"Duplicate rows: {int(df1.sum())} of {len(df1)}")

# Dataset Description
print(df.describe())

  iso_code continent     location        date  total_cases  new_cases  \
0      AFG      Asia  Afghanistan  2020-01-05          0.0        0.0   
1      AFG      Asia  Afghanistan  2020-01-06          0.0        0.0   
2      AFG      Asia  Afghanistan  2020-01-07          0.0        0.0   
3      AFG      Asia  Afghanistan  2020-01-08          0.0        0.0   
4      AFG      Asia  Afghanistan  2020-01-09          0.0        0.0   

   new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  \
0                 NaN           0.0         0.0                  NaN   
1                 NaN           0.0         0.0                  NaN   
2                 NaN           0.0         0.0                  NaN   
3                 NaN           0.0         0.0                  NaN   
4                 NaN           0.0         0.0                  NaN   

   total_cases_per_million  new_cases_per_million  \
0                      0.0                    0.0   
1                     

In [13]:
# 1. Which countries had the highest infection and death rates?

import plotly.express as px

TOP_N = 20  # number of countries shown in each ranking


def top_locations(frame, metric, top_n=TOP_N):
    """Rank countries by the largest value of ``metric`` they reported.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``iso_code``, ``location``, ``continent`` and ``metric``.
    metric : str
        Name of the column to rank by.
    top_n : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        One row per (location, continent) with its peak ``metric`` value,
        sorted from highest to lowest and truncated to ``top_n`` rows.
    """
    # OWID marks its aggregate rows (World, continents, income groups) with
    # synthetic "OWID_" iso codes; they are not countries and must not be ranked.
    # NOTE(review): a few real territories also carry an "OWID_" code
    # (e.g. Kosovo) and are dropped too - confirm this is acceptable.
    is_aggregate = frame["iso_code"].astype(str).str.startswith("OWID_")
    peaks = (
        frame.loc[~is_aggregate]
        .groupby(["location", "continent"], as_index=False)[metric]
        .max()
        .dropna(subset=[metric])
    )
    return peaks.sort_values(by=metric, ascending=False).head(top_n)


# The per-million columns are cumulative, so each location's maximum is its
# overall rate. Counting rows with .size() only measured how many days of data
# each location has, which is why both charts used to look identical.

#For infection rates
infection_rates = top_locations(df, "total_cases_per_million")
inny = px.bar(
    infection_rates,
    x='location',
    y='total_cases_per_million',
    color='continent',
    title='Covid 19 Infection Rates Trend By Countries',
    labels={'location': 'Country', 'total_cases_per_million': 'Total cases per million people'},
)
# Colouring by continent regroups the bars; restore highest-to-lowest order.
inny.update_xaxes(categoryorder='total descending')
inny.show()

#For death rates
death_rates = top_locations(df, "total_deaths_per_million")
inny = px.bar(
    death_rates,
    x='location',
    y='total_deaths_per_million',
    color='continent',
    title='Covid 19 Death Rates Trend By Countries',
    labels={'location': 'Country', 'total_deaths_per_million': 'Total deaths per million people'},
)
inny.update_xaxes(categoryorder='total descending')
inny.show()

# Interpretation: the earlier reading ("the High-Income Countries in Africa")
# came from bars that measured the number of rows per location rather than
# cases or deaths. Re-run this cell and read the answer off the leftmost bars
# of the two corrected charts above.

In [18]:
# 5. How did vaccination rates influence case declines?

# Ordinary least squares is used here to measure the association
# (relationship) between the two variables.

# Response: cumulative cases. Predictor: cumulative vaccinations plus an
# intercept column added by add_constant.
Y = df["total_cases"]
X = sm.add_constant(df["total_vaccinations"])

model = sm.OLS(endog=Y, exog=X).fit()
print(model.summary())


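# According to the OLS regression results, the coefficient on total_vaccinations is about 0.0364:
# each additional vaccination is associated with roughly 0.0364 more cumulative cases.
# The association is positive, so this model shows no evidence that vaccinations reduced cases.
# However, both variables are cumulative totals that only grow over time, so they rise together
# regardless of any vaccine effect; this result alone cannot show that vaccination did not influence case decline.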

                            OLS Regression Results                            
Dep. Variable:            total_cases   R-squared:                       0.466
Model:                            OLS   Adj. R-squared:                  0.466
Method:                 Least Squares   F-statistic:                 3.744e+05
Date:                Sun, 27 Apr 2025   Prob (F-statistic):               0.00
Time:                        05:57:13   Log-Likelihood:            -8.0312e+06
No. Observations:              429435   AIC:                         1.606e+07
Df Residuals:                  429433   BIC:                         1.606e+07
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const              -1.309e+07   5.92

In [16]:
# Sanity check: show the total_cases column (pandas abbreviates long Series).
print(df.loc[:, "total_cases"])

0               0.0
1               0.0
2               0.0
3               0.0
4               0.0
            ...    
181575    1716165.0
181576    1717009.0
181577    1717009.0
181578    1717009.0
181579    1717009.0
Name: total_cases, Length: 181580, dtype: float64
