In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime, timedelta
import warnings
# Silence every warning category for the whole notebook.
# NOTE(review): this also hides pandas chained-assignment and deprecation warnings
# that could point at real problems in the cells below — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Set visualization style: 'ggplot' matplotlib style sheet plus seaborn's viridis palette
plt.style.use('ggplot')
sns.set_palette("viridis")
# IPython magic (not plain Python): render matplotlib figures inside the notebook
%matplotlib inline

# Set Plotly to work in offline mode for Jupyter
# NOTE(review): connected=True is understood to load plotly.js from a CDN, which
# needs network access when the notebook is viewed — confirm against the plotly docs.
from plotly.offline import init_notebook_mode
init_notebook_mode(connected=True)

In [2]:
# Function to automatically download the latest data
def download_latest_data(url="https://covid.ourworldindata.org/data/owid-covid-data.csv",
                         local_path="owid-covid-data.csv"):
    """
    Load the COVID-19 dataset, preferring the remote copy over a local backup.

    Parameters
    ----------
    url : str, optional
        Location of the CSV to try first. Defaults to the Our World in Data
        URL this notebook has always used.
    local_path : str, optional
        CSV file to fall back to when reading ``url`` fails. Defaults to
        ``"owid-covid-data.csv"`` in the working directory.

    Returns
    -------
    pandas.DataFrame or None
        The parsed data, or ``None`` when neither source could be read.
        No exception is raised for ordinary read failures; the reason for
        each failure is printed instead.
    """
    try:
        print(f"Downloading latest data from {url}...")
        df = pd.read_csv(url)
    except Exception as download_error:
        # Best effort: report why the primary source failed and try the backup.
        print(f"Error downloading data: {download_error}")
    else:
        print(f"Data downloaded successfully on {datetime.now().strftime('%Y-%m-%d %H:%M')}")
        print(f"Dataset contains {df.shape[0]} rows and {df.shape[1]} columns")
        return df

    # Load from local backup if available
    try:
        print("Attempting to load from local file...")
        df = pd.read_csv(local_path)
    except Exception as local_error:
        # `except Exception` (not a bare `except`) so Ctrl-C still interrupts the
        # cell, and the reason for the failure is reported rather than swallowed.
        print(f"Could not load data from any source: {local_error}")
        return None

    print("Loaded from local file successfully")
    return df

In [None]:
# Download the latest data. download_latest_data() already falls back to the local
# 'owid-covid-data.csv' when the download fails, so its result is used as-is: reading
# the local file again here would throw away freshly downloaded data and would crash
# whenever no local copy exists.
covid_df = download_latest_data()

if covid_df is None or covid_df.empty:
    # The cleaning and analysis code that follows this section is not guarded against
    # a missing DataFrame, so stop here with one clear error instead of failing later
    # with an unrelated message.
    raise RuntimeError(
        "Failed to load data. Please check your internet connection or ensure "
        "'owid-covid-data.csv' is in your working directory."
    )

# Display basic information about the dataset
print("\n=== DATASET INFORMATION ===")
print("Dataset shape:", covid_df.shape)
print("\nFirst few rows:")
display(covid_df.head())

print("\nColumns:", covid_df.columns.tolist())
print("\nData types:\n", covid_df.dtypes)

# Check for missing values: per-column null count and its share of all rows,
# showing the 15 columns with the highest share
print("\n=== MISSING VALUES ANALYSIS ===")
missing_data = covid_df.isnull().sum()
missing_percentage = (missing_data / len(covid_df)) * 100
missing_info = pd.DataFrame({'Missing Values': missing_data, 'Percentage': missing_percentage})
display(missing_info.sort_values('Percentage', ascending=False).head(15))

# Basic statistics for numerical columns
print("\n=== BASIC STATISTICS ===")
display(covid_df.describe())

# Data Cleaning
print("\n=== DATA CLEANING ===")

# Convert date column to datetime
# Replace the 'date' column with the result of pd.to_datetime
covid_df['date'] = pd.to_datetime(covid_df['date'])

# If the frame is indexed by dates, move that index back into a regular column
if isinstance(covid_df.index, pd.DatetimeIndex):
    covid_df = covid_df.reset_index()

# Columns used in ratio calculations; values that cannot be parsed become NaN
numeric_cols = ['total_deaths', 'total_cases', 'population', 'people_vaccinated']
for col in (name for name in numeric_cols if name in covid_df.columns):
    covid_df[col] = pd.to_numeric(covid_df[col], errors='coerce')
# Now proceed with your calculations
if 'total_deaths' in covid_df.columns and 'total_cases' in covid_df.columns:
    # Everything indented under this guard (cleaning, plots, summary, export) only
    # runs when both columns are present.
    #
    # death_rate is deliberately not computed here: it is not one of key_columns,
    # so the column selection below would discard it. It is derived after the
    # missing-value handling instead.

    # Filter for countries (exclude continents and income groups)
    exclude_locations = ['World', 'European Union', 'International', 'Asia', 'Europe', 'North America',
                         'South America', 'Africa', 'Oceania', 'High income', 'Upper middle income',
                         'Lower middle income', 'Low income']
    covid_df = covid_df[~covid_df['location'].isin(exclude_locations)]
    print(f"After filtering locations, dataset has {covid_df.shape[0]} rows")

    # Select key columns for analysis
    key_columns = ['date', 'location', 'continent', 'population', 'total_cases', 'new_cases',
                   'total_deaths', 'new_deaths', 'total_cases_per_million', 'total_deaths_per_million',
                   'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated',
                   'new_vaccinations', 'population_density', 'median_age', 'gdp_per_capita',
                   'hospital_beds_per_thousand', 'life_expectancy']

    # Keep only the columns that exist in the dataset
    existing_columns = [col for col in key_columns if col in covid_df.columns]
    # .copy() makes covid_df an independent frame, so the column assignments that
    # follow modify it directly rather than a slice of the unfiltered data.
    covid_df = covid_df[existing_columns].copy()
    print(f"Selected {len(existing_columns)} key columns for analysis")
    
    # Handle missing values - for time-series data, forward fill then backward fill
    # WITHIN each country. Both fills are grouped by location: an ungrouped bfill
    # would copy the next country's first value into the tail/gaps of the previous one.
    covid_df = covid_df.sort_values(['location', 'date'])
    for col in ['total_cases', 'total_deaths', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated']:
        if col in covid_df.columns:
            forward_filled = covid_df.groupby('location')[col].ffill()
            covid_df[col] = forward_filled.groupby(covid_df['location']).bfill()

    # Fill remaining missing values with 0 for numerical columns
    numeric_cols = covid_df.select_dtypes(include=[np.number]).columns
    covid_df[numeric_cols] = covid_df[numeric_cols].fillna(0)

    # Calculate additional metrics
    if 'total_deaths' in covid_df.columns and 'total_cases' in covid_df.columns:
        death_rate = covid_df['total_deaths'] / covid_df['total_cases']
        # x/0 gives +/-inf and 0/0 gives NaN; both mean "no cases recorded", so the
        # rate is set to 0 (NaN must be handled before capping, otherwise it would
        # fail the "<= 0.2" comparison and be reported as the 20% cap).
        death_rate = death_rate.replace([np.inf, -np.inf], 0).fillna(0)
        # Cap death rate at 0.2 (20%) to remove outliers
        covid_df['death_rate'] = death_rate.clip(upper=0.2)

    if 'people_vaccinated' in covid_df.columns and 'population' in covid_df.columns:
        covid_df['vaccination_rate'] = covid_df['people_vaccinated'] / covid_df['population']
        covid_df['vaccination_rate'] = covid_df['vaccination_rate'].replace([np.inf, -np.inf], 0)

    # Get the latest data for each country: a row is kept only when its date equals
    # the most recent date of ITS OWN location (not any location's most recent date).
    latest_dates = covid_df.groupby('location')['date'].max()
    is_latest_row = covid_df['date'] == covid_df['location'].map(latest_dates)
    latest_data = covid_df[is_latest_row].copy()
    print(f"Latest data contains {latest_data.shape[0]} countries/regions")
    
    # Exploratory Data Analysis (EDA)
    print("\n=== EXPLORATORY DATA ANALYSIS ===")

    # Sum the daily counts of all locations for each date
    daily_metrics = ['new_cases', 'new_deaths']
    global_daily = (
        covid_df.groupby('date')
        .agg({metric: 'sum' for metric in daily_metrics})
        .reset_index()
    )

    # 7-day rolling mean of each daily series (the first six rows are NaN)
    for metric in daily_metrics:
        global_daily[f'{metric}_7day_avg'] = global_daily[metric].rolling(window=7).mean()

    # Two stacked panels: cases on top, deaths underneath
    fig = make_subplots(rows=2, cols=1, subplot_titles=('Global Daily New COVID-19 Cases', 'Global Daily New COVID-19 Deaths'))

    # One entry per trace, in legend order:
    # (subplot row, source column, legend label, line styling, extra Scatter options)
    trace_specs = [
        (1, 'new_cases', 'Daily Cases', dict(color='blue', width=1), {'opacity': 0.3}),
        (1, 'new_cases_7day_avg', '7-Day Avg', dict(color='red', width=2), {}),
        (2, 'new_deaths', 'Daily Deaths', dict(color='grey', width=1), {'opacity': 0.3}),
        (2, 'new_deaths_7day_avg', '7-Day Avg', dict(color='black', width=2), {}),
    ]
    for panel_row, source_col, label, line_style, extra in trace_specs:
        fig.add_trace(
            go.Scatter(x=global_daily['date'], y=global_daily[source_col],
                       name=label, mode='lines', line=line_style, **extra),
            row=panel_row, col=1
        )

    fig.update_layout(height=600, showlegend=True, title_text="Global COVID-19 Trends")
    fig.show()
    
    # Country comparison - the ten largest locations by absolute and by per-million case counts
    if 'total_cases' in latest_data.columns:
        top_countries_cases = (
            latest_data[['location', 'total_cases', 'total_deaths']]
            .nlargest(10, 'total_cases')
        )
        top_countries_cases_per_million = (
            latest_data[['location', 'total_cases_per_million', 'total_deaths_per_million']]
            .nlargest(10, 'total_cases_per_million')
        )

        # Side-by-side horizontal bar charts, one ranking per panel
        fig = make_subplots(rows=1, cols=2, subplot_titles=('Top 10 Countries by Total Cases', 'Top 10 Countries by Cases per Million'))

        # (ranking table, value column, trace name, bar colour) for panels 1 and 2
        bar_panels = [
            (top_countries_cases, 'total_cases', 'Total Cases', 'lightblue'),
            (top_countries_cases_per_million, 'total_cases_per_million', 'Cases per Million', 'lightcoral'),
        ]
        for panel_col, (ranking, value_col, label, colour) in enumerate(bar_panels, start=1):
            fig.add_trace(
                go.Bar(x=ranking[value_col], y=ranking['location'],
                       orientation='h', name=label, marker_color=colour),
                row=1, col=panel_col
            )

        fig.update_layout(height=500, showlegend=False, title_text="COVID-19 Cases by Country")
        fig.show()
    
    # Death Rate Analysis
    if 'death_rate' in latest_data.columns:
        # Keep locations with more than 1000 total cases and a non-zero death rate
        has_enough_cases = latest_data['total_cases'] > 1000
        has_nonzero_rate = latest_data['death_rate'] > 0
        significant_cases = latest_data[has_enough_cases & has_nonzero_rate]

        # Ten highest death rates among those locations
        ranking_cols = ['location', 'death_rate', 'total_cases', 'total_deaths']
        top_death_rates = significant_cases.nlargest(10, 'death_rate')[ranking_cols]

        # Horizontal bar chart, each bar labelled with its rate as a percentage
        rate_labels = [f'{rate:.2%}' for rate in top_death_rates['death_rate']]
        death_rate_bars = go.Bar(
            x=top_death_rates['death_rate'],
            y=top_death_rates['location'],
            orientation='h',
            marker_color='red',
            text=rate_labels,
            textposition='auto'
        )
        fig = go.Figure()
        fig.add_trace(death_rate_bars)
        fig.update_layout(
            title='Top 10 Countries by Death Rate (min 1000 cases)',
            xaxis_title='Death Rate',
            yaxis_title='Country',
            height=500
        )
        fig.show()

        # Bubble chart of death rate against median age; bubble size is total cases
        if 'median_age' in significant_cases.columns:
            fig = px.scatter(
                significant_cases,
                x='median_age',
                y='death_rate',
                size='total_cases',
                color='continent',
                hover_name='location',
                log_x=False,
                size_max=60,
                title='Death Rate vs. Median Age by Country'
            )
            fig.show()
    
    # Vaccination Analysis
    if 'people_vaccinated' in covid_df.columns:
        # Global vaccination progress. Only aggregate the vaccination columns that are
        # actually present: the guard above checks 'people_vaccinated' alone, and
        # naming an absent column in the aggregation would raise a KeyError.
        vaccination_cols = [col for col in ('new_vaccinations', 'people_vaccinated', 'people_fully_vaccinated')
                            if col in covid_df.columns]
        global_vaccination = (
            covid_df.groupby('date')
            .agg({col: 'sum' for col in vaccination_cols})
            .reset_index()
        )

        has_daily_doses = 'new_vaccinations' in global_vaccination.columns
        has_fully_vaccinated = 'people_fully_vaccinated' in global_vaccination.columns

        # Calculate 7-day average for new vaccinations
        if has_daily_doses:
            global_vaccination['new_vaccinations_7day_avg'] = global_vaccination['new_vaccinations'].rolling(window=7).mean()

        # Create interactive vaccination charts (panel 1: daily doses, panel 2: cumulative people)
        fig = make_subplots(rows=2, cols=1, subplot_titles=('Global Daily New Vaccinations', 'Global Cumulative Vaccinations'))

        # New vaccinations (panel is left empty when the daily column is unavailable)
        if has_daily_doses:
            fig.add_trace(
                go.Scatter(x=global_vaccination['date'], y=global_vaccination['new_vaccinations'],
                           name='Daily Vaccinations', mode='lines', line=dict(color='lightgreen', width=1), opacity=0.3),
                row=1, col=1
            )

            fig.add_trace(
                go.Scatter(x=global_vaccination['date'], y=global_vaccination['new_vaccinations_7day_avg'],
                           name='7-Day Average', mode='lines', line=dict(color='darkgreen', width=2)),
                row=1, col=1
            )

        # Cumulative vaccinations
        fig.add_trace(
            go.Scatter(x=global_vaccination['date'], y=global_vaccination['people_vaccinated'],
                       name='At Least One Dose', mode='lines', line=dict(color='blue', width=2)),
            row=2, col=1
        )

        if has_fully_vaccinated:
            fig.add_trace(
                go.Scatter(x=global_vaccination['date'], y=global_vaccination['people_fully_vaccinated'],
                           name='Fully Vaccinated', mode='lines', line=dict(color='purple', width=2)),
                row=2, col=1
            )

        fig.update_layout(height=600, showlegend=True, title_text="Global COVID-19 Vaccination Progress")
        fig.show()

        # Top 10 countries by vaccination rate
        if 'vaccination_rate' in latest_data.columns:
            # .assign returns a new frame, so the percentage column is not written
            # onto a slice of latest_data.
            top_vaccination = (
                latest_data.nlargest(10, 'vaccination_rate')[['location', 'vaccination_rate', 'people_vaccinated', 'population']]
                .assign(pct_vaccinated=lambda ranking: ranking['vaccination_rate'] * 100)
            )

            fig = go.Figure()
            fig.add_trace(go.Bar(
                x=top_vaccination['pct_vaccinated'],
                y=top_vaccination['location'],
                orientation='h',
                marker_color='green',
                text=[f'{pct:.1f}%' for pct in top_vaccination['pct_vaccinated']],
                textposition='auto'
            ))

            fig.update_layout(
                title='Top 10 Countries by Vaccination Rate',
                xaxis_title='Percentage of Population Vaccinated (%)',
                yaxis_title='Country',
                height=500
            )
            fig.show()
    
    # Geographic Visualization
    print("\n=== GEOGRAPHIC VISUALIZATION ===")

    # One world map per entry, drawn only when its colour column is available:
    # (colour column, extra hover columns, colour scale, map title)
    map_specs = [
        ("total_cases_per_million", ["total_cases", "total_deaths", "population"],
         px.colors.sequential.Plasma, "Global COVID-19 Cases per Million"),
        ("vaccination_rate", ["people_vaccinated", "population"],
         px.colors.sequential.Viridis, "Global COVID-19 Vaccination Rate"),
    ]
    for color_col, hover_cols, color_scale, map_title in map_specs:
        if color_col not in latest_data.columns:
            continue
        # Locations are matched to map regions by their country name
        fig = px.choropleth(
            latest_data,
            locations="location",
            locationmode='country names',
            color=color_col,
            hover_name="location",
            hover_data=hover_cols,
            color_continuous_scale=color_scale,
            title=map_title
        )
        fig.show()
    
    # Key Insights and Conclusion
    print("\n=== KEY INSIGHTS AND CONCLUSION ===")

    # Calculate key global metrics from the per-location latest snapshot
    total_cases_global = latest_data['total_cases'].sum()
    total_deaths_global = latest_data['total_deaths'].sum()

    # The vaccination rate needs both columns and a non-zero population total;
    # otherwise it stays 0 and its line is omitted from the summary below
    # (dividing by a zero total would print "inf%" or silently yield NaN).
    global_vaccination_rate = 0
    if 'people_vaccinated' in latest_data.columns and 'population' in latest_data.columns:
        total_vaccinated_global = latest_data['people_vaccinated'].sum()
        global_population = latest_data['population'].sum()
        if global_population > 0:
            global_vaccination_rate = total_vaccinated_global / global_population

    print("GLOBAL COVID-19 SUMMARY")
    print("=======================")
    print(f"Total Cases: {total_cases_global:,.0f}")
    print(f"Total Deaths: {total_deaths_global:,.0f}")
    if global_vaccination_rate > 0:
        print(f"Global Vaccination Rate: {global_vaccination_rate:.2%}")

    # Top 3 countries by different metrics (metric label -> list of location names)
    metrics_data = {}

    if 'total_cases' in latest_data.columns:
        metrics_data['Total Cases'] = latest_data.nlargest(3, 'total_cases')['location'].tolist()

    if 'total_cases_per_million' in latest_data.columns:
        metrics_data['Cases per Million'] = latest_data.nlargest(3, 'total_cases_per_million')['location'].tolist()

    # significant_cases is created by the death-rate analysis, which runs under the
    # same 'death_rate' column check; the `and` keeps it from being read otherwise.
    if 'death_rate' in latest_data.columns and len(significant_cases) > 0:
        metrics_data['Death Rate'] = significant_cases.nlargest(3, 'death_rate')['location'].tolist()

    if 'vaccination_rate' in latest_data.columns:
        metrics_data['Vaccination Rate'] = latest_data.nlargest(3, 'vaccination_rate')['location'].tolist()

    print("\nTOP COUNTRIES BY METRIC:")
    for metric, countries in metrics_data.items():
        print(f"{metric}: {', '.join(countries)}")

    # Additional insights. These are fixed statements, not values computed from the data above.
    print("\nKEY INSIGHTS:")
    print("1. The pandemic has affected countries differently based on factors like population density, healthcare capacity, and government response.")
    print("2. Vaccination rates vary significantly across countries, with some nations achieving high coverage while others lag behind.")
    print("3. Death rates tend to be higher in countries with older populations and limited healthcare resources.")
    print("4. The data shows waves of infection corresponding to different variants and seasons.")
    
    # Save the cleaned data for future use. Saving is best effort: a failure is
    # reported together with its cause, but does not abort the notebook.
    try:
        covid_df.to_csv('cleaned_covid_data.csv', index=False)
        print("\nCleaned data saved to 'cleaned_covid_data.csv'")
    except Exception as save_error:
        # `except Exception` rather than a bare `except`, so a keyboard interrupt
        # still stops the cell and the reason for the failure is visible.
        print(f"\nCould not save cleaned data to file: {save_error}")

# Printed at top level, i.e. outside the column guard that wraps the analysis above,
# so this banner appears even when that guarded analysis was skipped.
print("\n=== ANALYSIS COMPLETE ===")

Downloading latest data from https://covid.ourworldindata.org/data/owid-covid-data.csv...
Error downloading data: <urlopen error [Errno 11001] getaddrinfo failed>
Attempting to load from local file...
Loaded from local file successfully
Converting date columns to proper datetime format...
Converted 15 date columns to datetime format
Date range: NaT to NaT

=== DATASET INFORMATION ===
Dataset shape: (20, 19)

First few rows:


Unnamed: 0,date,location,continent,population,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population_density,median_age,gdp_per_capita,hospital_beds_per_thousand,life_expectancy
0,2023-01-01,United States,North America,331000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1,2023-01-01,India,Asia,1380000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2023-01-01,Brazil,South America,213000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,2023-01-01,France,Europe,67500000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
4,2023-01-01,Germany,Europe,83200000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT



=== DATASET INFORMATION ===
Dataset shape: (20, 19)

First few rows:


Unnamed: 0,date,location,continent,population,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population_density,median_age,gdp_per_capita,hospital_beds_per_thousand,life_expectancy
0,2023-01-01,United States,North America,331000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1,2023-01-01,India,Asia,1380000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
2,2023-01-01,Brazil,South America,213000000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
3,2023-01-01,France,Europe,67500000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
4,2023-01-01,Germany,Europe,83200000,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT



Columns: ['date', 'location', 'continent', 'population', 'total_cases', 'new_cases', 'total_deaths', 'new_deaths', 'total_cases_per_million', 'total_deaths_per_million', 'total_vaccinations', 'people_vaccinated', 'people_fully_vaccinated', 'new_vaccinations', 'population_density', 'median_age', 'gdp_per_capita', 'hospital_beds_per_thousand', 'life_expectancy']

Data types:
 date                                  object
location                              object
continent                             object
population                             int64
total_cases                   datetime64[ns]
new_cases                     datetime64[ns]
total_deaths                  datetime64[ns]
new_deaths                    datetime64[ns]
total_cases_per_million       datetime64[ns]
total_deaths_per_million      datetime64[ns]
total_vaccinations            datetime64[ns]
people_vaccinated             datetime64[ns]
people_fully_vaccinated       datetime64[ns]
new_vaccinations              datetim

Unnamed: 0,Missing Values,Percentage
total_deaths_per_million,20,100.0
total_vaccinations,20,100.0
hospital_beds_per_thousand,20,100.0
gdp_per_capita,20,100.0
median_age,20,100.0
population_density,20,100.0
new_vaccinations,20,100.0
people_fully_vaccinated,20,100.0
people_vaccinated,20,100.0
life_expectancy,20,100.0



=== BASIC STATISTICS ===


Unnamed: 0,population,total_cases,new_cases,total_deaths,new_deaths,total_cases_per_million,total_deaths_per_million,total_vaccinations,people_vaccinated,people_fully_vaccinated,new_vaccinations,population_density,median_age,gdp_per_capita,hospital_beds_per_thousand,life_expectancy
count,20.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
mean,162690000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
min,17500000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
25%,46850000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
50%,67350000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
75%,130500000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
max,1380000000.0,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
std,298316500.0,,,,,,,,,,,,,,,



=== DATA CLEANING ===
After filtering locations, dataset has 20 rows
Selected 19 key columns for analysis


TypeError: cannot perform __truediv__ with this index type: DatetimeArray