# COVID-19 Data Analysis Notebook

This notebook performs end-to-end analysis of COVID-19 datasets, including data preprocessing, exploratory data analysis, and visualization. The processed data will be used in a Streamlit dashboard for interactive visualization.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from datetime import datetime

# Plot styling: apply matplotlib's 'ggplot' sheet, then seaborn's "whitegrid" style
plt.style.use('ggplot')
sns.set_style("whitegrid")

# Set pandas' display limits (rows, columns, total width) in one pass
for display_option, display_limit in (
    ('display.max_rows', 100),
    ('display.max_columns', 50),
    ('display.width', 1000),
):
    pd.set_option(display_option, display_limit)

## 1. Data Loading and Initial Exploration

In [None]:
# Read the four source CSVs in a fixed order and bind each to its own frame
day_wise_df, full_grouped_df, usa_county_wise_df, worldometer_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'attached_assets/day_wise.csv',
        'attached_assets/full_grouped.csv',
        'attached_assets/usa_county_wise.csv',
        'attached_assets/worldometer_data.csv',
    )
)

# Quick look at the day-wise frame: its dimensions, then the first rows
print("Day-wise Dataset:", f"Shape: {day_wise_df.shape}", sep="\n")
day_wise_df.head()

In [None]:
# Dimensions followed by the first rows of the country-level daily frame
print("Full Grouped Dataset:", f"Shape: {full_grouped_df.shape}", sep="\n")
full_grouped_df.head()

In [None]:
# Dimensions followed by the first rows of the US county-level frame
print("USA County-wise Dataset:", f"Shape: {usa_county_wise_df.shape}", sep="\n")
usa_county_wise_df.head()

In [None]:
# Dimensions followed by the first rows of the Worldometer snapshot
print("Worldometer Dataset:", f"Shape: {worldometer_df.shape}", sep="\n")
worldometer_df.head()

## 2. Data Preprocessing

### 2.1 Check for missing values

In [None]:
# Per-column null counts for each frame that is analysed below.
# Each entry pairs the heading to print with the frame it describes.
null_reports = [
    ("Missing values in day_wise_df:", day_wise_df),
    ("\nMissing values in full_grouped_df:", full_grouped_df),
    ("\nMissing values in worldometer_df:", worldometer_df),
]

for heading, frame in null_reports:
    print(heading)
    print(frame.isna().sum())

### 2.2 Preprocessing the day_wise_df dataset

In [None]:
# Parse the Date column into pandas datetimes, replacing the original column
day_wise_df['Date'] = day_wise_df['Date'].pipe(pd.to_datetime)

# Show the resulting column dtypes
day_wise_df.dtypes

### 2.3 Preprocessing the full_grouped_df dataset

In [None]:
# Parse the Date column into pandas datetimes
full_grouped_df['Date'] = pd.to_datetime(full_grouped_df['Date'])

# Keep only rows that report at least 15 confirmed cases
has_enough_cases = full_grouped_df['Confirmed'] >= 15
full_grouped_filtered = full_grouped_df[has_enough_cases]

# Narrow the frame to the columns used by the analysis below
relevant_columns = [
    'Date', 'Country/Region', 'Confirmed', 'Deaths',
    'Recovered', 'Active', 'WHO Region',
]
full_grouped_relevant = full_grouped_filtered[relevant_columns]

# Report the size of the processed frame and preview it
print(f"Shape after filtering: {full_grouped_relevant.shape}")
full_grouped_relevant.head()

### 2.4 Preprocessing the worldometer_df dataset

In [None]:
# Worldometer cleanup: drop rows with fewer than 15 total cases
meets_case_threshold = worldometer_df['TotalCases'] >= 15
worldometer_filtered = worldometer_df[meets_case_threshold]

# Columns to keep, and the names that align them with the other frames
kept_columns = [
    'Country/Region', 'Continent', 'TotalCases', 'TotalDeaths',
    'TotalRecovered', 'ActiveCases', 'WHO Region', 'Population',
]
column_renames = {
    'TotalCases': 'Confirmed',
    'TotalDeaths': 'Deaths',
    'TotalRecovered': 'Recovered',
    'ActiveCases': 'Active',
}

# Select first, then rename, so the final frame carries the shared column names
worldometer_relevant = worldometer_filtered[kept_columns].rename(columns=column_renames)

# Report the size of the processed frame and preview it
print(f"Shape after filtering: {worldometer_relevant.shape}")
worldometer_relevant.head()

## 3. Exploratory Data Analysis

### 3.1 Global COVID-19 Trends

In [None]:
# Draw the four global series from day_wise_df on a 2x2 grid, one metric per panel
fig = make_subplots(
    rows=2, cols=2,
    subplot_titles=("Global Confirmed Cases", "Global Deaths",
                    "Global Recovered Cases", "Global Active Cases"),
    shared_xaxes=True,
)

# Panels are filled left-to-right, top-to-bottom; each trace is named after its column
for panel_index, metric in enumerate(['Confirmed', 'Deaths', 'Recovered', 'Active']):
    grid_row, grid_col = divmod(panel_index, 2)
    fig.add_trace(
        go.Scatter(x=day_wise_df['Date'], y=day_wise_df[metric],
                   mode='lines+markers', name=metric),
        row=grid_row + 1, col=grid_col + 1,
    )

fig.update_layout(height=800, width=1200, title_text="Global COVID-19 Trends")
fig.show()

In [None]:
# Three side-by-side bar charts: daily new cases, deaths, and recoveries
fig = make_subplots(
    rows=1, cols=3,
    subplot_titles=("Daily New Cases", "Daily New Deaths", "Daily New Recoveries"),
    shared_xaxes=True,
)

# (source column in day_wise_df, legend name) for each panel, left to right
daily_series = [
    ('New cases', 'New Cases'),
    ('New deaths', 'New Deaths'),
    ('New recovered', 'New Recovered'),
]
for panel_col, (source_column, trace_name) in enumerate(daily_series, start=1):
    fig.add_trace(
        go.Bar(x=day_wise_df['Date'], y=day_wise_df[source_column], name=trace_name),
        row=1, col=panel_col,
    )

fig.update_layout(height=500, width=1200, title_text="Daily COVID-19 Statistics")
fig.show()

### 3.2 COVID-19 Analysis by Country/Region

In [None]:
# Snapshot of every country on the most recent date present in the filtered data
latest_date = full_grouped_relevant['Date'].max()
on_latest_date = full_grouped_relevant['Date'] == latest_date
latest_data = full_grouped_relevant[on_latest_date]

# The 15 rows with the most confirmed cases, largest first
top_countries = latest_data.sort_values(by='Confirmed', ascending=False).iloc[:15]

# Horizontal bar chart of those countries, coloured by WHO region
confirmed_axis_labels = {'Confirmed': 'Total Confirmed Cases', 'Country/Region': 'Country'}
fig = px.bar(
    top_countries,
    x='Confirmed',
    y='Country/Region',
    color='WHO Region',
    orientation='h',
    title='Top 15 Countries by Confirmed Cases',
    labels=confirmed_axis_labels,
    height=600,
)
fig.show()

In [None]:
# The 15 rows of the latest snapshot with the most deaths, largest first
top_deaths = latest_data.sort_values(by='Deaths', ascending=False).iloc[:15]

# Horizontal bar chart of those countries, coloured by WHO region
deaths_axis_labels = {'Deaths': 'Total Deaths', 'Country/Region': 'Country'}
fig = px.bar(
    top_deaths,
    x='Deaths',
    y='Country/Region',
    color='WHO Region',
    orientation='h',
    title='Top 15 Countries by Deaths',
    labels=deaths_axis_labels,
    height=600,
)
fig.show()

In [None]:
# Calculate mortality rate (deaths per 100 confirmed cases) for each country.
#
# `latest_data` was produced by boolean-mask indexing of `full_grouped_relevant`, so
# writing a new column into it with `latest_data[...] = ...` triggers pandas'
# SettingWithCopyWarning. `.assign` returns a new frame with the column added instead;
# rebinding the name keeps the column available to later cells (including the CSV
# export), and re-running this cell simply recomputes the same column.
latest_data = latest_data.assign(
    **{'Mortality Rate (%)': (latest_data['Deaths'] / latest_data['Confirmed']) * 100}
)

# Filter countries with at least 1000 confirmed cases for meaningful mortality rate
# calculation, then keep the 15 highest rates
mortality_data = (
    latest_data[latest_data['Confirmed'] >= 1000]
    .sort_values('Mortality Rate (%)', ascending=False)
    .head(15)
)

# Create a horizontal bar chart for mortality rate, coloured by WHO region
fig = px.bar(mortality_data, y='Country/Region', x='Mortality Rate (%)', color='WHO Region',
             orientation='h', title='Top 15 Countries by Mortality Rate (for countries with at least 1000 cases)',
             labels={'Mortality Rate (%)': 'Deaths per 100 Confirmed Cases', 'Country/Region': 'Country'},
             height=600)
fig.show()

### 3.3 COVID-19 Analysis by WHO Region

In [None]:
# Sum the four case counts of the latest snapshot within each WHO region
region_sum_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active']
region_data = (
    latest_data
    .groupby('WHO Region')[region_sum_columns]
    .sum()
    .reset_index()
)

# Region-level mortality: deaths per 100 confirmed cases
region_data['Mortality Rate (%)'] = region_data['Deaths'].div(region_data['Confirmed']).mul(100)

# Show the regions ordered by confirmed cases, largest first
region_data.sort_values(by='Confirmed', ascending=False)

In [None]:
# 2x2 grid of pie charts, one per metric, each split by WHO region
pie_specs = [[{'type': 'domain'} for _ in range(2)] for _ in range(2)]
fig = make_subplots(
    rows=2, cols=2,
    specs=pie_specs,
    subplot_titles=("Confirmed Cases", "Deaths", "Recovered Cases", "Active Cases"),
)

# Panels are filled left-to-right, top-to-bottom; each pie is named after its column
for panel_index, metric in enumerate(["Confirmed", "Deaths", "Recovered", "Active"]):
    grid_row, grid_col = divmod(panel_index, 2)
    fig.add_trace(
        go.Pie(labels=region_data['WHO Region'], values=region_data[metric], name=metric),
        grid_row + 1, grid_col + 1,
    )

fig.update_layout(title_text="COVID-19 Distribution by WHO Region", height=700)
fig.show()

### 3.4 Time Series Analysis for Top Countries

In [None]:
# Names of the five countries with the most confirmed cases in the latest snapshot
ranked_latest = latest_data.sort_values(by='Confirmed', ascending=False)
top_5_countries = ranked_latest['Country/Region'].head(5).tolist()

# Full history (all dates) for just those five countries
in_top_5 = full_grouped_relevant['Country/Region'].isin(top_5_countries)
top_countries_data = full_grouped_relevant[in_top_5]

# One line per country showing confirmed cases over time
confirmed_line_labels = {
    'Confirmed': 'Total Confirmed Cases',
    'Date': 'Date',
    'Country/Region': 'Country',
}
fig = px.line(
    top_countries_data,
    x='Date',
    y='Confirmed',
    color='Country/Region',
    title='COVID-19 Confirmed Cases for Top 5 Countries',
    labels=confirmed_line_labels,
    height=600,
)
fig.show()

In [None]:
# One line per top-5 country showing deaths over time
deaths_line_labels = {
    'Deaths': 'Total Deaths',
    'Date': 'Date',
    'Country/Region': 'Country',
}
fig = px.line(
    top_countries_data,
    x='Date',
    y='Deaths',
    color='Country/Region',
    title='COVID-19 Deaths for Top 5 Countries',
    labels=deaths_line_labels,
    height=600,
)
fig.show()

### 3.5 Analysis Using Worldometer Data

In [None]:
# Sum case counts and population within each continent
continent_sum_columns = ['Confirmed', 'Deaths', 'Recovered', 'Active', 'Population']
continent_data = (
    worldometer_relevant
    .groupby('Continent')[continent_sum_columns]
    .sum()
    .reset_index()
)

# Normalise by the summed population: cases and deaths per one million people
continent_population = continent_data['Population']
continent_data['Cases per Million'] = continent_data['Confirmed'].div(continent_population).mul(1_000_000)
continent_data['Deaths per Million'] = continent_data['Deaths'].div(continent_population).mul(1_000_000)

# Show the continents ordered by confirmed cases, largest first
continent_data.sort_values(by='Confirmed', ascending=False)

In [None]:
# Two side-by-side bar charts of the per-million rates by continent
per_million_metrics = ("Cases per Million", "Deaths per Million")
fig = make_subplots(rows=1, cols=2, subplot_titles=per_million_metrics)

# Each metric doubles as its column name in continent_data and its legend name
for panel_col, metric in enumerate(per_million_metrics, start=1):
    fig.add_trace(
        go.Bar(x=continent_data['Continent'], y=continent_data[metric], name=metric),
        1, panel_col,
    )

fig.update_layout(title_text="COVID-19 Impact by Continent", height=500)
fig.show()

## 4. Identify Regions with Highest Cases

In [None]:
# Ten rows of the latest snapshot with the most confirmed cases, largest first
highest_cases = latest_data.sort_values(by='Confirmed', ascending=False).iloc[:10]
print("Top 10 Countries/Regions with Highest Confirmed Cases:")

# Display only the identifying column and the four case counts
summary_columns = ['Country/Region', 'Confirmed', 'Deaths', 'Recovered', 'Active']
highest_cases[summary_columns]

In [None]:
# World map of confirmed cases; colour uses log10 of the count
log_confirmed = np.log10(latest_data["Confirmed"])

fig = px.choropleth(
    latest_data,
    locations="Country/Region",
    locationmode="country names",  # match map regions by country name
    color=log_confirmed,
    hover_name="Country/Region",
    hover_data=["Confirmed", "Deaths", "Recovered", "Active"],
    title="Global Distribution of COVID-19 Confirmed Cases (log scale)",
    color_continuous_scale=px.colors.sequential.Plasma,
    labels={"color": "Log10(Confirmed Cases)"},
)

fig.update_layout(height=600)
fig.show()

## 5. Save Processed Data for Dashboard

In [None]:
# Output file -> frame to write. The first three are the processed datasets for the
# dashboard; the last is the latest-date snapshot. Files go to the working directory.
dashboard_exports = {
    'processed_day_wise.csv': day_wise_df,
    'processed_country_wise.csv': full_grouped_relevant,
    'processed_worldometer.csv': worldometer_relevant,
    'latest_covid_data.csv': latest_data,
}

# Write each frame without its index column
for output_name, frame in dashboard_exports.items():
    frame.to_csv(output_name, index=False)

print("All processed datasets have been saved successfully.")

## 6. Summary of Findings

### Key Insights:

1. **Global Trends**: The COVID-19 pandemic showed a rapid increase in cases globally, with significant growth in confirmed cases, deaths, and recoveries over time.

2. **Regional Distribution**: The Americas and Europe have been the most severely affected regions in terms of confirmed cases and deaths.

3. **Country Analysis**: The United States, Brazil, and India emerged as the countries with the highest number of confirmed cases.

4. **Mortality Rates**: Some countries have significantly higher mortality rates than others, which could be attributed to various factors including healthcare capacity, demographics, and testing strategies.

5. **Continental Impact**: When normalized by population, Europe and North America show the highest cases per million people, indicating the severity of the pandemic in these regions despite their advanced healthcare systems.

These insights will be visualized in the interactive Streamlit dashboard for further exploration.