In [23]:
import pandas as pd

# Import necessary libraries

# Load the dataset
df = pd.read_csv(file_name)

# Display basic information about the dataset
print("Columns in the dataset:")
print(df.columns)

# Preview the first few rows
print("\nPreview of the dataset:")
print(df.head())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

Columns in the dataset:
Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinati

In [None]:
import seaborn as sns

import matplotlib.pyplot as plt

# Data Cleaning: Handle missing values
df_cleaned = df.copy()
df_cleaned.fillna(0, inplace=True)

# Analysis: Calculate total cases and deaths by continent
continent_summary = df_cleaned.groupby('continent')[['total_cases', 'total_deaths']].sum().reset_index()

# Visualization: Total cases and deaths by continent
plt.figure(figsize=(12, 6))
sns.barplot(data=continent_summary, x='continent', y='total_cases', color='blue', label='Total Cases')
sns.barplot(data=continent_summary, x='continent', y='total_deaths', color='red', label='Total Deaths')
plt.title('Total Cases and Deaths by Continent')
plt.ylabel('Count')
plt.xlabel('Continent')
plt.legend()
plt.show()

# Save cleaned data for submission
df_cleaned.to_csv('cleaned_covid_data.csv', index=False)
print("Cleaned data saved as 'cleaned_covid_data.csv'")

In [25]:
# Filter the dataset to include only rows with non-null values for 'total_cases' and 'total_deaths'
df_filtered = df[df['total_cases'].notnull() & df['total_deaths'].notnull()]

# Calculate the case fatality rate (CFR) as a percentage
df_filtered['case_fatality_rate'] = (df_filtered['total_deaths'] / df_filtered['total_cases']) * 100

# Display the first few rows of the filtered dataset with the new column
print(df_filtered[['location', 'date', 'total_cases', 'total_deaths', 'case_fatality_rate']].head())

      location        date  total_cases  total_deaths  case_fatality_rate
0  Afghanistan  2020-01-05          0.0           0.0                 NaN
1  Afghanistan  2020-01-06          0.0           0.0                 NaN
2  Afghanistan  2020-01-07          0.0           0.0                 NaN
3  Afghanistan  2020-01-08          0.0           0.0                 NaN
4  Afghanistan  2020-01-09          0.0           0.0                 NaN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['case_fatality_rate'] = (df_filtered['total_deaths'] / df_filtered['total_cases']) * 100


In [33]:
df['death_rate'] = df['total_deaths'] / df['total_cases']
df_latest = df.sort_values('date').groupby('location').tail(1)
print(df_latest[['location', 'total_cases', 'total_deaths', 'death_rate']])

                   location  total_cases  total_deaths  death_rate
422728       Western Sahara          NaN           NaN         NaN
282897      Northern Cyprus          NaN           NaN         NaN
225269                Macao          NaN           NaN         NaN
421053                Wales          NaN           NaN         NaN
375651               Taiwan          NaN           NaN         NaN
...                     ...          ...           ...         ...
217093            Lithuania          NaN           NaN         NaN
230301             Malaysia          NaN           NaN         NaN
21775                  Asia          NaN           NaN         NaN
424412                World          NaN           NaN         NaN
121602  European Union (27)          NaN           NaN         NaN

[255 rows x 4 columns]


In [36]:
# Ensure 'continent' column exists in df_filtered
if 'continent' not in df_filtered.columns:
    df_filtered = df_filtered.merge(df[['iso_code', 'continent']], on='iso_code', how='left')

# Replace infinite values in 'average_case_fatality_rate' with NaN
average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'average_case_fatality_rate'
average_cfr_by_continent.dropna(subset=['average_case_fatality_rate'], inplace=True)

# Display the cleaned 'average_cfr_by_continent' DataFrame
print("Cleaned 'average_cfr_by_continent':")
print(average_cfr_by_continent)

Cleaned 'average_cfr_by_continent':
       continent  average_case_fatality_rate
1           Asia                    1.695381
2         Europe                   43.019821
3  North America                    1.618914
4        Oceania                    0.562020
5  South America                    2.451340


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


In [38]:
# Ensure 'continent' column exists in df_filtered
if 'continent' not in df_filtered.columns:
    df_filtered = df_filtered.merge(df[['iso_code', 'continent']], on='iso_code', how='left')

# Replace infinite values in 'case_fatality_rate' with NaN
df_filtered['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'case_fatality_rate'
df_filtered.dropna(subset=['case_fatality_rate'], inplace=True)

# Display the cleaned 'df_filtered' DataFrame
print("Cleaned 'df_filtered':")
print(df_filtered.head())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filtered['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


Cleaned 'df_filtered':
   iso_code continent     location        date  total_cases  new_cases  \
56      AFG      Asia  Afghanistan  2020-03-01          1.0        1.0   
57      AFG      Asia  Afghanistan  2020-03-02          1.0        0.0   
58      AFG      Asia  Afghanistan  2020-03-03          1.0        0.0   
59      AFG      Asia  Afghanistan  2020-03-04          1.0        0.0   
60      AFG      Asia  Afghanistan  2020-03-05          1.0        0.0   

    new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
56               0.143           0.0         0.0                  0.0  ...   
57               0.143           0.0         0.0                  0.0  ...   
58               0.143           0.0         0.0                  0.0  ...   
59               0.143           0.0         0.0                  0.0  ...   
60               0.143           0.0         0.0                  0.0  ...   

    handwashing_facilities  hospital_beds_per_thousand  life_ex

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.dropna(subset=['case_fatality_rate'], inplace=True)


In [None]:
# Select countries of interest
countries_of_interest = ['United States', 'India', 'Kenya']

# Filter the dataset for the selected countries
filtered_df = df[df['location'].isin(countries_of_interest)]

# Convert the 'date' column to datetime format
filtered_df['date'] = pd.to_datetime(filtered_df['date'])

# Plot total cases over time for selected countries
plt.figure(figsize=(12, 6))
for country in countries_of_interest:
    country_data = filtered_df[filtered_df['location'] == country]
    plt.plot(country_data['date'], country_data['total_cases'], label=country)

plt.title('Total COVID-19 Cases Over Time')
plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.legend()
plt.grid(True)
plt.show()

# Plot total deaths over time for selected countries
plt.figure(figsize=(12, 6))
for country in countries_of_interest:
    country_data = filtered_df[filtered_df['location'] == country]
    plt.plot(country_data['date'], country_data['total_deaths'], label=country)

plt.title('Total COVID-19 Deaths Over Time')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
plt.legend()
plt.grid(True)
plt.show()

In [27]:
# Replace infinite values in 'case_fatality_rate' with NaN in 'average_cfr_by_continent'
average_cfr_by_continent['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'case_fatality_rate'
average_cfr_by_continent.dropna(subset=['case_fatality_rate'], inplace=True)

# Display the cleaned 'average_cfr_by_continent' DataFrame
print("Cleaned 'average_cfr_by_continent':")
print(average_cfr_by_continent)

Cleaned 'average_cfr_by_continent':
       continent  case_fatality_rate
1           Asia            1.695381
2         Europe           43.019821
3  North America            1.618914
4        Oceania            0.562020
5  South America            2.451340


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  average_cfr_by_continent['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


In [None]:
# Fix infinite values in 'case_fatality_rate' for average_cfr_by_continent
average_cfr_by_continent['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'case_fatality_rate'
average_cfr_by_continent.dropna(subset=['case_fatality_rate'], inplace=True)

# Ensure 'continent' column exists in df_filtered
if 'continent' not in df_filtered.columns:
    df_filtered = df_filtered.merge(df[['iso_code', 'continent']], on='iso_code', how='left')

# Verify the fixes
print("Fixed average_cfr_by_continent:")
print(average_cfr_by_continent)

print("\nUpdated df_filtered:")
print(df_filtered.head())

Fixed average_cfr_by_continent:
       continent  case_fatality_rate
1           Asia            1.695381
2         Europe           43.019821
3  North America            1.618914
4        Oceania            0.562020
5  South America            2.451340

Updated df_filtered:
  iso_code continent     location        date  total_cases  new_cases  \
0      AFG      Asia  Afghanistan  2020-01-05          0.0        0.0   
1      AFG      Asia  Afghanistan  2020-01-06          0.0        0.0   
2      AFG      Asia  Afghanistan  2020-01-07          0.0        0.0   
3      AFG      Asia  Afghanistan  2020-01-08          0.0        0.0   
4      AFG      Asia  Afghanistan  2020-01-09          0.0        0.0   

   new_cases_smoothed  total_deaths  new_deaths  new_deaths_smoothed  ...  \
0                 NaN           0.0         0.0                  NaN  ...   
1                 NaN           0.0         0.0                  NaN  ...   
2                 NaN           0.0         0.0       

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  average_cfr_by_continent['case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


In [None]:
import streamlit as st

# Streamlit Dashboard
st.title("COVID-19 Data Dashboard")

# Allow user input for country selection
countries = df['location'].unique()
selected_country = st.selectbox("Select a Country", countries)

# Allow user input for date range
min_date = pd.to_datetime(df['date']).min()
max_date = pd.to_datetime(df['date']).max()
start_date, end_date = st.date_input("Select Date Range", [min_date, max_date])

# Filter data based on user input
filtered_data = df[(df['location'] == selected_country) & 
                   (pd.to_datetime(df['date']) >= start_date) & 
                   (pd.to_datetime(df['date']) <= end_date)]

# Display filtered data
st.write(f"Data for {selected_country} from {start_date} to {end_date}")
st.dataframe(filtered_data)

# Plot total cases and deaths over time
st.subheader("Total Cases and Deaths Over Time")
if not filtered_data.empty:
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(filtered_data['date'], filtered_data['total_cases'], label='Total Cases', color='blue')
    ax.plot(filtered_data['date'], filtered_data['total_deaths'], label='Total Deaths', color='red')
    ax.set_title(f"COVID-19 Cases and Deaths in {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Count")
    ax.legend()
    st.pyplot(fig)
else:
    st.write("No data available for the selected country and date range.")

# Include hospitalization or ICU data if available
st.subheader("Hospitalization and ICU Data")
if 'icu_patients' in filtered_data.columns and not filtered_data['icu_patients'].isna().all():
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(filtered_data['date'], filtered_data['icu_patients'], label='ICU Patients', color='purple')
    ax.set_title(f"ICU Patients in {selected_country}")
    ax.set_xlabel("Date")
    ax.set_ylabel("Count")
    ax.legend()
    st.pyplot(fig)
else:
    st.write("No ICU data available for the selected country and date range.")

In [28]:
# Calculate the average case fatality rate (CFR) for each continent
average_cfr_by_continent = df_filtered.groupby('continent')['case_fatality_rate'].mean().reset_index()

# Rename the column for clarity
average_cfr_by_continent.rename(columns={'case_fatality_rate': 'average_case_fatality_rate'}, inplace=True)

# Display the updated DataFrame
print("Average Case Fatality Rate by Continent:")
print(average_cfr_by_continent)

Average Case Fatality Rate by Continent:
       continent  average_case_fatality_rate
0         Africa                         inf
1           Asia                    1.695381
2         Europe                   43.019821
3  North America                    1.618914
4        Oceania                    0.562020
5  South America                    2.451340


In [None]:
# Move the definition of df_filtered to a cell before it is used
# Filter the dataset to include only rows with non-null values for 'total_cases' and 'total_deaths'
df_filtered = df[df['total_cases'].notnull() & df['total_deaths'].notnull()]

# Calculate the case fatality rate (CFR) as a percentage
df_filtered['case_fatality_rate'] = (df_filtered['total_deaths'] / df_filtered['total_cases']) * 100

# Display the first few rows of the filtered dataset with the new column
print(df_filtered[['location', 'date', 'total_cases', 'total_deaths', 'case_fatality_rate']].head())

      location        date  total_cases  total_deaths  case_fatality_rate
0  Afghanistan  2020-01-05          0.0           0.0                 NaN
1  Afghanistan  2020-01-06          0.0           0.0                 NaN
2  Afghanistan  2020-01-07          0.0           0.0                 NaN
3  Afghanistan  2020-01-08          0.0           0.0                 NaN
4  Afghanistan  2020-01-09          0.0           0.0                 NaN


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['case_fatality_rate'] = (df_filtered['total_deaths'] / df_filtered['total_cases']) * 100


In [None]:
# Generate a summary report of the dataset
def generate_summary_report(df):
    print("Summary Report:")
    print("-" * 50)
    print(f"Total Rows: {df.shape[0]}")
    print(f"Total Columns: {df.shape[1]}")
    print("\nColumn Data Types:")
    print(df.dtypes)
    print("\nMissing Values:")
    print(df.isnull().sum())
    print("\nBasic Statistics:")
    print(df.describe(include='all'))

# Call the function to generate the report
generate_summary_report(df)

# Save the final processed dataset
final_file_name = 'final_covid_data.csv'
df.to_csv(final_file_name, index=False)
print(f"\nFinal processed dataset saved as '{final_file_name}'")

In [29]:
# Replace infinite values in 'average_case_fatality_rate' with NaN in 'average_cfr_by_continent'
average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'average_case_fatality_rate'
average_cfr_by_continent.dropna(subset=['average_case_fatality_rate'], inplace=True)

# Display the cleaned 'average_cfr_by_continent' DataFrame
print("Cleaned 'average_cfr_by_continent':")
print(average_cfr_by_continent)

Cleaned 'average_cfr_by_continent':
       continent  average_case_fatality_rate
1           Asia                    1.695381
2         Europe                   43.019821
3  North America                    1.618914
4        Oceania                    0.562020
5  South America                    2.451340


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


In [None]:
# Verify the integrity of the 'average_cfr_by_continent' DataFrame
if average_cfr_by_continent.isnull().sum().sum() == 0:
    print("No missing values in 'average_cfr_by_continent'.")
else:
    print("Missing values detected in 'average_cfr_by_continent':")
    print(average_cfr_by_continent.isnull().sum())

# Verify the integrity of the 'df_filtered' DataFrame
if df_filtered.isnull().sum().sum() == 0:
    print("No missing values in 'df_filtered'.")
else:
    print("Missing values detected in 'df_filtered':")
    print(df_filtered.isnull().sum())

# Verify the integrity of the 'df' DataFrame
if df.isnull().sum().sum() == 0:
    print("No missing values in 'df'.")
else:
    print("Missing values detected in 'df':")
    print(df.isnull().sum())

No missing values in 'average_cfr_by_continent'.
Missing values detected in 'df_filtered':
iso_code                                        0
continent                                   20088
location                                        0
date                                            0
total_cases                                     0
                                            ...  
excess_mortality_cumulative_absolute       398511
excess_mortality_cumulative                398511
excess_mortality                           398511
excess_mortality_cumulative_per_million    398511
case_fatality_rate                          29283
Length: 68, dtype: int64
Missing values detected in 'df':
iso_code                                        0
continent                                   26525
location                                        0
date                                            0
total_cases                                 17631
                                            ...  
p

In [32]:
# Replace infinite values in 'average_case_fatality_rate' with NaN
average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)

# Drop rows with NaN values in 'average_case_fatality_rate'
average_cfr_by_continent.dropna(subset=['average_case_fatality_rate'], inplace=True)

# Verify the integrity of the 'average_cfr_by_continent' DataFrame
if average_cfr_by_continent.isnull().sum().sum() == 0:
    print("No missing values in 'average_cfr_by_continent'.")
else:
    print("Missing values detected in 'average_cfr_by_continent':")
    print(average_cfr_by_continent.isnull().sum())

# Ensure 'continent' column exists in df_filtered
if 'continent' not in df_filtered.columns:
    df_filtered = df_filtered.merge(df[['iso_code', 'continent']], on='iso_code', how='left')

# Display the cleaned 'average_cfr_by_continent' DataFrame
print("Cleaned 'average_cfr_by_continent':")
print(average_cfr_by_continent)

# Verify the integrity of the 'df_filtered' DataFrame
if df_filtered.isnull().sum().sum() == 0:
    print("No missing values in 'df_filtered'.")
else:
    print("Missing values detected in 'df_filtered':")
    print(df_filtered.isnull().sum())

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  average_cfr_by_continent['average_case_fatality_rate'].replace([float('inf'), -float('inf')], float('nan'), inplace=True)


No missing values in 'average_cfr_by_continent'.
Cleaned 'average_cfr_by_continent':
       continent  average_case_fatality_rate
1           Asia                    1.695381
2         Europe                   43.019821
3  North America                    1.618914
4        Oceania                    0.562020
5  South America                    2.451340
Missing values detected in 'df_filtered':
iso_code                                        0
continent                                   20088
location                                        0
date                                            0
total_cases                                     0
                                            ...  
excess_mortality_cumulative_absolute       398511
excess_mortality_cumulative                398511
excess_mortality                           398511
excess_mortality_cumulative_per_million    398511
case_fatality_rate                          29283
Length: 68, dtype: int64
