In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# are picked up without restarting the kernel.
%load_ext autoreload
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [2]:
#Load libraries to use
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Read the raw COVID CSV file into a pandas dataframe
covid = pd.read_csv(filepath_or_buffer='../data/raw/COVID.csv')

In [4]:
# Display the dimensions of the dataframe as a (rows, columns) tuple
covid.shape

(239, 14)

In [5]:
# Display a summary of the dataframe: column names, non-null counts,
# dtypes and memory usage
covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 239 entries, 0 to 238
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Country            239 non-null    object 
 1   Total Cases        239 non-null    object 
 2   New Cases          12 non-null     object 
 3   Total Deaths       234 non-null    object 
 4   New Deaths         7 non-null      float64
 5   Total Recovered    190 non-null    object 
 6   New Recovered      17 non-null     object 
 7   Active Cases       191 non-null    object 
 8   Serious, Critical  60 non-null     object 
 9   Tot Cases/1M pop   230 non-null    object 
 10  Deaths/1M pop      225 non-null    object 
 11  Total Tests        213 non-null    object 
 12  Tests/1M pop       213 non-null    object 
 13  Population         229 non-null    object 
dtypes: float64(1), object(13)
memory usage: 26.3+ KB


In [6]:
# Preview the first five rows of the loaded dataframe
covid.head(n=5)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",Tot Cases/1M pop,Deaths/1M pop,Total Tests,Tests/1M pop,Population
0,USA,111367209,,1199031,,109053249.0,,1114929.0,1771.0,332633,3581,1186742917,3544577,334805269
1,India,45028429,161.0,533475,2.0,,,,,32012,379,935879495,665334,1406631776
2,France,40138560,,167642,,39970918.0,,0.0,,612013,2556,271490188,4139547,65584518
3,Germany,38819284,574.0,182439,28.0,38240600.0,,396245.0,,462776,2175,122332384,1458359,83883596
4,Brazil,38407327,,709765,,36249161.0,,1448401.0,,178345,3296,63776166,296146,215353593


In [7]:
# Columns to clean and convert.
# 'New Deaths' is left out on purpose: read_csv already parsed it as float64.
columns_to_convert = [
    'Total Cases', 'New Cases', 'Total Deaths', 'Total Recovered', 'New Recovered',
    'Active Cases', 'Serious, Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
    'Total Tests', 'Tests/1M pop', 'Population'
]

# Build the cleaned frame from a copy of the raw one. `data` was previously
# never defined (NameError on a fresh kernel); copying also keeps `covid`
# untouched so this cell can be re-run safely.
data = covid.copy()

# Remove thousands separators and convert to numeric.
# Values that cannot be parsed become NaN (errors='coerce').
for column in columns_to_convert:
    values = data[column]
    # The .str accessor only works on text columns; skip the comma removal
    # for columns that pandas already parsed as numeric.
    if not pd.api.types.is_numeric_dtype(values):
        values = values.str.replace(',', '', regex=False)  # literal comma, not a regex
    data[column] = pd.to_numeric(values, errors='coerce')

In [8]:
# Preview the first five rows after the numeric conversion
data.head(n=5)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",Tot Cases/1M pop,Deaths/1M pop,Total Tests,Tests/1M pop,Population
0,USA,111367209,,1199031.0,,109053249.0,,1114929.0,1771.0,332633.0,3581.0,1186743000.0,3544577.0,334805300.0
1,India,45028429,161.0,533475.0,2.0,,,,,32012.0,379.0,935879500.0,665334.0,1406632000.0
2,France,40138560,,167642.0,,39970918.0,,0.0,,612013.0,2556.0,271490200.0,4139547.0,65584520.0
3,Germany,38819284,574.0,182439.0,28.0,38240600.0,,396245.0,,462776.0,2175.0,122332400.0,1458359.0,83883600.0
4,Brazil,38407327,,709765.0,,36249161.0,,1448401.0,,178345.0,3296.0,63776170.0,296146.0,215353600.0


In [10]:
# Numeric columns whose missing values are imputed
columns_to_convert = [
    'Total Cases', 'New Cases', 'Total Deaths','New Deaths', 'Total Recovered', 'New Recovered',
    'Active Cases', 'Serious, Critical', 'Tot Cases/1M pop', 'Deaths/1M pop',
    'Total Tests', 'Tests/1M pop', 'Population'
]

# Fill NaN values with the mean of the column.
# The result is assigned back to the frame: calling fillna(inplace=True) on a
# column selection is a chained in-place update, which is deprecated in
# pandas 2.x and leaves `data` unchanged when Copy-on-Write is enabled.
# NOTE(review): several columns are almost entirely missing (e.g. 'New Cases'
# has 12 non-null values out of 239), so most of their values end up being
# the same imputed mean -- confirm this is acceptable for the analysis.
for column in columns_to_convert:
    data[column] = data[column].fillna(data[column].mean())

# Check for missing values after processing
print("Missing values after processing:")
print(data.isnull().sum())

print("Data types after processing:")
print(data.dtypes)
# Show only a preview instead of dumping the whole frame
print(data.head())

Missing values after processing:
Country              0
Total Cases          0
New Cases            0
Total Deaths         0
New Deaths           0
Total Recovered      0
New Recovered        0
Active Cases         0
Serious, Critical    0
Tot Cases/1M pop     0
Deaths/1M pop        0
Total Tests          0
Tests/1M pop         0
Population           0
dtype: int64
Data types after processing:
Country               object
Total Cases            int64
New Cases            float64
Total Deaths         float64
New Deaths           float64
Total Recovered      float64
New Recovered        float64
Active Cases         float64
Serious, Critical    float64
Tot Cases/1M pop     float64
Deaths/1M pop        float64
Total Tests          float64
Tests/1M pop         float64
Population           float64
dtype: object
     Country  Total Cases  New Cases  Total Deaths  New Deaths   
0        USA    111367209      611.5     1199031.0   15.857143  \
1      India     45028429      161.0      533475.0 

In [11]:
# Preview the first five rows after the missing values were filled
data.head(n=5)

Unnamed: 0,Country,Total Cases,New Cases,Total Deaths,New Deaths,Total Recovered,New Recovered,Active Cases,"Serious, Critical",Tot Cases/1M pop,Deaths/1M pop,Total Tests,Tests/1M pop,Population
0,USA,111367209,611.5,1199031.0,15.857143,109053200.0,780.058824,1114929.0,1771.0,332633.0,3581.0,1186743000.0,3544577.0,334805300.0
1,India,45028429,161.0,533475.0,2.0,9980155.0,780.058824,264824.5,1252.8,32012.0,379.0,935879500.0,665334.0,1406632000.0
2,France,40138560,611.5,167642.0,15.857143,39970920.0,780.058824,0.0,1252.8,612013.0,2556.0,271490200.0,4139547.0,65584520.0
3,Germany,38819284,574.0,182439.0,28.0,38240600.0,780.058824,396245.0,1252.8,462776.0,2175.0,122332400.0,1458359.0,83883600.0
4,Brazil,38407327,611.5,709765.0,15.857143,36249160.0,780.058824,1448401.0,1252.8,178345.0,3296.0,63776170.0,296146.0,215353600.0
