# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Data Cleaning & Preprocessing

In [None]:
# columns of the source CSV that this project actually uses
necessary_series = [
    "country",
    "date",
    "total_cases",
    "new_cases",
    "total_deaths",
    "new_deaths",
    "total_vaccinations",
    "new_vaccinations",
    "hosp_patients",
    "continent",
    "population",
]

# load only those columns from the CSV file into the main dataframe
df_main = pd.read_csv("./compact.csv", usecols=necessary_series)

In [None]:
# per-column count of missing values in the raw data
df_main.isna().sum()

# dtype of every loaded column
df_main.dtypes

# numeric columns whose missing entries are replaced with zero
null_value_cols = [
    "total_cases",
    "new_cases",
    "total_deaths",
    "new_deaths",
    "hosp_patients",
    "total_vaccinations",
    "new_vaccinations",
    "population",
]

# mapping of column name -> fill value (0 for each listed column)
null_value_dict = dict.fromkeys(null_value_cols, 0)

# fillna returns a new dataframe, so df_main itself keeps its nulls
df_primary = df_main.fillna(value=null_value_dict)

# inspect the rows in which continent is still missing
df_primary[df_primary["continent"].isna()]

# per the author's inspection of those rows, they are entries whose country
# name is a continent or is not a valid country; remove every row that has
# no continent (done in place on df_primary)
df_primary.dropna(axis="index", subset=["continent"], inplace=True)

# convert the date column to second-resolution datetimes
df_primary["date"] = df_primary["date"].astype("datetime64[s]")

# Filtering & Grouping Data

In [None]:
# derive the month name and the calendar year from each row's date
months = df_primary["date"].dt.month_name()
years = df_primary["date"].dt.year

# store both derived series as new columns of the dataframe
df_primary["month"], df_primary["year"] = months, years

# groupby objects used later: per country, per continent, and per
# (month name, year) pair
df_countries = df_primary.groupby(by="country")
df_continents = df_primary.groupby(by="continent")
df_timeline = df_primary.groupby(by=["month", "year"])