In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the raw Ontario COVID-19 vaccine dataset into `df`.
try:
    df = pd.read_csv("COVID-19 Vaccine Data in Ontario.csv")
    print("CSV file loaded successfully")
except FileNotFoundError:
    print("Error: The specified CSV file was not found.")
    # Re-raise so execution stops here: every later cell needs `df`, and
    # continuing would only surface a confusing NameError further down.
    raise

In [None]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

In [None]:
# Summarize column dtypes and non-null counts to spot columns with missing values.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns.
df.describe()

The step below computes the percentage of missing values in each column. If a column has more than 80% missing values, it is usually better to drop it, as it is unlikely to provide much useful information. For columns with 80% or fewer missing values, we can fill the missing values with the mean or median of the column.

In [None]:
# Share of missing entries per column, expressed as a percentage of all rows.
null_counts = df.isna().sum()
missing_percentage = null_counts / len(df) * 100

# Worst-affected columns first.
print(missing_percentage.sort_values(ascending=False))

To determine whether to fill missing values (NaN) with the mean or the median, we examine the distribution of the data in the cells below. If the data is symmetrically distributed, the mean is a good measure of central tendency. However, if the distribution is skewed (either left or right), the median is the better choice because it is not affected by outliers or extreme values.

In [None]:
# Long-format boolean mask: one row per (column, cell) with a "missing" flag.
missing_long = df.isna().melt(value_name="missing")

# One horizontal bar per column, filled by the share of missing vs. present cells.
plt.figure(figsize=(10, 6))
sns.histplot(data=missing_long, y="variable", hue="missing", multiple="fill")
plt.title("Missing Data Visualization")
plt.show()

In [None]:
# Example distribution check: "cases_unvac_rate_per100K"
# (the next cell repeats this for "cases_notfull_vac_rate_per100K").
fig, ax = plt.subplots()
sns.histplot(df["cases_unvac_rate_per100K"], bins=30, kde=True, ax=ax)
plt.show()

In [None]:
# Distribution of the case rate among not-fully-vaccinated people (histogram + KDE).
fig, ax = plt.subplots()
sns.histplot(df["cases_notfull_vac_rate_per100K"], bins=30, kde=True, ax=ax)
plt.show()

In [None]:
# Impute gaps in every float64/int64 column with that column's median,
# which is robust to the skew seen in the histograms above.
for col_name, col_dtype in df.dtypes.items():
    if col_dtype in ('float64', 'int64'):
        col_median = df[col_name].median()
        df[col_name] = df[col_name].fillna(col_median)

In [None]:
# Re-check the first rows after the missing-value fill in the previous cell.
df.head()

In [None]:
# Give the identifier column a cleaner name; reassign rather than mutate in place.
df = df.rename(columns={"_id": "ID"})

In [None]:
# Parse the "date" column into pandas datetimes so it can be used for time-based work.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
print("Date column converted to datetime successfully")
    

In [None]:
# List every column's dtype to confirm the date conversion above took effect.
print(df.dtypes)

In [None]:
# Persist the cleaned frame so later work can start from the processed file.
try:
    df.to_csv('processed_COVID19_vaccine_Data.csv', index=False)
    print("Processed dataset saved successfully")
except OSError as e:
    # Writing fails for filesystem reasons (permissions, missing directory,
    # full disk). EmptyDataError / ParserError are read-side errors that
    # to_csv never raises, so those unreachable handlers were removed.
    print(f"Error: Could not write the processed CSV file: {e}")
except Exception as e:
    # Last-resort report for anything unrelated to the filesystem.
    print(f"An unexpected error occurred: {e}")