<div style="text-align:center">
    <img src="../../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 12: Covid vaccination project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

#### Import libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Read both CSV files via relative paths (resolved against the working directory).
# data1: contents of country_vaccinations.csv
# data2: contents of country_vaccinations_by_manufacturer.csv
data1 = pd.read_csv("country_vaccinations.csv")
data2 = pd.read_csv("country_vaccinations_by_manufacturer.csv")

#### EDA

In [None]:
data1.head()

In [None]:
data2.head()

In [None]:
data1.info()

In [None]:
data2.info()

In [None]:
data1.describe()

In [None]:
data2.describe()

#### Data Preprocessing

In [None]:
data1.isnull().sum()

In [None]:
data2.isnull().sum()

In [None]:
# Distinct countries before any rows are dropped. Series.nunique is the pandas
# built-in for this; dropna=False keeps the count identical to len(np.unique(...)).
print("The number of countries:", data1["country"].nunique(dropna=False))

In [None]:
# Keep only the rows that have a value in "people_fully_vaccinated";
# rows where that column is null are discarded (data1 is rebound to the result).
data1 = data1.dropna(subset=["people_fully_vaccinated"])

In [None]:
# Distinct countries that remain in data1 at this point. Series.nunique is the
# pandas built-in; dropna=False keeps the count identical to len(np.unique(...)).
print("The number of countries:", data1["country"].nunique(dropna=False))

In [None]:
data1.isnull().sum()

In [None]:
# Parse the "date" column into pandas datetimes using the explicit
# year-month-day format, then preview the first rows.
data1["date"] = pd.to_datetime(data1["date"], format="%Y-%m-%d")
data1.head()

In [None]:
# Turn +/-infinity into NaN, then replace every remaining NaN in the frame
# (all columns, not only numeric ones) with 0, and confirm no nulls are left.
data1 = data1.replace([np.inf, -np.inf], np.nan)
data1 = data1.fillna(0)
data1.isnull().sum()

In [None]:
# Derive calendar features from the "date" column through the .dt accessor.
date_parts = pd.to_datetime(data1["date"]).dt

data1["day"] = date_parts.day
data1["month"] = date_parts.month
data1["year"] = date_parts.year
# `weekofyear` was deprecated in pandas 1.1 and removed in 2.0; the ISO calendar
# week is its replacement. Cast to a plain integer column (isocalendar returns UInt32).
data1["week_of_year"] = date_parts.isocalendar().week.astype(int)
data1["day_of_year"] = date_parts.dayofyear
data1["day_of_week"] = date_parts.dayofweek

In [None]:
data1.head()

#### Storytelling - Visualization

In [None]:
np.unique(data2["vaccine"])

In [None]:
np.unique(data2["location"])

Number of Iranian People Vaccinated

In [None]:
# Sum of the "people_vaccinated" column over all rows whose country is "Iran".
# NOTE(review): if people_vaccinated is a running (cumulative) total per date,
# summing it over dates overstates the head-count and .max() would be the
# appropriate aggregate — confirm against the dataset description.
data1.query('country == "Iran"')["people_vaccinated"].sum()

In [None]:
def max_total_vaccinated(data):
    """Rank countries by their largest ``total_vaccinations`` value.

    Parameters
    ----------
    data : DataFrame with at least "country" and "total_vaccinations" columns.

    Returns
    -------
    DataFrame with columns ["country", "total_vaccinations"], one row per
    country, sorted from highest to lowest, with a fresh 0..n-1 index.
    """
    column = "total_vaccinations"
    per_country = data[["country", column]].groupby(by="country").max()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

max_total_vaccinated(data1)

In [None]:
def sum_people_vaccinated(data):
    """Rank countries by the sum of their ``people_vaccinated`` values.

    Parameters
    ----------
    data : DataFrame with at least "country" and "people_vaccinated" columns.

    Returns
    -------
    DataFrame with columns ["country", "people_vaccinated"], one row per
    country, sorted from highest to lowest, with a fresh 0..n-1 index.

    NOTE(review): if people_vaccinated is a cumulative count per date, the sum
    over rows is not a head-count — confirm the intended aggregate.
    """
    column = "people_vaccinated"
    per_country = data[["country", column]].groupby(by="country").sum()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

sum_people_vaccinated(data1)

In [None]:
def sum_people_fully_vaccinated(data):
    """Rank countries by the sum of their ``people_fully_vaccinated`` values.

    Parameters
    ----------
    data : DataFrame with at least "country" and "people_fully_vaccinated" columns.

    Returns
    -------
    DataFrame with columns ["country", "people_fully_vaccinated"], one row per
    country, sorted from highest to lowest, with a fresh 0..n-1 index.

    NOTE(review): if people_fully_vaccinated is a cumulative count per date, the
    sum over rows is not a head-count — confirm the intended aggregate.
    """
    column = "people_fully_vaccinated"
    per_country = data[["country", column]].groupby(by="country").sum()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

sum_people_fully_vaccinated(data1)

In [None]:
def avg_total_vaccinated_per_hundred(data):
    """Rank countries by their mean ``total_vaccinations_per_hundred``.

    Parameters
    ----------
    data : DataFrame with at least "country" and
        "total_vaccinations_per_hundred" columns.

    Returns
    -------
    DataFrame with columns ["country", "total_vaccinations_per_hundred"], one
    row per country, sorted from highest to lowest, with a fresh 0..n-1 index.
    """
    column = "total_vaccinations_per_hundred"
    per_country = data[["country", column]].groupby(by="country").mean()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

avg_total_vaccinated_per_hundred(data1)

In [None]:
def avg_people_vaccinated_per_hundred(data):
    """Rank countries by their mean ``people_vaccinated_per_hundred``.

    Parameters
    ----------
    data : DataFrame with at least "country" and
        "people_vaccinated_per_hundred" columns.

    Returns
    -------
    DataFrame with columns ["country", "people_vaccinated_per_hundred"], one
    row per country, sorted from highest to lowest, with a fresh 0..n-1 index.
    """
    column = "people_vaccinated_per_hundred"
    per_country = data[["country", column]].groupby(by="country").mean()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

avg_people_vaccinated_per_hundred(data1)

In [None]:
def avg_people_fully_vaccinated(data):
    """Rank countries by their mean ``people_fully_vaccinated_per_hundred``.

    Parameters
    ----------
    data : DataFrame with at least "country" and
        "people_fully_vaccinated_per_hundred" columns.

    Returns
    -------
    DataFrame with columns ["country", "people_fully_vaccinated_per_hundred"],
    one row per country, sorted from highest to lowest, with a fresh 0..n-1 index.
    """
    column = "people_fully_vaccinated_per_hundred"
    per_country = data[["country", column]].groupby(by="country").mean()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

avg_people_fully_vaccinated(data1)

In [None]:
def avg_daily_vaccinations(data):
    """Rank countries by their mean ``daily_vaccinations_per_million``.

    Parameters
    ----------
    data : DataFrame with at least "country" and
        "daily_vaccinations_per_million" columns.

    Returns
    -------
    DataFrame with columns ["country", "daily_vaccinations_per_million"], one
    row per country, sorted from highest to lowest, with a fresh 0..n-1 index.
    """
    column = "daily_vaccinations_per_million"
    per_country = data[["country", column]].groupby(by="country").mean()
    ranked = per_country.sort_values(by=column, ascending=False)
    return ranked.reset_index()

avg_daily_vaccinations(data1)

In [None]:
def min_daily_vaccination_country(data):
    """Find each country's smallest non-zero daily vaccination count and its date.

    Parameters
    ----------
    data : DataFrame with "country", "date" and "daily_vaccinations" columns.

    Returns
    -------
    DataFrame with columns ["country", "Min Daily Vaccination",
    "Date - Min Daily Vaccination"], sorted ascending by the minimum. Countries
    with no non-zero value are kept at the end with NaN / NaT.
    """
    daily = data.pivot_table(
        index="country", columns="date", values="daily_vaccinations"
    )
    # Zeros are treated as "no data" so they cannot win the minimum.
    daily = daily.replace(0.00, np.nan)

    summary = daily.min(axis=1).to_frame("Min Daily Vaccination")
    # Locate the date on the date-only frame, before any derived column exists,
    # so the answer does not depend on tie-breaking against the "Min" column.
    # Rows that are entirely NaN are skipped (idxmin warns or raises on them in
    # recent pandas) and re-inserted as NaT by the reindex.
    has_value = daily.notna().any(axis=1)
    summary["Date - Min Daily Vaccination"] = (
        daily[has_value].idxmin(axis=1).reindex(daily.index)
    )

    summary = summary.sort_values(by="Min Daily Vaccination", ascending=True)
    return summary.rename_axis("", axis=1).reset_index()

min_daily_vaccination_country(data1)

In [None]:
def max_daily_vaccination_country(data):
    """Find each country's largest daily vaccination count and the date it occurred.

    Parameters
    ----------
    data : DataFrame with "country", "date" and "daily_vaccinations" columns.

    Returns
    -------
    DataFrame with columns ["country", "Max Daily Vaccination",
    "Date - Max Daily Vaccination"], sorted descending by the maximum.
    """
    daily = data.pivot_table(
        index="country", columns="date", values="daily_vaccinations"
    )

    summary = daily.max(axis=1).to_frame("Max Daily Vaccination")
    # Locate the date on the date-only frame, before any derived column exists,
    # so the answer does not depend on tie-breaking against the "Max" column.
    summary["Date - Max Daily Vaccination"] = daily.idxmax(axis=1)

    summary = summary.sort_values(by="Max Daily Vaccination", ascending=False)
    return summary.rename_axis("", axis=1).reset_index()

max_daily_vaccination_country(data1)

Top and bottom countries based on Total Vaccination

In [None]:
# Configure the theme in ONE call. `sns.set` is an alias of `sns.set_theme`, so
# calling it after `set_theme(style="whitegrid")` reset the style to its default
# and silently discarded the white grid.
sns.set_theme(style="whitegrid", rc={"figure.figsize": (12, 5)})

# Build the ranking once and slice it for both panels.
total_ranking = max_total_vaccinated(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(axes, ("Top", "Bottom"), (total_ranking.head(), total_ranking.tail()))
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="total_vaccinations", ax=ax)
    ax.set(
        xlabel="",
        ylabel="Total Vaccinations",
        title=f"{label} countries based on Total Vaccination",
    )

fig.tight_layout()
plt.show()

Top and bottom countries based on People Vaccinated

In [None]:
# Five highest and five lowest countries by summed people_vaccinated,
# drawn as two stacked bar charts.
people_ranking = sum_people_vaccinated(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(axes, ("Top", "Bottom"), (people_ranking.head(), people_ranking.tail()))
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="people_vaccinated", ax=ax)
    ax.set(
        xlabel="",
        ylabel="People Vaccinated",
        title=f"{label} countries based on People Vaccinated",
    )

fig.tight_layout()
plt.show()

Top and bottom countries based on People Fully Vaccinated

In [None]:
# Five highest and five lowest countries by summed people_fully_vaccinated,
# drawn as two stacked bar charts. The ranking is built once and sliced.
fully_ranking = sum_people_fully_vaccinated(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(axes, ("Top", "Bottom"), (fully_ranking.head(), fully_ranking.tail()))
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="people_fully_vaccinated", ax=ax)
    ax.set(
        xlabel="",
        ylabel="People Fully Vaccinated",
        # The second panel's title previously read "Vottom ..." (typo).
        title=f"{label} countries based on People Fully Vaccinated",
    )

fig.tight_layout()
plt.show()

Top and bottom countries based on Total Vaccinations Per Hundred

In [None]:
# Five highest and five lowest countries by mean total_vaccinations_per_hundred,
# drawn as two stacked bar charts.
per_hundred_ranking = avg_total_vaccinated_per_hundred(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(
    axes, ("Top", "Bottom"), (per_hundred_ranking.head(), per_hundred_ranking.tail())
)
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="total_vaccinations_per_hundred", ax=ax)
    ax.set(
        xlabel="",
        ylabel="Average Vaccinations per 100",
        title=f"{label} countries based on Total Vaccinations Per Hundred",
    )

fig.tight_layout(h_pad=3)
plt.show()

Top and bottom countries based on People Vaccinated Per Hundred

In [None]:
# Five highest and five lowest countries by mean people_vaccinated_per_hundred,
# drawn as two stacked bar charts.
people_rate_ranking = avg_people_vaccinated_per_hundred(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(
    axes, ("Top", "Bottom"), (people_rate_ranking.head(), people_rate_ranking.tail())
)
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="people_vaccinated_per_hundred", ax=ax)
    ax.set(
        xlabel="",
        ylabel="People Vaccinated per 100",
        title=f"{label} countries based on People Vaccinated Per Hundred",
    )

fig.tight_layout()
plt.show()

Top and bottom countries based on People Fully Vaccinated Per Hundred

In [None]:
# Five highest and five lowest countries by mean
# people_fully_vaccinated_per_hundred, drawn as two stacked bar charts.
fully_rate_ranking = avg_people_fully_vaccinated(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(
    axes, ("Top", "Bottom"), (fully_rate_ranking.head(), fully_rate_ranking.tail())
)
for ax, label, subset in panels:
    sns.barplot(
        data=subset, x="country", y="people_fully_vaccinated_per_hundred", ax=ax
    )
    ax.set(
        xlabel="",
        ylabel="People Fully Vaccinated per 100",
        title=f"{label} countries based on People Fully Vaccinated Per Hundred",
    )

fig.tight_layout(h_pad=3)
plt.show()

Top and bottom Daily Vaccination based on Country

In [None]:
# Side-by-side bar charts: the first five rows of the per-country maximum table
# (left) and of the per-country minimum table (right), coloured by the date on
# which each extreme occurred.
extremes = (
    ("Max", max_daily_vaccination_country(data1).head()),
    ("Min", min_daily_vaccination_country(data1).head()),
)

fig, axes = plt.subplots(1, 2)
for ax, (kind, table) in zip(axes, extremes):
    sns.barplot(
        data=table,
        x="country",
        y=f"{kind} Daily Vaccination",
        hue=f"Date - {kind} Daily Vaccination",
        ax=ax,
    )
    ax.set(
        xlabel="",
        ylabel="Daily Vaccination",
        title=f"{kind} Daily Vaccination by Country",
    )

fig.tight_layout()
plt.show()

Top and bottom Daily Vaccination based on Country Per Million

In [None]:
# Five highest and five lowest countries by mean daily_vaccinations_per_million,
# drawn as two stacked bar charts.
daily_rate_ranking = avg_daily_vaccinations(data1)

fig, axes = plt.subplots(2, 1)
panels = zip(
    axes, ("Top", "Bottom"), (daily_rate_ranking.head(), daily_rate_ranking.tail())
)
for ax, label, subset in panels:
    sns.barplot(data=subset, x="country", y="daily_vaccinations_per_million", ax=ax)
    ax.set(
        xlabel="",
        ylabel="Daily Vaccinations per Million",
        title=f"{label} Daily Vaccination based on Country Per Million",
    )

fig.tight_layout(h_pad=3)
plt.show()

Evolution of Total Vaccinations

In [None]:
sns.lineplot(x="month", y="total_vaccinations", data=data1)

In [None]:
sns.lineplot(x="day_of_year", y="total_vaccinations", data=data1)

In [None]:
sns.lineplot(x="day", y="total_vaccinations", data=data1)

In [None]:
sns.lineplot(x="date", y="total_vaccinations", data=data1)

Share of each vaccine

In [None]:
# Sum of total_vaccinations per vaccine across all rows of data2, shown as a pie
# chart with one-decimal percentage labels.
by_vaccine = data2.groupby("vaccine")
shares = by_vaccine["total_vaccinations"].sum()
plt.pie(
    shares,
    labels=shares.index,
    autopct="%1.1f%%",
    radius=4,
)

In [None]:
plt.bar(shares.index, shares, color="#ff9999")
plt.xticks(rotation=90)

Correlation Heatmap

In [None]:
# Pairwise correlations of the numeric columns only. data1 still holds string
# columns here, and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so the frame is restricted to numeric dtypes first.
fig, ax = plt.subplots(figsize=(12, 12))
numeric_corr = data1.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, square=True, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

#### Data Preprocessing

In [None]:
data1.info()

In [None]:
# Remove identifier, provenance and most calendar columns before modelling
# ("month" is not in the list and therefore stays in the frame).
unused_columns = [
    "country",
    "iso_code",
    "date",
    "source_name",
    "source_website",
    "day",
    "year",
    "week_of_year",
    "day_of_year",
    "day_of_week",
]
data1 = data1.drop(columns=unused_columns)

In [None]:
data1.info()

#### Encoding

In [None]:
data1 = pd.get_dummies(data1, columns=["vaccines"])

In [None]:
data1.info()

In [None]:
data1.head()

#### Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every column of data1 with StandardScaler and rebuild a DataFrame
# under the original column names (the new frame gets a fresh default index).
# NOTE(review): the scaler is fitted on ALL rows and ALL columns — including the
# "total_vaccinations" target — before the train/test split further down, so
# test rows influence the scaling and the reported errors are in standardized
# units. Consider fitting the scaler on the training features only.
columns = data1.columns
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data1)
data1 = pd.DataFrame(scaled_features, columns=columns)

In [None]:
data1.head()

#### Train and test

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Separate the target column from the predictors, then hold out 20% of the rows
# for evaluation with a fixed seed.
target = data1["total_vaccinations"]
features = data1.drop(columns="total_vaccinations")

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=0
)

##### Multiple Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split (fit returns the estimator
# itself), then predict the held-out rows.
model_mlr = LinearRegression().fit(X_train, y_train)
y_pred_mlr = model_mlr.predict(X_test)

In [None]:
sns.histplot(y_test - y_pred_mlr)

In [None]:
# Predicted vs. actual target for the linear model, coloured by the actual value.
# The colormap is named `cmap` (not `map`) so the built-in `map` is not shadowed
# for the rest of the notebook.
cmap = sns.cubehelix_palette(as_cmap=True)
fig, ax = plt.subplots()
points = ax.scatter(y_test, y_pred_mlr, c=y_test, cmap=cmap)
ax.set(
    xlabel="Actual total_vaccinations",
    ylabel="Predicted total_vaccinations",
    title="Multiple Regression: predicted vs. actual",
)
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Test-set metrics for the linear model; the MSE is computed once and reused
# for the RMSE.
mse_mlr = metrics.mean_squared_error(y_test, y_pred_mlr)
print("R2 Score:", metrics.r2_score(y_test, y_pred_mlr))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred_mlr))
print("Mean Squared Error:", mse_mlr)
print("Root Mean Squared Error:", np.sqrt(mse_mlr))

##### RandomForest Regression

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap samples, feature sub-sampling); seed
# the estimator so Restart & Run All reproduces the same scores. The seed matches
# the random_state=0 already used for the train/test split.
model_rf = RandomForestRegressor(random_state=0)
model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)

In [None]:
sns.histplot(y_test - y_pred_rf)

In [None]:
# Predicted vs. actual target for the random forest, coloured by the actual value.
# The colormap is named `cmap` (not `map`) so the built-in `map` is not shadowed
# for the rest of the notebook.
cmap = sns.cubehelix_palette(as_cmap=True)
fig, ax = plt.subplots()
points = ax.scatter(y_test, y_pred_rf, c=y_test, cmap=cmap)
ax.set(
    xlabel="Actual total_vaccinations",
    ylabel="Predicted total_vaccinations",
    title="Random Forest: predicted vs. actual",
)
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Test-set metrics for the random forest; the MSE is computed once and reused
# for the RMSE.
mse_rf = metrics.mean_squared_error(y_test, y_pred_rf)
print("R2 Score:", metrics.r2_score(y_test, y_pred_rf))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred_rf))
print("Mean Squared Error:", mse_rf)
print("Root Mean Squared Error:", np.sqrt(mse_rf))

##### KNN Regression

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 2-nearest-neighbours regressor on the training split (fit returns the
# estimator itself), then predict the held-out rows.
model_knn = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)

In [None]:
sns.histplot(y_test - y_pred_knn)

In [None]:
# Predicted vs. actual target for the KNN model, coloured by the actual value.
# The colormap is named `cmap` (not `map`) so the built-in `map` is not shadowed
# for the rest of the notebook.
cmap = sns.cubehelix_palette(as_cmap=True)
fig, ax = plt.subplots()
points = ax.scatter(y_test, y_pred_knn, c=y_test, cmap=cmap)
ax.set(
    xlabel="Actual total_vaccinations",
    ylabel="Predicted total_vaccinations",
    title="KNN: predicted vs. actual",
)
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Test-set metrics for the KNN model; the MSE is computed once and reused for
# the RMSE.
mse_knn = metrics.mean_squared_error(y_test, y_pred_knn)
print("R2 Score:", metrics.r2_score(y_test, y_pred_knn))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred_knn))
print("Mean Squared Error:", mse_knn)
print("Root Mean Squared Error:", np.sqrt(mse_knn))

##### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Tree construction involves randomness (feature permutation, tie-breaking
# between equally good splits); seed the estimator so Restart & Run All
# reproduces the same scores. The seed matches the random_state=0 already used
# for the train/test split.
model_tree = DecisionTreeRegressor(random_state=0)
model_tree.fit(X_train, y_train)
y_pred_dt = model_tree.predict(X_test)

In [None]:
sns.histplot(y_test - y_pred_dt)

In [None]:
# Predicted vs. actual target for the decision tree, coloured by the actual value.
# The colormap is named `cmap` (not `map`) so the built-in `map` is not shadowed
# for the rest of the notebook.
cmap = sns.cubehelix_palette(as_cmap=True)
fig, ax = plt.subplots()
points = ax.scatter(y_test, y_pred_dt, c=y_test, cmap=cmap)
ax.set(
    xlabel="Actual total_vaccinations",
    ylabel="Predicted total_vaccinations",
    title="Decision Tree: predicted vs. actual",
)
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Test-set metrics for the decision tree; the MSE is computed once and reused
# for the RMSE.
mse_dt = metrics.mean_squared_error(y_test, y_pred_dt)
print("R2 Score:", metrics.r2_score(y_test, y_pred_dt))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred_dt))
print("Mean Squared Error:", mse_dt)
print("Root Mean Squared Error:", np.sqrt(mse_dt))

##### SVM Regression

In [None]:
from sklearn.svm import SVR

# Fit a support-vector regressor with an RBF kernel on the training split (fit
# returns the estimator itself), then predict the held-out rows.
model_svm = SVR(kernel="rbf").fit(X_train, y_train)
y_pred_svm = model_svm.predict(X_test)

In [None]:
sns.histplot(y_test - y_pred_svm)

In [None]:
# Predicted vs. actual target for the SVR model, coloured by the actual value.
# The colormap is named `cmap` (not `map`) so the built-in `map` is not shadowed
# for the rest of the notebook.
cmap = sns.cubehelix_palette(as_cmap=True)
fig, ax = plt.subplots()
points = ax.scatter(y_test, y_pred_svm, c=y_test, cmap=cmap)
ax.set(
    xlabel="Actual total_vaccinations",
    ylabel="Predicted total_vaccinations",
    title="SVM Regression: predicted vs. actual",
)
fig.colorbar(points, ax=ax)
plt.show()

In [None]:
# Test-set metrics for the SVR model; the MSE is computed once and reused for
# the RMSE.
mse_svm = metrics.mean_squared_error(y_test, y_pred_svm)
print("R2 Score:", metrics.r2_score(y_test, y_pred_svm))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred_svm))
print("Mean Squared Error:", mse_svm)
print("Root Mean Squared Error:", np.sqrt(mse_svm))