### Importing Libraries

In [9]:
import math
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel
# Show up to 50 columns when a DataFrame is displayed.
# The fully-qualified option name is required: the abbreviated 'max_columns' is resolved
# by pattern matching, and on recent pandas versions it matches more than one option
# (e.g. 'display.max_columns' and 'styler.render.max_columns'), which raises OptionError.
pd.set_option('display.max_columns', 50)

### Importing dataset

In [10]:
# Load the cleaned dataset and discard the 'Unnamed: 0' column
# (presumably an index that was written out when the CSV was saved - TODO confirm).
df = pd.read_csv('./datasets/dataset_clean.csv').drop(columns='Unnamed: 0')

# Preview the first rows
df.head()

Unnamed: 0,Country,Vaccines are not important for children to have (%),Vaccines are not safe (%),Vaccines are not effective (%),Overall Avg. (%),% of one years-old not vaccinated,Vaccination policy
0,Afghanistan,2.1,4.47,1.84,2.803333,34.0,
1,Albania,1.67,15.85,8.96,8.826667,1.0,
2,Algeria,3.7,11.28,7.7,7.56,9.0,
3,Argentina,1.21,4.85,2.95,3.003333,14.0,
4,Armenia,12.18,20.57,12.47,15.073333,8.0,


In [11]:
# List the distinct policy codes present in the data (missing values included)
df.loc[:, 'Vaccination policy'].unique()

array([nan, 'RA', 'MA'], dtype=object)

## Matched Pairs Test

In [15]:
# Split the countries into three groups according to their 'Vaccination policy' code
policy = df['Vaccination policy']

# 'MA' rows
mandatory = df.loc[policy == 'MA']
# 'RA' rows
recommended = df.loc[policy == 'RA']
# Every row that is neither 'MA' nor 'RA' (this includes rows with a missing policy)
not_mandatory = df.loc[~policy.isin(['MA', 'RA'])]

In [16]:
# The two columns compared by the paired t-test within each policy group
opinion_col = 'Vaccines are not important for children to have (%)'
unvaccinated_col = '% of one years-old not vaccinated'

# One paired (related-samples) t-test per group: each country contributes one pair
mandatory_ttest = ttest_rel(mandatory[opinion_col], mandatory[unvaccinated_col])
not_mandatory_ttest = ttest_rel(not_mandatory[opinion_col], not_mandatory[unvaccinated_col])
recommended_ttest = ttest_rel(recommended[opinion_col], recommended[unvaccinated_col])

# Report the statistic and p-value of every test under its group heading
reports = (
    ("Countries with mandatory vaccination policy:", mandatory_ttest),
    ("Countries with no mandatory vaccination policy:", not_mandatory_ttest),
    ("Countries with recommended vaccination policy:", recommended_ttest),
)
for heading, result in reports:
    t_value, p_value = result
    print(heading)
    print("T-Value: ", t_value)
    print("P-Value: ", p_value)

Countries with mandatory vaccination policy:
T-Value:  -1.7295783540481857
P-Value:  0.11776295568423537
Countries with no mandatory vaccination policy:
T-Value:  -8.879513786171822
P-Value:  2.1237453375016956e-14
Countries with recommended vaccination policy:
T-Value:  -2.5004785638519773
P-Value:  0.024482666087043525


Countries with mandatory vaccination policy: 
- P-Value is higher than 0.05: there is no statistically significant difference between the mean share of people who disagree that vaccines are important for children to have and the mean percentage of one-year-olds not vaccinated. The null hypothesis cannot be rejected.
- Failing to reject the null hypothesis does not prove that the two means are equal, nor that the two distributions are similar; it only means that this sample provides no evidence of a difference between the mean share that disagrees that vaccines are important for children to have and the mean percentage of one-year-olds not vaccinated.

Countries with no mandatory vaccination policy (policy neither 'MA' nor 'RA', i.e. missing in the dataset):
- P-Value is lower than 0.05: there is a significant difference between the mean share of people who disagree that vaccines are important for children to have and the mean percentage of one-year-olds not vaccinated. The null hypothesis can be rejected.
- The mean % of one-year-olds not vaccinated is larger than the mean of Vaccines are not important for children to have (%): many children are not vaccinated, even though few people think that vaccinating children is not important.

In [14]:
# Check the direction of the difference for the group with no mandatory policy
# by printing the mean of each compared column (opinion share first, then unvaccinated share)
for column in ('Vaccines are not important for children to have (%)',
               '% of one years-old not vaccinated'):
    print(not_mandatory[column].mean())

2.8523966942148746
12.958677685950413
