## Importing librairies

In [1]:
import pandas as pd
import numpy as np
import fuzzy_pandas as fpd
import requests
from bs4 import BeautifulSoup

## Survey Dataset

### Content to keep:
#### Sheet:
- Crosstabs all countries

#### Questions:
- Q24 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are important for children to have.
- Q25 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are safe.
- Q26 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are effective.

#### Columns
- Country
- Question
- Response
- National (%, count)

In [2]:
path = './datasets/wgm2018-dataset-crosstabs-all-countries.xlsx'
survey = pd.read_excel(path, sheet_name = 'Crosstabs all countries', index = [3], header = [2])
survey.head()

Unnamed: 0,Country,Question,Response,Column N %,Unweighted Count,Column N %.1,Unweighted Count.1,Column N %.2,Unweighted Count.2,Column N %.3,...,Column N %.16,Unweighted Count.16,Column N %.17,Unweighted Count.17,Column N %.18,Unweighted Count.18,Column N %.19,Unweighted Count.19,Column N %.20,Unweighted Count.20
0,Afghanistan,"Q1 How much do you, personally, know about sci...",A lot,0.024561,30,0.045239,23,0.004403,7,0.029301,...,0.022139,4,0.049635,14,0.0,0,0.044383,6,0.022938,24
1,Afghanistan,"Q1 How much do you, personally, know about sci...",Some,0.337138,393,0.416994,230,0.25929,163,0.408161,...,0.355557,76,0.472349,138,0.0,0,0.479065,61,0.320562,326
2,Afghanistan,"Q1 How much do you, personally, know about sci...",Not much,0.309527,287,0.34742,170,0.272586,117,0.283687,...,0.290543,57,0.275725,68,0.0,0,0.338818,32,0.307318,251
3,Afghanistan,"Q1 How much do you, personally, know about sci...",Nothing at all,0.292751,257,0.188135,75,0.394737,182,0.244218,...,0.298553,58,0.185203,41,0.0,0,0.137733,12,0.310804,244
4,Afghanistan,"Q1 How much do you, personally, know about sci...",(DK),0.036023,33,0.002212,2,0.068984,31,0.034633,...,0.033208,6,0.017088,5,0.0,0,0.0,0,0.038378,32


In [3]:
# Rename columns
survey = survey.rename(columns = {'Column N %':'Percentage', 'Unweighted Count':'Count'})

In [4]:
# Keeping only national results
survey = survey.iloc[:, :5]
survey

Unnamed: 0,Country,Question,Response,Percentage,Count
0,Afghanistan,"Q1 How much do you, personally, know about sci...",A lot,0.024561,30
1,Afghanistan,"Q1 How much do you, personally, know about sci...",Some,0.337138,393
2,Afghanistan,"Q1 How much do you, personally, know about sci...",Not much,0.309527,287
3,Afghanistan,"Q1 How much do you, personally, know about sci...",Nothing at all,0.292751,257
4,Afghanistan,"Q1 How much do you, personally, know about sci...",(DK),0.036023,33
...,...,...,...,...,...
35840,Zimbabwe,How a person views personal & societal benefit...,Included,0.247805,250
35841,Zimbabwe,How a person views personal & societal benefit...,Excluded,0.107581,106
35842,Zimbabwe,How a person views personal & societal benefit...,Sceptic,0.076144,78
35843,Zimbabwe,How a person views personal & societal benefit...,Did not answer one of two questions,0.112203,92


In [5]:
# Keeping only rows with Q24, 25 and 26
survey = survey.loc[(survey.Question.str.contains('Q24', case = False)) 
                   | (survey.Question.str.contains('Q25', case = False))
                   | (survey.Question.str.contains('Q26', case = False))]
survey

Unnamed: 0,Country,Question,Response,Percentage,Count
166,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Strongly agree,0.846256,843
167,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Somewhat agree,0.129257,126
168,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Neither agree nor disagree,0.003520,3
169,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Somewhat disagree,0.015567,15
170,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.005400,6
...,...,...,...,...,...
35808,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Neither agree nor disagree,0.055830,47
35809,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Somewhat disagree,0.009500,10
35810,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.009019,7
35811,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Don't know/Refused,0.009236,8


In [6]:
# Column N % in %
survey['Percentage'] = [round(x * 100, 2) for x in survey['Percentage']]
survey

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Country,Question,Response,Percentage,Count
166,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Strongly agree,84.63,843
167,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Somewhat agree,12.93,126
168,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Neither agree nor disagree,0.35,3
169,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Somewhat disagree,1.56,15
170,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.54,6
...,...,...,...,...,...
35808,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Neither agree nor disagree,5.58,47
35809,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Somewhat disagree,0.95,10
35810,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.90,7
35811,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Don't know/Refused,0.92,8


In [7]:
# Keeping only disagree responses
survey = survey.loc[(survey.Response.str.contains('Somewhat disagree', case = False)) 
                   | (survey.Response.str.contains('Strongly disagree', case = False))]
survey

Unnamed: 0,Country,Question,Response,Percentage,Count
169,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Somewhat disagree,1.56,15
170,Afghanistan,"Q24 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.54,6
175,Afghanistan,"Q25 Do you strongly or somewhat agree, strongl...",Somewhat disagree,3.55,35
176,Afghanistan,"Q25 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.92,10
182,Afghanistan,"Q26 Do you strongly or somewhat agree, strongl...",Somewhat disagree,1.07,11
...,...,...,...,...,...
35797,Zimbabwe,"Q24 Do you strongly or somewhat agree, strongl...",Strongly disagree,0.50,6
35802,Zimbabwe,"Q25 Do you strongly or somewhat agree, strongl...",Somewhat disagree,1.86,18
35803,Zimbabwe,"Q25 Do you strongly or somewhat agree, strongl...",Strongly disagree,1.90,18
35809,Zimbabwe,"Q26 Do you strongly or somewhat agree, strongl...",Somewhat disagree,0.95,10


In [8]:
# Cleaning name
survey['Country'] = survey['Country'].apply(lambda x: x.split(',')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [9]:
# Pivot table by country - grouping disagree responses

survey_pivot = pd.pivot_table(index = 'Country', columns = 'Question', 
                       values = 'Percentage', aggfunc = 'sum', data = survey)
survey_pivot.rename(columns={'Q24 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are important for children to have.':
                             'Vaccines are not important for children to have (%)', 
                             'Q25 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are safe.':
                             'Vaccines are not safe (%)', 
                             'Q26 Do you strongly or somewhat agree, strongly or somewhat disagree or neither agree nor disagree with the following statement? Vaccines are effective.':
                            'Vaccines are not effective (%)'}, inplace=True)
survey_pivot


Question,Vaccines are not important for children to have (%),Vaccines are not safe (%),Vaccines are not effective (%)
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,2.10,4.47,1.84
Albania,1.67,15.85,8.96
Algeria,3.70,11.28,7.70
Argentina,1.21,4.85,2.95
Armenia,12.18,20.57,12.47
...,...,...,...
Venezuela,0.15,4.05,2.49
Vietnam,1.39,8.11,3.57
Yemen,1.84,3.06,3.22
Zambia,2.34,10.28,8.15


In [10]:
# Adding overall avg.
survey_pivot['Overall Avg. (%)'] = survey_pivot.mean(axis = 1, skipna = True) 

In [11]:
survey_pivot

Question,Vaccines are not important for children to have (%),Vaccines are not safe (%),Vaccines are not effective (%),Overall Avg. (%)
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,2.10,4.47,1.84,2.803333
Albania,1.67,15.85,8.96,8.826667
Algeria,3.70,11.28,7.70,7.560000
Argentina,1.21,4.85,2.95,3.003333
Armenia,12.18,20.57,12.47,15.073333
...,...,...,...,...
Venezuela,0.15,4.05,2.49,2.230000
Vietnam,1.39,8.11,3.57,4.356667
Yemen,1.84,3.06,3.22,2.706667
Zambia,2.34,10.28,8.15,6.923333


In [12]:
# Saving cleaned dataset
survey_pivot.to_excel('./datasets/survey_clean.xlsx')

## Immunization

### Content
- Country Name
- Country Code
- Indicator Name
- Indicator Code 
- Data from 1960 to 2018

### To keep:
- Country Name
- Indicator Name
- Data 2018

In [13]:
immu = pd.read_csv('./datasets/WDIData.csv')
immu.head()

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
0,Arab World,ARB,"2005 PPP conversion factor, GDP (LCU per inter...",PA.NUS.PPP.05,,,,,,,...,,,,,,,,,,
1,Arab World,ARB,"2005 PPP conversion factor, private consumptio...",PA.NUS.PRVT.PP.05,,,,,,,...,,,,,,,,,,
2,Arab World,ARB,Access to clean fuels and technologies for coo...,EG.CFT.ACCS.ZS,,,,,,,...,82.783289,83.120303,83.533457,83.897596,84.171599,84.510171,,,,
3,Arab World,ARB,Access to electricity (% of population),EG.ELC.ACCS.ZS,,,,,,,...,86.428272,87.070576,88.176836,87.342739,89.130121,89.678685,90.273687,,,
4,Arab World,ARB,"Access to electricity, rural (% of rural popul...",EG.ELC.ACCS.RU.ZS,,,,,,,...,73.942103,75.244104,77.162305,75.538976,78.741152,79.665635,80.749293,,,


In [14]:
# Keeping only rows with immunization indicators
immu = immu.loc[(immu['Indicator Name'].str.contains('immunization', case = False)) 
                & (immu['Indicator Name'].str.contains('% of one-year-old children'))] 
immu

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,Unnamed: 64
588,Arab World,ARB,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,89.128433,85.229495,87.024230,85.499756,84.614608,85.676629,86.869923,86.268804,,
2017,Caribbean small states,CSS,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,91.889481,94.309270,93.418916,92.967658,92.098611,96.525429,91.311438,96.339519,,
3446,Central Europe and the Baltics,CEB,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,97.141350,96.611042,96.212978,95.670205,94.271654,93.753374,93.131930,91.997551,,
4875,Early-demographic dividend,EAR,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,67.392136,79.147284,78.359261,81.056405,84.385574,86.360561,84.690535,84.302717,,
6304,East Asia & Pacific,EAS,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,92.659343,93.684316,92.679681,92.043236,91.692084,93.665168,92.568566,91.092703,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
370699,Virgin Islands (U.S.),VIR,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,,,,,,,,,,
372128,West Bank and Gaza,PSE,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,,,,,,,,,,
373557,"Yemen, Rep.",YEM,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,69.000000,67.000000,73.000000,73.000000,69.000000,71.000000,68.000000,65.000000,,
374986,Zambia,ZMB,"Immunization, HepB3 (% of one-year-old children)",SH.IMM.HEPB,,,,,,,...,81.000000,78.000000,79.000000,86.000000,90.000000,91.000000,94.000000,90.000000,,


In [15]:
# Keeping only columns to be used
immu = immu[['Country Name', 'Indicator Name', '2018']]

In [16]:
# Cleaning country name
immu['Country Name'] = immu['Country Name'].apply(lambda x: x.split(',')[0])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
# Keeping only countries
immu = immu.iloc[47:]
immu.head(50)

Unnamed: 0,Country Name,Indicator Name,2018
67751,Afghanistan,"Immunization, HepB3 (% of one-year-old children)",66.0
69180,Albania,"Immunization, HepB3 (% of one-year-old children)",99.0
70609,Algeria,"Immunization, HepB3 (% of one-year-old children)",91.0
72038,American Samoa,"Immunization, HepB3 (% of one-year-old children)",
73467,Andorra,"Immunization, HepB3 (% of one-year-old children)",98.0
74896,Angola,"Immunization, HepB3 (% of one-year-old children)",59.0
76325,Antigua and Barbuda,"Immunization, HepB3 (% of one-year-old children)",95.0
77754,Argentina,"Immunization, HepB3 (% of one-year-old children)",86.0
79183,Armenia,"Immunization, HepB3 (% of one-year-old children)",92.0
80612,Aruba,"Immunization, HepB3 (% of one-year-old children)",


In [18]:
# Dropping rows with nulls data for 2018
immu = immu.dropna(axis = 0)
immu

Unnamed: 0,Country Name,Indicator Name,2018
67751,Afghanistan,"Immunization, HepB3 (% of one-year-old children)",66.0
69180,Albania,"Immunization, HepB3 (% of one-year-old children)",99.0
70609,Algeria,"Immunization, HepB3 (% of one-year-old children)",91.0
73467,Andorra,"Immunization, HepB3 (% of one-year-old children)",98.0
74896,Angola,"Immunization, HepB3 (% of one-year-old children)",59.0
...,...,...,...
367841,Venezuela,"Immunization, HepB3 (% of one-year-old children)",60.0
369270,Vietnam,"Immunization, HepB3 (% of one-year-old children)",75.0
373557,Yemen,"Immunization, HepB3 (% of one-year-old children)",65.0
374986,Zambia,"Immunization, HepB3 (% of one-year-old children)",90.0


In [19]:
# Column: % of children not vaccinated
immu['% of one years-old not vaccinated'] = [round(100 - x, 2) for x in immu['2018']]
immu = immu.drop(['2018'], axis = 1)

In [20]:
immu = immu.drop(['Indicator Name'], axis = 1)

In [21]:
immu.head()

Unnamed: 0,Country Name,% of one years-old not vaccinated
67751,Afghanistan,34.0
69180,Albania,1.0
70609,Algeria,9.0
73467,Andorra,2.0
74896,Angola,41.0


In [22]:
immu.to_csv('./datasets/immu_clean.csv')

## Vaccination policy

Legend: 
<img src="./img/legend.png">

In [23]:
url = 'https://en.m.wikipedia.org/wiki/Vaccination_policy'

html = requests.get(url).content
soup = BeautifulSoup(html, "lxml")

In [24]:
table_body = soup.find_all('tbody')[1]
rows = table_body.find_all('td')
rows_clean = [r.split('\n\n') for row in rows for r in row.text.split('\xa0\xa0')]
rows_clean = [[r.replace('\n','').split('[')[0] for r in rows] for rows in rows_clean]

In [25]:
df = pd.DataFrame(rows_clean[1:42], columns = rows_clean[0])
df = df[['Countries', 'Strictest policy']]
df = df.rename(columns = {'Strictest policy':'Vaccination policy'})

In [26]:
df.to_csv('./datasets/wiki_clean.csv')