# Imports

In [1]:
import pingouin as pg
import scipy
import pandas as pd
import numpy as np

# Converting Responses to Points

- Yes/No/Unknown/NA responses are converted into points based on a predefined matrix.

In [2]:
def point_extract(item):
    """Pull the numeric point value out of one cell of the score matrix.

    The cell is stringified first. Text that begins with ``--`` yields NaN;
    otherwise everything before the first ``|`` is parsed as a float.

    Parameters
    ----------
    item : object
        Raw cell value; it is converted with ``str`` before parsing.

    Returns
    -------
    float
        The parsed score, or NaN for cells starting with ``--``.

    Raises
    ------
    ValueError
        If the text before the first ``|`` cannot be parsed by ``float``.
    """
    text = str(item)
    if text.startswith('--'):
        return float(np.nan)
    # Only the part in front of the first '|' carries the score.
    score_text, _, _ = text.partition('|')
    return float(score_text)

In [3]:
# Load the response -> score matrix and clean up the question labels
# (drop double quotes, the word "Rewritten" and surrounding whitespace).
qc = pd.read_csv('../data/response_score_matrix.csv')
qc['Original Question Number'] = [
    label.replace('"', '').replace('Rewritten', '').strip()
    for label in qc['Original Question Number']
]

# Index by the cleaned label, then transpose the frame.
qc = qc.set_index('Original Question Number').T

# Turn every answer column into numeric points.
for answer in ('Yes', 'No', 'Unknown', 'NA'):
    qc[answer] = qc[answer].apply(point_extract)
qc.head()

Original Question Number,Action Item (rewritten question),Importance,Category,Possible answers,Yes,No,Unknown,NA
T1,1. National vaccination rollout document(s) is...,1.0,Policy Transparency,Yes / No,1.0,-1.0,,
T2,2. National vaccination rollout document(s) ha...,1.0,Policy Transparency,Yes / No,1.0,-1.0,,
T3,3. There is a publicly available national vacc...,1.0,Policy Transparency,Yes / No / Unknown,1.0,-1.0,-0.5,
T11,11. National vaccination rollout document(s) i...,,Policy Transparency,Yes / No / NA,1.0,-1.0,,-2.0
T6,6. National vaccination rollout document(s) st...,,Policy Transparency,Yes / No / Unknown,1.0,-1.0,-0.5,


# Category Series

In [4]:
# Category label of every question, carrying the same index as `qc`.
category = pd.Series(qc['Category'])
category.index = qc.index

# Show the distinct categories, then preview the series.
distinct_categories = category.unique()
print(distinct_categories)
category.head()

['Policy Transparency' 'Undocumented Access'
 'Identification and Residency Requirements' 'Marginalized Access'
 'Privacy Guarantees']


T1     Policy Transparency
T2     Policy Transparency
T3     Policy Transparency
T11    Policy Transparency
T6     Policy Transparency
Name: Category, dtype: object

# Question Importance Series

In [5]:
tmp = pd.read_csv('../data/questions_cleaned.csv')

# Take the row at position 8 (all columns but the first) and normalise each
# label to stripped lower-case text.
# NOTE(review): position 8 appears to be the importance row -- confirm against the CSV.
qi = tmp.iloc[8, 1:].apply(lambda label: label.strip().lower())

In [6]:
# Distinct importance labels present in the data.
print(set(qi))

# Numeric weights are centred on MID and spaced STEP apart.
STEP = 0.5
MID = 1.0
ordinal2number = dict(zip(
    ('not so important', 'important', 'very important'),
    (MID - STEP, MID, MID + STEP),
))

# Map every label to its weight; an unexpected label raises KeyError.
question_importance = pd.Series(
    [ordinal2number[label] for label in qi],
    index=qi.index,
)
question_importance.head()

{'important', 'not so important', 'very important'}


T1     1.5
T2     1.5
T3     1.0
T11    0.5
T6     0.5
dtype: float64

# Responses `aggregated`

In [7]:
# Rows 0 and 1 hold the question text and category, so skip them.
raw = pd.read_csv('../data/final_responses.csv').iloc[2:, :]

# The first column carries the country names; use it as the index.
raw = raw.rename(columns={'Original Question Number': 'Country'}).set_index('Country')

# Keep only the response columns: every 5th column, starting at position 0.
n_items_per_question = 5
response_mask = [
    position % n_items_per_question == 0
    for position in range(raw.shape[1])
]
raw = raw.loc[:, response_mask]

# Missing values become the literal string "NA".
raw = raw.fillna('NA')

print(raw.shape)
raw.head()

(21, 23)


Unnamed: 0_level_0,T1,T2,T3,T11,T6,A1,T4,T5,A3,A11,...,T10,T7,A18,A19,A21,A26,A27,T12,A5,A31
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Belgium,Yes,Yes,No,,Yes,No,Unknown,Unknown,Unknown,Unknown,...,Yes,Yes,Unknown,Yes,Unknown,Yes,Unknown,No,,Unknown
Bulgaria,Yes,Yes,No,,Unknown,,Unknown,Unknown,Unknown,Unknown,...,Yes,Unknown,Unknown,Yes,Yes,Yes,Yes,Unknown,,Yes
Czech Republic,Yes,Yes,No,,Unknown,,No,Unknown,,,...,Yes,No,No,Yes,Yes,Yes,Unknown,Unknown,,No
Denmark,Yes,Yes,Unknown,,Unknown,,Yes,Unknown,Yes,Unknown,...,Unknown,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Unknown
Estonia,Yes,Yes,Yes,No,Unknown,,Unknown,Unknown,Unknown,Unknown,...,Unknown,No,No,Yes,Yes,Yes,Unknown,No,,Unknown


## Convert Responses to Points

In [8]:
def convert2points(question_number, response_series):
    """Look up the point score of each response given to one question.

    Parameters
    ----------
    question_number : label
        Row label in the module-level ``qc`` frame.
    response_series : iterable
        Responses; each one is used as a column label of ``qc``.

    Returns
    -------
    list
        ``qc.loc[question_number, response]`` for every response, in order.
    """
    points = []
    for response in response_series:
        points.append(qc.loc[question_number, response])
    return points


# Work on a deep copy so `raw` keeps the textual responses.
res = raw.copy(deep=True)

# Convert only when the first cell of `raw` is not already a float.
first_cell = raw.iloc[0, 0]
if not isinstance(first_cell, float):
    for question in raw.columns:
        res[question] = convert2points(question, res[question])

res.head()

Unnamed: 0_level_0,T1,T2,T3,T11,T6,A1,T4,T5,A3,A11,...,T10,T7,A18,A19,A21,A26,A27,T12,A5,A31
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Belgium,1.0,1.0,-1.0,-2.0,1.0,-1.0,-1.0,-1.0,-1.0,-0.5,...,1.0,1.0,-0.5,1.0,-0.5,1.0,-0.5,-1.0,-2.0,-1.0
Bulgaria,1.0,1.0,-1.0,-2.0,-0.5,-5.0,-1.0,-1.0,-1.0,-0.5,...,1.0,-0.5,-0.5,1.0,1.0,1.0,1.0,-1.0,-2.0,-1.0
Czech Republic,1.0,1.0,-1.0,-2.0,-0.5,-5.0,-2.0,-1.0,-2.0,-2.0,...,1.0,-1.0,-1.0,1.0,1.0,1.0,-0.5,-1.0,-2.0,1.0
Denmark,1.0,1.0,-0.5,-2.0,-0.5,-5.0,1.0,-1.0,1.0,-0.5,...,-0.5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0
Estonia,1.0,1.0,1.0,-1.0,-0.5,-5.0,-1.0,-1.0,-1.0,-0.5,...,-0.5,-1.0,-1.0,1.0,1.0,1.0,-0.5,-1.0,-2.0,-1.0


## Categorical Weighted Sum

In [9]:
# One frame per category: its question columns plus a weighted-sum column
# named after the category.
lst = []

for cat in category.unique():
    # Question numbers (T1, A2, etc.) that belong to this category
    questions_in_category = category.index[category == cat]

    # Explicit copy: the summary column added below must never write back into `res`
    cat_res = res.loc[:, questions_in_category].copy()
    cat_imp = question_importance[questions_in_category]

    # Importance-weighted sum for all countries in a single matrix-vector
    # product (rows = countries, columns = this category's questions).
    # The right-hand side is evaluated before the new column exists.
    cat_res[cat] = np.dot(cat_res, cat_imp)

    lst.append(cat_res)

# Place the per-category frames side by side.
aggregated = pd.concat(lst, axis=1)
aggregated.head()

Unnamed: 0_level_0,T1,T2,T3,T11,T6,A1,Policy Transparency,T4,T5,A3,...,A18,A19,A21,A26,A27,Marginalized Access,T12,A5,A31,Privacy Guarantees
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Belgium,1.0,1.0,-1.0,-2.0,1.0,-1.0,1.0,-1.0,-1.0,-1.0,...,-0.5,1.0,-0.5,1.0,-0.5,1.0,-1.0,-2.0,-1.0,-4.0
Bulgaria,1.0,1.0,-1.0,-2.0,-0.5,-5.0,-1.75,-1.0,-1.0,-1.0,...,-0.5,1.0,1.0,1.0,1.0,2.5,-1.0,-2.0,-1.0,-4.0
Czech Republic,1.0,1.0,-1.0,-2.0,-0.5,-5.0,-1.75,-2.0,-1.0,-2.0,...,-1.0,1.0,1.0,1.0,-0.5,0.0,-1.0,-2.0,1.0,-3.0
Denmark,1.0,1.0,-0.5,-2.0,-0.5,-5.0,-1.25,1.0,-1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,5.5,1.0,1.0,-1.0,2.0
Estonia,1.0,1.0,1.0,-1.0,-0.5,-5.0,0.75,-1.0,-1.0,-1.0,...,-1.0,1.0,1.0,1.0,-0.5,0.0,-1.0,-2.0,-1.0,-4.0


## Score Card -- Grand Weighted Total

In [10]:
# The grand total is the sum of the per-category weighted sums.
category_totals = aggregated.loc[:, category.unique()]
aggregated['Total Score'] = category_totals.sum(axis=1)

# Highest total first.
aggregated = aggregated.sort_values('Total Score', ascending=False)
display(aggregated.head())

Unnamed: 0_level_0,T1,T2,T3,T11,T6,A1,Policy Transparency,T4,T5,A3,...,A19,A21,A26,A27,Marginalized Access,T12,A5,A31,Privacy Guarantees,Total Score
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
United Kingdom,1.0,1.0,1.0,-1.0,1.0,-0.5,3.75,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.5,1.0,1.0,-1.0,2.0,18.75
Portugal,1.0,1.0,-1.0,-2.0,1.0,-0.5,1.25,1.0,1.0,1.0,...,1.0,-0.5,1.0,1.0,0.5,1.0,1.0,-1.0,2.0,9.75
Netherlands,1.0,1.0,1.0,-1.0,-0.5,-5.0,0.75,1.0,-1.0,1.0,...,1.0,1.0,1.0,1.0,5.5,1.0,-1.0,-1.0,0.0,7.25
Denmark,1.0,1.0,-0.5,-2.0,-0.5,-5.0,-1.25,1.0,-1.0,1.0,...,1.0,1.0,1.0,1.0,5.5,1.0,1.0,-1.0,2.0,6.25
France,1.0,1.0,-1.0,-2.0,1.0,-1.0,1.0,1.0,-1.0,-1.0,...,1.0,1.0,1.0,1.0,4.0,1.0,-1.0,-1.0,0.0,4.0


# Confidence Score

## Main Approach (% of Unknowns weighted by Q-importance)

In [11]:
# Create a DF with -1 for `Unknown` and 0 for others
unknown_df = (-(raw == 'Unknown').astype(int))

# Worst confidence in absolute value
worst_confidence_abs = question_importance.sum()

In [12]:
# np.dot multiplies by position and ignores labels, so pick the weights by
# question label in the column order of `unknown_df` first. A question that
# has no importance entry raises KeyError instead of being mis-weighted.
aligned_importance = question_importance.loc[unknown_df.columns]

# Weighted share of `Unknown` answers per country (in [-1, 0]) ...
confidence_scores = pd.Series(
    np.dot(unknown_df, aligned_importance) / worst_confidence_abs,
    index=raw.index,
    name='Confidence',
) + 1.0  # ... shifted by 1 into the [0, 1] range

# Most confident countries first.
confidence_scores = confidence_scores.sort_values(ascending=False)
confidence_scores.head(), confidence_scores.tail()

(Country
 Poland            0.956522
 United Kingdom    0.913043
 Netherlands       0.847826
 Czech Republic    0.804348
 Portugal          0.782609
 Name: Confidence, dtype: float64,
 Country
 Germany       0.543478
 Malta         0.543478
 Luxembourg    0.521739
 Bulgaria      0.521739
 Cyprus        0.521739
 Name: Confidence, dtype: float64)

## Transparency Unknowns to Eliminate Countries?

- After weighting by the importance of each question, countries whose transparency confidence score is less than two thirds are dropped from the analysis due to very low confidence.

In [13]:
# Transparency confidence cut-off used to select the countries to keep.
TRANSPARENCY_CONFIDENCE_THRESHOLD = 2.0 / 3.0

In [14]:
# A18 is in fact a transparency question, so it is listed ahead of the
# questions whose number starts with 'T'.
t_prefixed = [name for name in raw.columns if name.startswith('T')]
transparency_questions = ['A18'] + t_prefixed

print(transparency_questions)

# Textual responses restricted to the transparency questions.
transparency_responses = raw.loc[:, transparency_questions]
display(transparency_responses.head())

['A18', 'T1', 'T2', 'T3', 'T11', 'T6', 'T4', 'T5', 'T9', 'T10', 'T7', 'T12']


Unnamed: 0_level_0,A18,T1,T2,T3,T11,T6,T4,T5,T9,T10,T7,T12
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Belgium,Unknown,Yes,Yes,No,,Yes,Unknown,Unknown,Yes,Yes,Yes,No
Bulgaria,Unknown,Yes,Yes,No,,Unknown,Unknown,Unknown,Yes,Yes,Unknown,Unknown
Czech Republic,No,Yes,Yes,No,,Unknown,No,Unknown,Yes,Yes,No,Unknown
Denmark,Yes,Yes,Yes,Unknown,,Unknown,Yes,Unknown,Yes,Unknown,Yes,Yes
Estonia,No,Yes,Yes,Yes,No,Unknown,Unknown,Unknown,Yes,Unknown,No,No


In [15]:
# Create a DF with -1 for `Unknown` and 0 for others
transparency_unknown_df = (-(transparency_responses == 'Unknown').astype(int))

# This is positive but worst confidence will be negative -> abs
worst_transparency_confidence_abs = question_importance[transparency_questions].sum()

transparency_confidence = pd.Series(
    np.dot(
        transparency_unknown_df,
        question_importance[transparency_questions]
    ) / worst_transparency_confidence_abs
) + 1.0 # add 1 to shift to [0,1] range
transparency_confidence.index = raw.index
transparency_confidence.sort_values(ascending = False, inplace = True)
transparency_confidence.head(3), transparency_confidence.tail(5)

(Country
 United Kingdom    1.000000
 Poland            1.000000
 Portugal          0.925926
 dtype: float64,
 Country
 Greece        0.666667
 Estonia       0.666667
 Luxembourg    0.592593
 Cyprus        0.592593
 Bulgaria      0.481481
 dtype: float64)

In [16]:
# Keep countries whose transparency confidence is at or above the threshold.
# The scores come from floating-point division and addition, so a score that
# is mathematically equal to the threshold may be off by a rounding error;
# a tight absolute tolerance keeps such borderline countries in.
at_threshold = np.isclose(
    transparency_confidence,
    TRANSPARENCY_CONFIDENCE_THRESHOLD,
    rtol=0.0,
    atol=1e-9,
)
countries_to_keep = transparency_confidence[
    (transparency_confidence >= TRANSPARENCY_CONFIDENCE_THRESHOLD) | at_threshold
]

# Write Output Files

In [17]:
# Restrict both outputs to the countries that passed the transparency filter.
kept_countries = countries_to_keep.index

main_data = aggregated.loc[kept_countries, :]
main_data.to_csv('../output/main_data.csv')

kept_confidence = confidence_scores[kept_countries]
kept_confidence.to_csv('../output/confidence_scores_by_country.csv')

In [18]:
## The following was used to check the correctness of the data after migrating to the local project.

# a = pd.read_csv('../output/_ARC_main_data.csv')
# a.set_index('Country', inplace = True)
# a = a.astype(float)
# a = a.loc[aggregated.loc[countries_to_keep.index, :].index, :]
# assert a.equals(aggregated.loc[countries_to_keep.index, :]), 'Values different.'

# a = pd.read_csv('../output/_ARC_confidence_scores_by_country.csv')
# a.set_index('Country', inplace = True)
# a = a.astype(float)
# assert 0==(a.Confidence[countries_to_keep.index] - confidence_scores[countries_to_keep.index] > ERROR).sum(), 'Values different.'