In [1]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the
# notebook can read files stored there.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [8]:
%cd gdrive/My Drive/UncertaintyModelling/

/content/gdrive/My Drive/UncertaintyModelling


In [None]:
!pip install pgmpy

In [65]:
import pandas as pd
import numpy as np
import pickle
import pgmpy
import sklearn
from sklearn.metrics import f1_score

In [9]:
# Restore the previously saved model object from disk (presumably a Bayesian
# network learnt with tree search, judging by the file name — TODO confirm).
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files from a trusted source.
model_path = "models/bayesian_18countries_learnt_bn_treesearch.pickle"
with open(model_path, "rb") as model_file:
  model = pickle.load(model_file)

In [23]:
# Read the training split and show its dimensions plus the first rows.
# NOTE(review): the displayed frame starts with an `Unnamed: 0` column —
# presumably a saved index from to_csv; confirm it is not meant as a feature.
train_csv_path = "data/train_18_countries.csv"
train_df = pd.read_csv(train_csv_path)
print(train_df.shape)
train_df.head()

(495, 307)


Unnamed: 0,deaths_per_mil_cat_india,facial_covering_new_zealand,cancel_public_events_argentina,debt_relief_india,income_support_singapore,school_closures_argentina,vaccine_policy_china,restriction_internal_movement_indonesia,workplace_closures_finland,internation_travel_control_finland,...,cases_per_mil_cat_finland,cases_per_mil_cat_hong_kong,cases_per_mil_cat_indonesia,cases_per_mil_cat_india,cases_per_mil_cat_israel,cases_per_mil_cat_malaysia,cases_per_mil_cat_new_zealand,cases_per_mil_cat_singapore,cases_per_mil_cat_united_states,cases_per_mil_cat_south_africa
0,0,3,2,1.0,2,1,5,2,1,2.0,...,1,0,1,1,5,3,0,1,6,2
1,0,2,2,0.0,2,3,4,2,2,3.0,...,1,0,0,1,4,1,0,1,4,1
2,0,3,1,1.0,2,0,5,2,1,2.0,...,4,0,1,1,6,4,0,3,6,3
3,0,3,2,1.0,2,1,5,2,1,2.0,...,1,0,1,1,5,3,0,1,5,2
4,0,2,2,0.0,2,1,4,2,2,3.0,...,1,0,0,0,4,1,0,1,4,1


In [29]:
# Read the held-out test split and show its dimensions plus the first rows.
# NOTE(review): the displayed frame starts with an `Unnamed: 0` column —
# presumably a saved index from to_csv; confirm it is not meant as a feature.
test_csv_path = "data/test_18_countries.csv"
test_df = pd.read_csv(test_csv_path)
print(test_df.shape)
test_df.head()

(165, 307)


Unnamed: 0,deaths_per_mil_cat_india,facial_covering_new_zealand,cancel_public_events_argentina,debt_relief_india,income_support_singapore,school_closures_argentina,vaccine_policy_china,restriction_internal_movement_indonesia,workplace_closures_finland,internation_travel_control_finland,...,cases_per_mil_cat_finland,cases_per_mil_cat_hong_kong,cases_per_mil_cat_indonesia,cases_per_mil_cat_india,cases_per_mil_cat_israel,cases_per_mil_cat_malaysia,cases_per_mil_cat_new_zealand,cases_per_mil_cat_singapore,cases_per_mil_cat_united_states,cases_per_mil_cat_south_africa
0,0,3,1,1,2,0,5,2,1,2,...,4,0,1,1,6,4,0,3,6,3
1,0,3,2,1,2,1,5,2,1,2,...,1,0,1,1,6,3,0,1,6,2
2,0,2,2,2,2,3,0,2,1,3,...,0,0,0,0,1,0,0,1,1,1
3,0,3,2,1,2,1,5,2,1,2,...,1,0,1,1,5,3,0,1,5,2
4,0,0,2,2,2,3,0,1,1,3,...,0,0,0,0,0,0,0,0,1,0


In [26]:
# Cast every column of the test frame to integers (presumably so evidence
# values match the model's integer state names — TODO confirm).
# NOTE(review): the recorded execution counts show this cell (26) ran before
# the load cell above (29); run the notebook top-to-bottom so the cast is
# applied to the frame that is actually used downstream.
test_df = test_df.astype(int)

**Split the test set by country**

In [39]:
# Country suffixes used in the column names (e.g. `cases_per_mil_cat_<country>`).
# The order here is the order in which later cells iterate and fill their
# per-country dictionaries.
countries = [
    "singapore", "china", "malaysia", "indonesia", "hong_kong", "australia",
    "new_zealand", "united_states", "canada", "argentina", "brazil", "south_africa",
    "egypt", "germany", "finland", "switzerland", "israel", "india",
]

In [41]:
# Maps country name -> the test-set columns belonging to that country
# (filled in by the next cell).
dfs = dict()

In [53]:
import regex as re

# For each country keep only the test-set columns whose name ends with that
# country's suffix. The name is escaped so it is matched literally, and `$`
# anchors the match at the end of the column name.
# NOTE(review): only `escape` is used from the third-party `regex` package;
# the stdlib `re.escape` would presumably do the same job — confirm before
# swapping.
for country in countries:
  country_suffix_regex = re.escape(country) + r"$"
  dfs[country] = test_df.filter(regex=country_suffix_regex)

In [17]:
# Build a variable-elimination inference engine over the loaded model; the
# prediction cells below issue their queries through `ve`.
from pgmpy.inference import VariableElimination
ve = VariableElimination(model)

  import pandas.util.testing as tm


In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Draw the model's graph with node labels on a single, very large canvas
# (80 x 80 inches) so the labels have room to spread out.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(80, 80))
nx.draw(model, ax=ax, with_labels=True)
plt.tight_layout()

In [68]:
# Predict, for every country and every test row, the most probable state of
# that country's `cases_per_mil_cat_<country>` variable given the country's
# remaining columns as evidence.
# NOTE(review): the recorded output of this cell echoes
# `phi.values = phi.values / phi.values.sum()`, which looks like a numeric
# warning raised during normalisation (possibly zero-probability evidence
# yielding NaNs) — confirm the affected rows still get a meaningful prediction.
pred_list = {}
for country in countries:
  deaths_per_mil_cat_country = f"deaths_per_mil_cat_{country}"
  cases_per_mil_cat_country = f"cases_per_mil_cat_{country}"

  # Both outcome columns are withheld; everything else for this country is evidence.
  evidence_df = dfs[country].drop(
      columns=[deaths_per_mil_cat_country, cases_per_mil_cat_country]
  )

  preds = []
  for row_idx, observed in evidence_df.iterrows():
    posterior = ve.query(
        [cases_per_mil_cat_country],
        evidence=observed.to_dict(),
        show_progress=False,
    )
    states = posterior.state_names[posterior.variables[0]]
    prob_by_state = dict(zip(states, posterior.values))
    # max() returns the first state encountered when probabilities tie.
    preds.append(max(prob_by_state, key=prob_by_state.get))
  pred_list[country] = preds

  phi.values = phi.values / phi.values.sum()


In [70]:
# Wrap each country's list of predicted states in a single-column DataFrame
# named after the predicted variable.
predictions_list = {}
for country in countries:
  cases_per_mil_cat_country = f"cases_per_mil_cat_{country}"
  predictions = list(pred_list[country])
  predictions_list[country] = pd.DataFrame(
      predictions, columns=[cases_per_mil_cat_country]
  )

In [71]:
# Score the cases predictions per country with micro-averaged F1, comparing
# them with the observed column of the test frame.
scores = {}
for country in countries:
  cases_per_mil_cat_country = f"cases_per_mil_cat_{country}"
  y_true = test_df[cases_per_mil_cat_country]
  y_pred = predictions_list[country]
  scores[country] = f1_score(y_true, y_pred, average="micro")

In [72]:
# Show the per-country micro-F1 scores for the cases-per-million target.
scores

{'argentina': 0.8424242424242424,
 'australia': 0.9393939393939394,
 'brazil': 0.8000000000000002,
 'canada': 0.8848484848484849,
 'china': 1.0,
 'egypt': 1.0,
 'finland': 0.9333333333333333,
 'germany': 0.9090909090909091,
 'hong_kong': 1.0,
 'india': 0.9515151515151515,
 'indonesia': 0.9212121212121213,
 'israel': 0.9515151515151515,
 'malaysia': 0.8787878787878788,
 'new_zealand': 1.0,
 'singapore': 0.7090909090909091,
 'south_africa': 0.9030303030303031,
 'switzerland': 0.7515151515151516,
 'united_states': 0.8727272727272727}

In [73]:
# Predict, for every country and every test row, the most probable state of
# that country's `deaths_per_mil_cat_<country>` variable given the country's
# remaining columns as evidence.
# This rebinds `pred_list`, replacing the cases predictions stored under the
# same name by the earlier inference cell.
# NOTE(review): the recorded output of this cell echoes
# `phi.values = phi.values / phi.values.sum()`, which looks like a numeric
# warning raised during normalisation (possibly zero-probability evidence
# yielding NaNs) — confirm the affected rows still get a meaningful prediction.
pred_list = {}
for country in countries:
  deaths_per_mil_cat_country = f"deaths_per_mil_cat_{country}"
  cases_per_mil_cat_country = f"cases_per_mil_cat_{country}"

  # Both outcome columns are withheld; everything else for this country is evidence.
  evidence_df = dfs[country].drop(
      columns=[deaths_per_mil_cat_country, cases_per_mil_cat_country]
  )

  preds = []
  for row_idx, observed in evidence_df.iterrows():
    posterior = ve.query(
        [deaths_per_mil_cat_country],
        evidence=observed.to_dict(),
        show_progress=False,
    )
    states = posterior.state_names[posterior.variables[0]]
    prob_by_state = dict(zip(states, posterior.values))
    # max() returns the first state encountered when probabilities tie.
    preds.append(max(prob_by_state, key=prob_by_state.get))
  pred_list[country] = preds

  phi.values = phi.values / phi.values.sum()


In [74]:
# Wrap each country's list of predicted states in a single-column DataFrame
# named after the predicted variable. This rebinds `predictions_list`, which
# an earlier cell used for the cases predictions.
predictions_list = {}
for country in countries:
  deaths_per_mil_cat_country = f"deaths_per_mil_cat_{country}"
  predictions = list(pred_list[country])
  predictions_list[country] = pd.DataFrame(
      predictions, columns=[deaths_per_mil_cat_country]
  )

In [75]:
# Score the deaths predictions per country with micro-averaged F1, comparing
# them with the observed column of the test frame. This rebinds `scores`,
# which an earlier cell used for the cases results.
scores = {}
for country in countries:
  deaths_per_mil_cat_country = f"deaths_per_mil_cat_{country}"
  y_true = test_df[deaths_per_mil_cat_country]
  y_pred = predictions_list[country]
  scores[country] = f1_score(y_true, y_pred, average="micro")

In [76]:
# Show the per-country micro-F1 scores for the deaths-per-million target.
# NOTE(review): every score in the recorded output is exactly 1.0 — worth
# checking for a near-constant target or train/test overlap before relying on it.
scores

{'argentina': 1.0,
 'australia': 1.0,
 'brazil': 1.0,
 'canada': 1.0,
 'china': 1.0,
 'egypt': 1.0,
 'finland': 1.0,
 'germany': 1.0,
 'hong_kong': 1.0,
 'india': 1.0,
 'indonesia': 1.0,
 'israel': 1.0,
 'malaysia': 1.0,
 'new_zealand': 1.0,
 'singapore': 1.0,
 'south_africa': 1.0,
 'switzerland': 1.0,
 'united_states': 1.0}