In [None]:
import torch
from task import pubmed_task, nicta_task, dri_task, art_task, PUBMED_TASK, NICTA_TASK, DRI_TASK, ART_TASK 
from models import BertHSLN, BertHSLNMultiSeparateLayers
from utils import *
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from os import makedirs
import os

In [None]:
# ADAPT: paths to the trained model checkpoints that are loaded further down.
# Multi-task checkpoints:
multi_groups_model_path = "results/mult_grouped/0_0_6_model.pt"
multi_model_path = "results/mult_all/0_0_10_model.pt"
# Single-task baseline checkpoints, one per task:
pubmed_base_path = "results/pubmed_base/0_0_model.pt"
nicta_base_path = "results/nicta_base/0_0_model.pt"
dri_base_path = "results/dri_base/0_0_model.pt"
art_base_path = "results/art_base/0_0_model.pt"

# ADAPT: directory where the results are written
# (label-matrix CSVs, heatmap PDFs and the PCA CSVs).
results_out = "results/correlations"

In [None]:
# Make sure the output directory exists before anything is written to it.
os.makedirs(results_out, exist_ok=True)

In [None]:
def create_task(create_func):
    """Create a task by calling ``create_func`` with this notebook's fixed settings.

    ``create_func`` is invoked with the keyword arguments ``train_batch_size=32``
    and ``max_docs=-1`` and its return value is passed through unchanged.
    """
    task_settings = {"train_batch_size": 32, "max_docs": -1}
    return create_func(**task_settings)


def get_all_tasks():
    """Return newly created PubMed, NICTA, DRI and ART tasks, in that order."""
    task_factories = (pubmed_task, nicta_task, dri_task, art_task)
    return [create_task(factory) for factory in task_factories]


def get_task(task_name):
    """Return the first task from ``get_all_tasks()`` named ``task_name``, else ``None``."""
    matching = (task for task in get_all_tasks() if task.task_name == task_name)
    return next(matching, None)

In [None]:
def load_model(path, tasks, model_class=BertHSLN, config=dict()):
    """Build a model for ``tasks`` and restore its weights from a checkpoint.

    Args:
        path: Checkpoint file containing the state dict to load.
        tasks: Task objects handed to the model constructor.
        model_class: Class to instantiate; it is called as ``model_class(config, tasks)``.
        config: Overrides merged on top of the default hyper-parameters below.
            The mapping is only read, never modified.

    Returns:
        The model with the checkpoint weights loaded, switched to evaluation mode.
        The weights are deserialized on the CPU; callers move the model to the
        target device themselves.
    """
    BERT_MODEL = "bert_model/scibert_scivocab_uncased/"
    best_config = {
        "bert_model": BERT_MODEL,
        "bert_trainable": False,
        # NOTE(review): always records BertHSLN, even when model_class is a
        # different class -- confirm whether the model reads this entry.
        "model": BertHSLN.__name__,
        "cacheable_tasks": [],

        "dropout": 0.5,
        "word_lstm_hs": 758,
        "att_pooling_dim_ctx": 200,
        "att_pooling_num_ctx": 15,

        "lr": 3e-05,
        "lr_epoch_decay": 0.9,
        "batch_size":  32,
        "max_seq_length": 128,
        "max_epochs": 20,
        "early_stopping": 5
    }
    best_config.update(config)

    model = model_class(best_config, tasks)
    # Deserialize onto the CPU: load_state_dict copies the values into the
    # model's own parameters, so staging the checkpoint on the GPU only costs
    # GPU memory and makes loading fail on machines without CUDA.
    params = torch.load(path, map_location=torch.device("cpu"))
    model.load_state_dict(params)
    # The loaded models are only used for prediction; evaluation mode disables
    # train-time behaviour such as dropout so that predictions are deterministic.
    model.eval()
    return model

In [None]:
# Run prediction on the GPU when one is available, otherwise fall back to the
# CPU so the notebook still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Models are parked here between prediction passes.
cpu_device = torch.device("cpu")

In [None]:
# Grouping used by the grouped multi-task model: one attention group spanning
# all four tasks and two sentence-encoder groups.
multi_groups_model_config = {
    "attention_groups": [[PUBMED_TASK, NICTA_TASK, ART_TASK, DRI_TASK]],
    "sentence_encoder_groups": [[PUBMED_TASK, NICTA_TASK], [ART_TASK, DRI_TASK]],
}
multi_groups_model = load_model(
    multi_groups_model_path,
    get_all_tasks(),
    model_class=BertHSLNMultiSeparateLayers,
    config=multi_groups_model_config,
)

In [None]:
# Multi-task model built over all four tasks with the default model class.
multi_model = load_model(path=multi_model_path, tasks=get_all_tasks())

In [None]:
# Single-task baseline: the model is constructed with the PubMed task only.
pubmed_model = load_model(path=pubmed_base_path, tasks=[get_task(PUBMED_TASK)])

In [None]:
# Single-task baseline: the model is constructed with the NICTA task only.
nicta_model = load_model(path=nicta_base_path, tasks=[get_task(NICTA_TASK)])

In [None]:
# Single-task baseline: the model is constructed with the DRI task only.
dri_model = load_model(path=dri_base_path, tasks=[get_task(DRI_TASK)])

In [None]:
# Single-task baseline: the model is constructed with the ART task only.
art_model = load_model(path=art_base_path, tasks=[get_task(ART_TASK)])

In [None]:
def clear_and_map_predicted_values(true_labels, predicted_labels, true_label_names, pred_label_names):
    """Drop masked positions and translate the remaining label ids to names.

    A position is kept only when its true label id is greater than 0 (id 0 is
    the mask). Kept true ids are looked up in ``true_label_names`` and the
    predicted ids at the same positions in ``pred_label_names``.

    Returns:
        A pair ``(true_names, predicted_names)`` of equally long lists.

    Raises:
        AssertionError: if the two label sequences differ in length.
    """
    assert len(true_labels) == len(predicted_labels)
    # Keep (true, predicted) pairs together so both outputs stay aligned.
    unmasked = [
        (true_id, pred_id)
        for true_id, pred_id in zip(true_labels, predicted_labels)
        if true_id > 0
    ]
    true_names = [true_label_names[true_id] for true_id, _ in unmasked]
    predicted_names = [pred_label_names[pred_id] for _, pred_id in unmasked]
    return true_names, predicted_names


def get_labels_matrix(tasks):
    """Create a zero-filled square count matrix over the labels of all tasks.

    Keys have the form ``"<short_name>:<label>"`` and are built from each
    task's ``get_labels_pres_titled()``. The result maps every such key to its
    own dict that maps every key to ``0``.
    """
    keys = []
    for task in tasks:
        for label in task.get_labels_pres_titled():
            keys.append(task.short_name + ":" + label)
    # Every row gets an independent dict so counts can be updated per row.
    return {row_key: dict.fromkeys(keys, 0) for row_key in keys}

def normalize_values(tasks, labels_matrix):
    '''Normalise the predicted counts within a task.

    Works in place on ``labels_matrix`` and returns nothing. The ``":Mask"``
    rows and columns of every task are removed. In each remaining row the
    counts belonging to one task are divided by their sum; a group whose sum
    is 0 is left untouched.
    '''
    for row_task in tasks:
        labels_matrix.pop(row_task.short_name + ":Mask")
        for row_label in row_task.get_labels_titled()[1:]:
            row = labels_matrix[row_task.short_name + ":" + row_label]
            for col_task in tasks:
                row.pop(col_task.short_name + ":Mask")
                # Skip the first label, which is the mask entry removed above.
                col_keys = [
                    col_task.short_name + ":" + col_label
                    for col_label in col_task.get_labels_titled()[1:]
                ]
                total = sum(row[key] for key in col_keys)
                if total == 0:
                    continue
                for key in col_keys:
                    row[key] = row[key] / total
                
def predict_labels(eval_tasks, models):
    """Count which labels the models predict for every true label, then normalise.

    For each task in ``eval_tasks`` the test batches of its first fold are run
    through every model with ``output_all_tasks=True``. For every position
    whose true label is not masked, the count for the pair
    (true label of the evaluated task, predicted label of the output task) is
    incremented. The counts are finally normalised with ``normalize_values``.

    Args:
        eval_tasks: Tasks whose first-fold test batches are predicted.
        models: Models to run. A model with exactly one entry in
            ``crf.per_task_output`` is treated as a single-task model and is
            fed every batch under its own task name.

    Returns:
        A list with one dict per true label: the key ``"true_label"`` plus one
        entry per predicted label holding the normalised value.

    Side effects:
        Every model is switched to evaluation mode.
    """
    # Build the task objects once. Looking them up through get_task() would
    # re-create all tasks for every batch and every task output.
    all_tasks = get_all_tasks()
    tasks_by_name = {}
    for task in all_tasks:
        # Keep the first task per name, matching get_task()'s lookup order.
        tasks_by_name.setdefault(task.task_name, task)
    labels_matrix = get_labels_matrix(all_tasks)

    with torch.no_grad():
        for eval_task in eval_tasks:
            print(f'evaluating task {eval_task.task_name}... ')
            true_label_names = eval_task.get_labels_titled()
            for mod in models:
                # Prediction only: evaluation mode disables train-time
                # behaviour such as dropout, which would make counts noisy.
                mod.eval()
                for fold in eval_task.get_folds()[0:1]:  # predict labels of first fold only
                    for batch in fold.test:
                        tensor_dict_to_gpu(batch, device)

                        if len(mod.crf.per_task_output.values()) == 1:
                            # Single-task model: run the batch under the only
                            # task the model knows, then restore the original.
                            orig_task = batch["task"]
                            batch["task"] = list(mod.crf.per_task_output.keys())[0]
                            try:
                                output = mod(batch=batch, output_all_tasks=True)
                            finally:
                                batch["task"] = orig_task
                        else:
                            # Multi-task model
                            output = mod(batch=batch, output_all_tasks=True)

                        # Convert to plain Python ints once instead of
                        # iterating over tensor elements one by one.
                        true_labels = batch["label_ids"].view(-1).tolist()
                        for task_output in output["task_outputs"]:
                            t = tasks_by_name[task_output["task"]]
                            pred_labels = task_output["predicted_label"].view(-1).tolist()
                            cleared_true, cleared_predicted = clear_and_map_predicted_values(
                                true_labels, pred_labels, true_label_names, t.get_labels_titled())
                            for true_label, pred_label in zip(cleared_true, cleared_predicted):
                                true_label = eval_task.short_name + ":" + true_label
                                pred_label = t.short_name + ":" + pred_label
                                labels_matrix[true_label][pred_label] += 1

                        tensor_dict_to_cpu(batch)
    # Normalise the counts and drop the mask rows/columns.
    normalize_values(all_tasks, labels_matrix)
    # Convert to a list of records, the format expected by pandas.
    return [{"true_label": true_label, **row} for true_label, row in labels_matrix.items()]
            


In [None]:
# Keep the model on the prediction device only for the duration of the pass.
try:
    multi_model.to(device)
    multi_labels_matrix = predict_labels(get_all_tasks(), [multi_model])
finally:
    # Always move the model back, even if prediction fails part-way, so the
    # device memory is free for the next model.
    multi_model.to(cpu_device)

In [None]:
# Keep the model on the prediction device only for the duration of the pass.
try:
    multi_groups_model.to_device(device, device)
    multi_groups_labels_matrix = predict_labels(get_all_tasks(), [multi_groups_model])
finally:
    # Always move the model back, even if prediction fails part-way, so the
    # device memory is free for the next model.
    multi_groups_model.to_device(cpu_device, cpu_device)

In [None]:
# The four single-task baselines are evaluated together into one matrix.
base_models = [pubmed_model, nicta_model, dri_model, art_model]
try:
    for m in base_models:
        m.to(device)
    base_labels_matrix = predict_labels(get_all_tasks(), base_models)
finally:
    # Always move the models back, even if prediction fails part-way.
    for m in base_models:
        m.to(cpu_device)

In [None]:
def labels_matrix_to_df(labels_matrix):
    """Turn the records from ``predict_labels`` into a DataFrame.

    The ``"true_label"`` entry of each record becomes the index and all values
    are rounded to two decimals.
    """
    return (
        pd.DataFrame.from_dict(labels_matrix)
        .set_index("true_label")
        .round(2)
    )

# Semantic Vectors

## MULT GROUPED 

In [None]:
# Convert the MULT GROUPED label matrix to a DataFrame, save it as CSV and display it.
multi_groups_df = labels_matrix_to_df(multi_groups_labels_matrix)
multi_groups_df.to_csv(os.path.join(results_out, "multi_groups_labels_matrix.csv"))
multi_groups_df

## MULT ALL

In [None]:
# Convert the MULT ALL label matrix to a DataFrame, save it as CSV and display it.
multi_df = labels_matrix_to_df(multi_labels_matrix)
multi_df.to_csv(os.path.join(results_out, "multi_labels_matrix.csv"))
multi_df

## Baseline

In [None]:
# Convert the baseline label matrix to a DataFrame, save it as CSV and display it.
base_df = labels_matrix_to_df(base_labels_matrix)
base_df.to_csv(os.path.join(results_out, "base_labels_matrix.csv"))
base_df

## MULT GROUPED (heatmap)

In [None]:
# Annotated heatmap of the MULT GROUPED label matrix, also saved as PDF.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(22, 14))
sns.heatmap(multi_groups_df, annot=True, linewidths=2, cmap=plt.cm.Blues, ax=ax)
fig.savefig(os.path.join(results_out, "multi_groups_heatmap.pdf"), format="pdf")
ax

## MULT ALL (heatmap)

In [None]:
# Annotated heatmap of the MULT ALL label matrix, also saved as PDF.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(22, 14))
sns.heatmap(multi_df, annot=True, linewidths=2, cmap=plt.cm.Blues, ax=ax)
fig.savefig(os.path.join(results_out, "multi_heatmap.pdf"), format="pdf")
ax

## Baseline (heatmap)

In [None]:
# Annotated heatmap of the baseline label matrix, also saved as PDF.
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(22, 14))
sns.heatmap(base_df, annot=True, linewidths=2, cmap=plt.cm.Blues, ax=ax)
fig.savefig(os.path.join(results_out, "base_heatmap.pdf"), format="pdf")
ax

In [None]:
def do_pca(df):
    """Project the rows of ``df`` onto two principal components and plot them.

    Fits a 2-component PCA on ``df``, prints the explained variance ratio and
    draws a scatter plot (both axes fixed to [-1.5, 1.5]) in which every point
    is annotated with its index value from ``df``.

    Returns:
        A tuple ``(coords, ax, df_copy)``: the transformed coordinates, the
        matplotlib axes of the plot, and a copy of ``df`` extended with the
        columns ``PCA_X`` and ``PCA_Y``.
    """
    reducer = PCA(n_components=2)
    coords = reducer.fit_transform(df)
    first, second = coords[:, 0], coords[:, 1]
    print(reducer.explained_variance_ratio_)

    fig, ax = plt.subplots(figsize=(20, 20))
    ax.set_xlim(-1.5, 1.5)
    ax.set_ylim(-1.5, 1.5)
    ax.scatter(first, second, alpha=0.2)
    for name, px, py in zip(df.index, first, second):
        ax.annotate(name, (px, py))

    # Return the coordinates alongside the original values without touching df.
    df_copy = df.copy()
    df_copy["PCA_X"] = first
    df_copy["PCA_Y"] = second
    return coords, ax, df_copy
        

# PCA

## MULT GROUPED

In [None]:
# PCA of the MULT GROUPED matrix; the saved CSV holds the matrix plus the PCA_X/PCA_Y columns.
multi_groups_x, ax, multi_groups_pca_df = do_pca(multi_groups_df)
multi_groups_pca_df.to_csv(os.path.join(results_out, "multi_groups_corr.csv"))
ax

## MULT ALL

In [None]:
# PCA of the MULT ALL matrix; the saved CSV holds the matrix plus the PCA_X/PCA_Y columns.
multi_x, ax, multi_pca_df = do_pca(multi_df)
multi_pca_df.to_csv(os.path.join(results_out, "multi_corr.csv"))
ax

## Baseline

In [None]:
# PCA of the baseline matrix; the saved CSV holds the matrix plus the PCA_X/PCA_Y columns.
base_x, ax, base_pca_df = do_pca(base_df)
base_pca_df.to_csv(os.path.join(results_out, "base_corr.csv"))
ax