In [1]:
## Setup

In [1]:
from nlp_utils.data_module import SemEvalDataModule
from nlp_utils.model import CustomDistilBertModel
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.utilities.seed import seed_everything
from glob import glob
import ipywidgets as widgets
from tqdm.notebook import tqdm
import pandas as pd
import torch
import seaborn as sb
import re
import random
import os

%load_ext tensorboard
%load_ext autoreload
%autoreload 2
seed_everything(42)

Global seed set to 42


42

In [2]:
if not 'notebookDir' in globals():
    notebookDir = os.getcwd()
print('notebookDir: ' + notebookDir)
os.chdir(notebookDir)

notebookDir: /home/user/Documents/Github/Uni/Master/TUM_Praktikum_NLP_Explainability/understanding-opinions-on-social-media/tasks


In [4]:
# get folder where logs are stored
save_folder = "../logs/StancePrediction_SemEval/lightning_logs/"
save_folder = os.path.join(notebookDir, save_folder)

# Load model

In [5]:
# Select a model
w = widgets.Dropdown(
    options=glob(os.path.join(save_folder, '*/checkpoints/*.ckpt')),
    description='Select a checkpoint:'
)
w

Dropdown(description='Select a checkpoint:', options=('/home/user/Documents/Github/Uni/Master/TUM_Praktikum_NL…

In [6]:
model_version = re.findall("version_[0-9]+", w.value)[0]
model = CustomDistilBertModel.load_from_checkpoint(w.value)
data_module = SemEvalDataModule(num_workers=4, config=model.config)

model.config, model_version

({'dataset_path': '../../data/raw/SemEval/',
  'learning_rate': 0.0030806995333433384,
  'batch_size': 16,
  'epochs': 20,
  'num_trials': 50,
  'vocab_size': 30522,
  'target_encoding': {0: 'Atheism',
   1: 'Climate Change is a Real Concern',
   2: 'Feminist Movement',
   3: 'Hillary Clinton',
   4: 'Legalization of Abortion'},
  'stance_encoding': {0: 'AGAINST', 1: 'FAVOR', 2: 'NONE', 3: 'UNKNOWN'}},
 'version_26')

In [7]:
# check performance
trainer = pl.Trainer(deterministic=True)
trainer.test(model, datamodule=data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Testing:  99%|█████████████████████████████████▌| 78/79 [00:33<00:00,  2.80it/s]

Results				 
FAVOR     precision: 0.3222 recall: 0.9211 f-score: 0.4774
AGAINST   precision: 0.9286 recall: 0.2727 f-score: 0.4216
------------
Macro F: 0.4495

Testing: 100%|██████████████████████████████████| 79/79 [00:33<00:00,  2.37it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_epoch_F1': 0.656525194644928,
 'test_epoch_target_F1': 0.656525194644928,
 'test_loss': 0.9024659395217896}
--------------------------------------------------------------------------------


[{'test_loss': 0.9024659395217896,
  'test_epoch_target_F1': 0.656525194644928,
  'test_epoch_F1': 0.656525194644928}]

## Auto select best model from Raytune hyperparameter optimization

In [8]:
# get best config
def get_best_config(path):
    scores = []
    for root, dirs, files in os.walk(path):
        for name in files:
            if ".ckpt" in name:
                score = name.split("val")
                score = [re.findall(r"[-+]?\d*\.\d+|\d+", s)[-1] for s in score]
                score.append(os.path.join(root, name))

                scores.append(score)

    # filter scores for best version
    df = pd.DataFrame(scores, columns=["epoch", "loss", "F1", "path"])
    ckpt = df.sort_values(by=["F1", "loss"], ascending=[False,True]).head(1).path.values[0]
    return ckpt

In [9]:
# take best model
best_model_path = get_best_config(save_folder)
best_model_version = re.findall("version_[0-9]+", best_model_path)[0]

best_model = CustomDistilBertModel.load_from_checkpoint(best_model_path)
best_data_module = SemEvalDataModule(num_workers=4, config=model.config)

best_model.config, best_model_version

({'dataset_path': '../../data/raw/SemEval/',
  'learning_rate': 0.0013774978663536918,
  'batch_size': 32,
  'epochs': 20,
  'num_trials': 50,
  'vocab_size': 30522,
  'target_encoding': {0: 'Atheism',
   1: 'Climate Change is a Real Concern',
   2: 'Feminist Movement',
   3: 'Hillary Clinton',
   4: 'Legalization of Abortion'},
  'stance_encoding': {0: 'AGAINST', 1: 'FAVOR', 2: 'NONE', 3: 'UNKNOWN'}},
 'version_28')

In [10]:
# check performance
trainer = pl.Trainer(deterministic=True)
trainer.test(best_model, datamodule=best_data_module)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores


Testing:  99%|█████████████████████████████████▌| 78/79 [00:32<00:00,  2.84it/s]

Results				 
FAVOR     precision: 0.6468 recall: 0.5000 f-score: 0.5640
AGAINST   precision: 0.7426 recall: 0.7385 f-score: 0.7405
------------
Macro F: 0.6523

Testing: 100%|██████████████████████████████████| 79/79 [00:32<00:00,  2.40it/s]
--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_epoch_F1': 0.7502001523971558,
 'test_epoch_target_F1': 0.7502001523971558,
 'test_loss': 0.7219581007957458}
--------------------------------------------------------------------------------


[{'test_loss': 0.7219581007957458,
  'test_epoch_target_F1': 0.7502001523971558,
  'test_epoch_F1': 0.7502001523971558}]

### Decide for model

In [11]:
model = model
model = best_model

# Explain model

In [12]:
import numpy as np
import scipy as sp
import spacy
import pickle
import json
import sage
import nltk
import string
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import AgglomerativeClustering
from transformers import DistilBertTokenizer
from numpy.random import default_rng
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

## Sage

In [13]:
# display encoding
data_module.stance_encoding, data_module.target_encoding

({0: 'AGAINST', 1: 'FAVOR', 2: 'NONE', 3: 'UNKNOWN'},
 {0: 'Atheism',
  1: 'Climate Change is a Real Concern',
  2: 'Feminist Movement',
  3: 'Hillary Clinton',
  4: 'Legalization of Abortion'})

In [16]:
def get_dataset(data_module, part="all"):
    # train
    text, label = data_module.trainset.texts, data_module.trainset.labels
    df_train = pd.DataFrame(data=(text), columns=["Text"])
    df_train["Stance"] = label[0]
    df_train["Target"] = label[1]

    # val
    text, label = data_module.valset.texts, data_module.valset.labels
    df_val = pd.DataFrame(data=(text), columns=["Text"])
    df_val["Stance"] = label[0]
    df_val["Target"] = label[1]
    #test
    text, label = data_module.testset.texts, data_module.testset.labels
    df_test = pd.DataFrame(data=(text), columns=["Text"])
    df_test["Stance"] = label[0]
    df_test["Target"] = label[1]

    if part == "all":
        df = pd.concat([df_train, df_val, df_test])
        return df
    elif part == "train":
        return df_train
    elif part == "val":
        return df_val
    else:
        return df_test


In [None]:
df = get_dataset(data_module)

In [17]:
# display data for inspection
df1 = df.copy()
df1["Stance"] = df1["Stance"].transform(lambda x: data_module.stance_encoding[x])
df1["Target"] = df1["Target"].transform(lambda x: data_module.target_encoding[x])
df1.head()

Unnamed: 0,Text,Stance,Target
0,Don't get it twisted. A major presidential can...,AGAINST,Hillary Clinton
1,The Dukes of Hazzard has been on tv for 36 yea...,AGAINST,Hillary Clinton
2,#BlackLivesMatter unless they are pre born bla...,AGAINST,Legalization of Abortion
3,Of mothers advising their daughter's to abort ...,AGAINST,Legalization of Abortion
4,"If you want to empower women, you need to dise...",FAVOR,Feminist Movement


In [18]:
def create_word_clusters(df, similarity_func, num_clusters):
    X = df["Text"]
    print("Create Vocabulary...")
    stripped_of_syllables_vocab, stripped = create_vocab(X)
    print("Cluster Vocabulary...")
    dictionary_of_vocab = cluster_vocab(X, stripped_of_syllables_vocab, stripped, similarity_func, num_clusters)
    print("Finished...")
    
    return dictionary_of_vocab

In [19]:
def create_vocab(X):
    nlp_bigger = spacy.load('en_core_web_lg')
    list_of_list = [tokenizer.tokenize(x) for x in X.values]
    flat_list = [item for sublist in list_of_list for item in sublist]
    small_vocab = list(set(flat_list))
    spacy_vocab = [nlp_bigger(x) for x in small_vocab] 
    vocab_2d = [[x] for x in spacy_vocab]

    nltk.download('stopwords')
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    filtered_vocab = [w for w in spacy_vocab if not str(w) in stop_words]
    filtered_stopwords = [str(w) for w in spacy_vocab if str(w) in stop_words]

    #removing punctuation from vocab
    #print(string.punctuation)
    stripped_vocab = [w for w in filtered_vocab if not str(w) in string.punctuation] 
    stripped_punctuation = [str(w) for w in filtered_vocab if str(w) in string.punctuation]

    #removing digits from vocab
    stripped_of_digits_vocab = [w for w in stripped_vocab if not str(w).isdigit()]
    stripped_digits = [str(w) for w in stripped_vocab if str(w).isdigit()]

    #removing syllables from vocab
    stripped_of_syllables_vocab = [w for w in stripped_of_digits_vocab if not "#" in str(w)] 
    stripped_syllables = [str(w) for w in stripped_of_digits_vocab if "#" in str(w)]
    
    stripped = (filtered_stopwords, stripped_punctuation, stripped_digits, stripped_syllables)

    return stripped_of_syllables_vocab, stripped

In [20]:
def similarity_func(u, v):
    if u[0].text == "#":
        token_one = u[len(u)-1]
    else:
        token_one = u[0]
    if v[0].text == "#":
        token_two = v[len(v)-1]
    else:
        token_two = v[0]
    return token_one.similarity(token_two)

In [21]:
def cluster_vocab(X, stripped_of_syllables_vocab, stripped, similarity_func, num_clusters):
    stripped_of_syllables_vocab_2d = [[x] for x in stripped_of_syllables_vocab]
    dists = pdist(np.array(stripped_of_syllables_vocab_2d), similarity_func) 
    similarity_matrix = squareform(dists)
    cluster_model = AgglomerativeClustering(affinity='precomputed', n_clusters=num_clusters, linkage='complete').fit(1-similarity_matrix)

    clustered_vocab = [[] for x in range(num_clusters)]
    _ = [clustered_vocab[cluster_model.labels_[index]].append(str(x)) for index, x in enumerate(stripped_of_syllables_vocab)]

    token_groupings = {str(index) : element for index, element in enumerate(clustered_vocab)} 

    token_groupings["stop_words"] = stripped[0]
    token_groupings["punctuation"] = stripped[1]
    token_groupings["digits"] = stripped[2]
    token_groupings["syllables"] = stripped[3]

    dictionary_of_vocab = {
    "description": "",
    "text": (X.values),
    "num_clusters": num_clusters,
    "similarity_matrix": similarity_matrix,
    "cluster_model": cluster_model,
    "clustered_vocab": clustered_vocab,
    "token_groupings": token_groupings
    }

    return dictionary_of_vocab

In [28]:
def save_vocab_dict(dictionary_of_vocab, filename):
    file = open(os.path.join(notebookDir, "vocab/" + filename), 'wb')
    pickle.dump(dictionary_of_vocab, file)
    file.close()

def open_vocab_dict(filename):
    dictionary_of_vocab = pickle.load(open(os.path.join(notebookDir, "../logs/StancePrediction_SemEval/vocab/" + filename), 'rb'))
    return dictionary_of_vocab

In [23]:
# Set up imputer object
class Imputer_groupings:
    def __init__(self, model, number_of_groups):
        self.model = model
        self.num_groups = number_of_groups
    
    def __call__(self, input_array, S):
        max_length = 0
        reconstructed_array = []
        for index, sentence in enumerate(input_array):
            length_of_sentence = max(x[0] for group in sentence for x in group)
            if length_of_sentence > max_length:
                max_length = length_of_sentence

            original_input_ids = [None] * (length_of_sentence+1)
            for sub_index, group in enumerate(sentence):
                
                if S[index][sub_index]: #put in '[MASK]' elements if needed
                    for x in group:
                        original_input_ids[x[0]]=x[1]
                else: 
                    for x in group:
                        original_input_ids[x[0]]=103 #id of '[MASK]'

            original_input_ids.append(102)
            original_input_ids.insert(0, 101)
            reconstructed_array.append(original_input_ids)

        max_length+=3

        for index, sentence in enumerate(reconstructed_array):
            if len(sentence) < max_length:
                reconstructed_array[index].extend([0 for i in range(max_length-len(sentence))])

        input_ids = np.array(reconstructed_array)

        # "handmade" attention mask -> basically just set everything to one, except '[PAD]'s which are zero
        am = np.ones(input_ids.shape)
        am[input_ids == 0] = 0 #id of '[PAD]'
        tensor_attention_mask = torch.tensor(am)
        tensor_attention_mask = tensor_attention_mask.to(model.device)


        tensor_input_ids = torch.tensor(input_ids)
        tensor_input_ids = tensor_input_ids.to(model.device)

        encoded_text = {"input_ids": tensor_input_ids, "attention_mask": tensor_attention_mask}
        
        #predict with model
        outputs = self.model(encoded_text)[0]       
        outputs = outputs.detach().cpu().numpy()
        
        score_most_prob = [max(x) for x in outputs]

        return np.array(score_most_prob) # sp.special.logit(score_most_prob)

In [24]:
def calc_sage_input(dictionary_of_vocab_clusters, number_of_groups):
    current_groupings = dictionary_of_vocab_clusters["token_groupings"]
    feature_names = list(current_groupings.keys())
    sage_input_all_instances = []
    for text_element in dictionary_of_vocab_clusters["text"]:
        sage_input_groupings = [[] for x in range(number_of_groups)]
        input_text = tokenizer.tokenize(text_element)
        input_ids = tokenizer(text_element, add_special_tokens=False)["input_ids"]
        groups_to_text = [next((key for key, value in current_groupings.items() if x in value)) for x in input_text]

        positioned_ids = [(i, x) for i, x in enumerate(input_ids)]
        _ = [sage_input_groupings[list(current_groupings.keys()).index(groups_to_text[index])].append(x)  for index, x in enumerate(positioned_ids)]
        sage_input_all_instances.append(sage_input_groupings)

    sage_input_all_instances_array = np.array(sage_input_all_instances, dtype=object)
    return sage_input_all_instances_array

### Automatically run a number of cluster sizes

In [74]:
# create and save clustered vocabularies for a number of different cluster sizes and save them to the logs
cluster_sizes = [3, 5, 7, 9, 11, 15, 17, 19, 21, 25, 29, 35, 45, 55]
for n in cluster_sizes:
    dictionary_of_vocab_clusters = create_word_clusters(df, similarity_func, n)
    save_vocab_dict(dictionary_of_vocab_clusters, "dictionary_of_vocab_" + str(n) + "cluster.pickle")


Create Vocabulary...
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  This is separate from the ipykernel package so we can avoid doing imports until
Cluster Vocabulary...
Finished...
Create Vocabulary...
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  This is separate from the ipykernel package so we can avoid doing imports until
Cluster Vocabulary...
Finished...
Create Vocabulary...
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  This is separate from the ipykernel package so we can avoid doing imports until
Cluster Vocabulary...
Finished...
Create Vocabulary...
[nltk_data] Downloading package stopwords to /home/user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  This is separate from the ipykernel package so we can avoid doing impo

In [89]:
# create sage plots for the different cluster sizes and save them to the logs
for n in cluster_sizes:
    dictionary_of_vocab_clusters = open_vocab_dict("dictionary_of_vocab_" + str(n) + "cluster.pickle")
    number_of_groups = len(dictionary_of_vocab_clusters["token_groupings"])
    sage_input = calc_sage_input(dictionary_of_vocab_clusters, number_of_groups)
    imputer = Imputer_groupings(model, number_of_groups)
    groupings_estimator = sage.PermutationEstimator(imputer, "mse")

    # sage for stance
    label="Stance"
    sage_values_groupings = groupings_estimator(sage_input, np.array(df[label].values), batch_size=64, verbose=True,thresh=0.15) 
    stance_plot = sage_values_groupings.plot(feature_names, return_fig=True) 
    # save plot
    stance_plot.savefig(os.path.join(notebookDir, "plots/", label + "_feature_importance_" + str(n) + "cluster.png"), format="png")

    # sage for target
    label="Target"
    sage_values_groupings = groupings_estimator(sage_input, np.array(df[label].values), batch_size=64, verbose=True,thresh=0.15) 
    target_plot = sage_values_groupings.plot(feature_names, return_fig=True) 
    # save plot
    target_plot.savefig(os.path.join(notebookDir, "plots/", label + "_feature_importance_" + str(n) + "cluster.png"), format="png")


  9%|▉         | 0.0875/1 [4:09:18<43:20:00, 170958.92s/it]
 28%|██▊       | 0.2755/1 [00:34<01:30, 124.40s/it]StdDev Ratio = 0.2858 (Converge at 0.1500)
 36%|███▋      | 0.3626/1 [01:11<02:05, 196.42s/it]StdDev Ratio = 0.2491 (Converge at 0.1500)
 35%|███▌      | 0.35/1 [01:46<03:17, 303.45s/it]  StdDev Ratio = 0.2535 (Converge at 0.1500)
 33%|███▎      | 0.3262/1 [02:26<05:03, 450.41s/it]StdDev Ratio = 0.2626 (Converge at 0.1500)
 51%|█████     | 0.5065/1 [03:27<03:22, 410.33s/it]StdDev Ratio = 0.2108 (Converge at 0.1500)
 46%|████▌     | 0.4551/1 [04:32<05:26, 599.40s/it]StdDev Ratio = 0.2224 (Converge at 0.1500)
 56%|█████▋    | 0.5647/1 [05:18<04:05, 563.96s/it]StdDev Ratio = 0.1996 (Converge at 0.1500)
 72%|███████▏  | 0.7195/1 [06:23<02:29, 533.49s/it]StdDev Ratio = 0.1768 (Converge at 0.1500)
 96%|█████████▌| 0.9589/1 [07:35<00:19, 474.56s/it]StdDev Ratio = 0.1532 (Converge at 0.1500)
100%|██████████| 1/1 [08:33<00:00, 513.69s/it]StdDev Ratio = 0.1469 (Converge at 0.1500)
Detec

IndexError: index 18 is out of bounds for axis 0 with size 17