This notebook calculates the TCAV<sub>Magnitude</sub> and TCAV<sub>Direction</sub> scores of the TweetEval classifier with respect to two concepts
(as explained in https://www.svkir.com/papers/Nejadgholi-et-al-TCAVcausal-WOAH-2023.pdf):


*   Describing a protected group with negative emotions
*   Describing non-protected groups with fine-grained negative emotions






In [1]:
pip install transformers #version transformers-4.24.0

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.15.1-py3-none-any.whl (236 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m115.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m88.8 MB/s[0m eta [36m0:00:

In [None]:
cd "Working Directory"

In [3]:
from Roberta_model_data import RobertaClassifier, ToxicityDataset


In [4]:
import torch
import pandas as pd
import nltk
# Fetch the POS-tagger model used by the nltk.pos_tag calls that filter the emotion lexicon.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Download the NRC-EIL lexicon from https://saifmohammad.com/WebPages/AffectIntensity.htm  and save it to data_DIR

In [5]:
# Folder with one NRC lexicon file per emotion. It is used as a plain string prefix when
# the file names are built, so the trailing "/" must be kept.
data_DIR = "emotion_intensity_data/NRC-Emotion-Intensity-Lexicon/OneFilePerEmotion/"


# Functions to Calculate TCAV Scores

In [6]:
import torch.nn as nn
import numpy as np
import os
import pickle
import torch
from transformers import RobertaTokenizerFast
from torch.utils.data.dataloader import DataLoader
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

import random

# Seed every random number generator the notebook draws from, so repeated runs match:
# NumPy is used for sampling in the CAV code, the stdlib `random` module picks the
# protected group for each concept sentence, and torch is seeded for completeness.
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)


# Entries in the file are separated by a blank line. Read it as UTF-8 explicitly so the
# result does not depend on the platform's default encoding.
with open('random_stopword_tweets.txt', 'r', encoding='utf-8') as f_:
  random_examples = f_.read().split('\n\n')

random_concepts = random_examples[-1000:] #tweets collected with stop words as random inputs to the classifier


# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def cav_classifier(model, tokenizer, concept_reps, num_random_examples=200):
  """Train a linear classifier on concept vs. random examples and return its weight vector.

  The weight vector is orthogonal to the classification boundary and is used as the
  representation of the concept (the CAV).

  Args:
    model: model passed through to get_reps to embed the random examples.
    tokenizer: tokenizer passed through to get_reps.
    concept_reps: array of representations of the concept examples (one row each).
    num_random_examples: how many random (non-concept) texts to sample as negatives.
      Defaults to 200, the value that used to be hard-coded.

  Returns:
    clf.coef_[0], the coefficient vector of the fitted SGDClassifier.
  """
  # Negatives are drawn with replacement (np.random.choice default) from the whole random pool.
  random_example_id = list(np.random.choice(range(len(random_examples)), num_random_examples))
  non_concept_examples = [random_examples[i] for i in random_example_id]
  non_concept_reps = get_reps(model, tokenizer, non_concept_examples)

  # Label 1 = concept example, label 0 = random example.
  X = np.concatenate((concept_reps, non_concept_reps))
  Y = np.concatenate((np.ones([len(concept_reps)]), np.zeros([len(non_concept_reps)])))

  x_train, x_test, y_train, y_test = train_test_split(X, Y)
  clf = SGDClassifier(alpha=0.01, max_iter=1000, tol=1e-3)
  clf.fit(x_train, y_train)

  # Held-out accuracy is only reported, it does not influence the returned vector.
  preds = clf.predict(x_test)
  print('accs of cav classifier: ',(preds == y_test).mean())

  # Two classes -> a single coefficient row; return it as a 1-D vector.
  return clf.coef_[0]


def get_dataloader(X, y, tokenizer, batch_size, max_length=512):
  """Tokenize the texts in X and wrap them, with labels y, in a DataLoader.

  Args:
    X: list of input texts.
    y: labels, one per text (len(X) must equal len(y)).
    tokenizer: callable tokenizer returning the encodings for ToxicityDataset.
    batch_size: batch size of the returned DataLoader.
    max_length: maximum number of tokens per text. Without an explicit value,
      truncation=True has no effect when the tokenizer has no predefined maximum
      length (the tokenizer then warns "Default to no truncation").
      512 is assumed to fit the RoBERTa-base checkpoint used here - confirm if the model changes.

  Returns:
    A DataLoader over ToxicityDataset(encodings, y).
  """
  assert len(X) == len(y)
  # padding=True pads to the longest text in X, not to max_length.
  encodings = tokenizer(X, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
  dataset = ToxicityDataset(encodings, y)
  return DataLoader(dataset, batch_size=batch_size)

def get_reps(model, tokenizer, concept_examples, batch_size=64):
  """Return the model representation at position 0 for every input text.

  Args:
    model: callable returning a 3-tuple whose third element is a rank-3 representation
      tensor (presumably batch x tokens x hidden - confirm against RobertaClassifier).
    tokenizer: tokenizer handed to get_dataloader.
    concept_examples: list of texts to embed.
    batch_size: number of texts per forward pass. Defaults to 64, the value that was
      previously hard-coded (the old local `batch_size = 8` was never used).

  Returns:
    NumPy array with one row per input text.
  """
  # get_dataloader needs one label per text; the values are never read here.
  concept_labels = torch.ones([len(concept_examples)])
  concept_dataloader = get_dataloader(concept_examples, concept_labels, tokenizer, batch_size)

  concept_repres = []
  with torch.no_grad():  # inference only, no gradients needed
    for batch in concept_dataloader:
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      _, _, representation = model(input_ids, attention_mask=attention_mask)
      # keep only index 0 along the second axis
      concept_repres.append(representation[:,0,:])

  return torch.cat(concept_repres, dim=0).cpu().detach().numpy()

def statistical_testing(model, tokenizer, concept_examples, classifier_train, num_runs=10):
  """Compute num_runs CAVs for the concept so TCAV scores can be tested statistically.

  The concept examples are embedded once with get_reps. Each run then produces one CAV:
  either from cav_classifier (classifier_train is truthy) or as the mean of 50
  representations sampled with replacement (classifier_train is falsy).

  Returns:
    A list with num_runs CAV vectors.
  """
  concept_repres = get_reps(model, tokenizer, concept_examples)

  def _single_cav():
    # CAV from a trained linear classifier
    if classifier_train:
      return cav_classifier(model, tokenizer, concept_repres)
    # CAV by averaging a random sample of concept representations
    sampled_ids = list(np.random.choice(range(len(concept_repres)), 50))
    sampled_reps = [concept_repres[idx] for idx in sampled_ids]
    return np.mean(sampled_reps, axis = 0)

  return [_single_cav() for _ in range(num_runs)]

def get_logits_grad(model, tokenizer, sample, desired_class, max_length=512):
  """Return the logits for `sample` and the gradient of one logit w.r.t. the representation.

  Args:
    model: callable returning (logits, _, representation); after backward() it exposes
      the gradient as `model.grad_representation` (presumably via a hook inside the
      model class - confirm against RobertaClassifier).
    tokenizer: tokenizer used to encode `sample`.
    sample: input text.
    desired_class: index of the logit that is differentiated.
    max_length: maximum number of tokens; needed for truncation=True to take effect
      when the tokenizer has no predefined maximum length.

  Returns:
    (logits, grad): logits detached from the autograd graph, and grad as a NumPy array
    taken from grad_representation[0][0].
  """
  # `encoded` rather than `input`, to avoid shadowing the builtin.
  encoded = tokenizer(sample, truncation=True, padding=True, max_length=max_length, return_tensors="pt")
  model.zero_grad()
  input_ids = encoded['input_ids'].to(device)
  attention_mask = encoded['attention_mask'].to(device)
  logits, _, _ = model(input_ids, attention_mask=attention_mask)

  logits[0, desired_class].backward()
  grad = model.grad_representation
  # first element along each of the two leading axes
  grad = grad[0][0].cpu().numpy()

  # Detach so that callers collecting logits for many samples do not keep the
  # autograd graph of every forward pass alive.
  return logits.detach(), grad

def get_preds_tcavs(model, tokenizer, desired_class = 1, examples_set = 'random', concept_examples = random_concepts, classifier_train = False, Magnitude = True, num_runs = 10):
  """Compute logits, sensitivities and TCAV scores of `model` for one concept.

  Args:
    model, tokenizer: classifier and tokenizer handed to the helper functions.
    desired_class: index of the logit whose gradient is used.
    examples_set: the string 'random' to use the first 500 random examples, or a
      collection of input texts.
    concept_examples: texts that define the concept.
    classifier_train: True builds CAVs with a linear classifier, False by averaging.
    Magnitude: True returns TCAV_mag (sum of the positive sensitivities divided by the
      number of examples); False returns TCAV_dir (fraction of examples with a
      positive sensitivity).
    num_runs: number of CAVs, and therefore of TCAV scores, to compute.

  Returns:
    (logits, sensitivities, tcavs): list of per-example logits, array of
    sensitivities with one row per example and one column per run, and list of
    num_runs TCAV scores.

  Raises:
    ValueError: if there are no input examples.
  """
  # Check the type first so that an array-like examples_set is never compared
  # element-wise against the string.
  if isinstance(examples_set, str) and examples_set == 'random':
    examples = random_examples[:500]   # input examples
  else:
    examples = examples_set

  if len(examples) == 0:
    raise ValueError('get_preds_tcavs needs at least one input example')

  print('calculating cavs...')
  model.to(device)
  concept_cavs = statistical_testing(model, tokenizer, concept_examples, classifier_train = classifier_train, num_runs = num_runs)

  print('calculating logits and grads...')
  logits = []
  grads = []
  for sample in examples:
    logit, grad = get_logits_grad(model, tokenizer, sample, desired_class)
    grads.append(grad)
    logits.append(logit)

  # Sensitivity = directional derivative of the logit along each CAV.
  sensitivities = np.array([[np.dot(grad, cav) for cav in concept_cavs] for grad in grads])

  tcavs = []
  for run in range(num_runs):
    run_sens = sensitivities[:, run]
    positive = run_sens[run_sens > 0]
    if Magnitude:
      tcavs.append(positive.sum() / len(examples))  # an empty selection sums to 0
    else:
      tcavs.append(len(positive) / len(examples))

  print('__________________________________ ')
  print('Mean and std of TCAV score for the concept: ')
  print('__________________________________ ')
  print(np.mean(tcavs),np.std(tcavs))

  return logits, sensitivities, tcavs


# Load TweetEval

In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
TweetEval_path = './model'
TweetEval_name = "cardiffnlp/twitter-roberta-base-offensive"

# Do not hard-code a Hugging Face token in the notebook: read it from the HF_TOKEN
# environment variable. When the variable is unset this is None (no token is sent).
hf_token = os.environ.get('HF_TOKEN')

# save_pretrained() does not return the object it saves, so load and save are separate
# statements; chained, these names ended up bound to the save call's return value.
TweetEval_model = AutoModelForSequenceClassification.from_pretrained(TweetEval_name, use_auth_token=hf_token)
TweetEval_model.save_pretrained(TweetEval_path)
TweetEval_tokenizer = AutoTokenizer.from_pretrained(TweetEval_name, use_auth_token=hf_token)
TweetEval_tokenizer.save_pretrained(TweetEval_path)

Downloading (…)lve/main/config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [8]:
# Human-readable names of the two output classes, keyed by the class index as a string.
# NOTE(review): the TCAV calls in this notebook pass desired_class = 1, presumably the
# "offensive" entry here - confirm against the model config.
id2label={"0": "non-offensive",
    "1": "offensive"
  }

In [9]:
# Build the project's RobertaClassifier from the checkpoint folder and load the fast
# tokenizer from the same folder.
model = RobertaClassifier(TweetEval_path)
tokenizer = RobertaTokenizerFast.from_pretrained(TweetEval_path)


# Section 4

## Template := "\<protected_group\> are \<emotion_words\>"


## Emotion: Anger and disgust with intensity higher than 0.5

---



In [10]:
# Group names that fill the <protected_group> slot of the concept template.
protected_groups = ['women', 'trans people', 'gay people', 'black people', 'disabled people', 'muslims', 'immigrants']

In [11]:
# For every emotion, collect the (word, POS tag) pairs of lexicon words whose intensity
# is above 0.5 and whose tag is JJ, VBD or VBN (Penn Treebank: adjective, past-tense
# verb, past participle).
emotion_words = dict()

for item in ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']:
  data = pd.read_csv(data_DIR+item+"-NRC-Emotion-Intensity-Lexicon-v1.txt", sep ='\t')
  # .copy() so the new 'tag' column is added to an independent frame rather than to a
  # slice of the full table (which triggers pandas' SettingWithCopyWarning).
  data = data[data['Emotion-Intensity-Score']>0.5].copy()
  # `tagged` instead of `item`, so the emotion name of the outer loop is not shadowed.
  data['tag'] = [tagged[1] for tagged in nltk.pos_tag(data['English Word'].values)]
  has_wanted_tag = data['tag'].isin(['JJ','VBD','VBN'])
  words = data[has_wanted_tag]['English Word'].values
  tags = data[has_wanted_tag]['tag'].values
  emotion_words[item] = list(zip(words, tags))

### TCAV<sub>Magnitude</sub>

In [13]:
# One concept sentence "<group> are <word>." per disgust/anger word; the group is picked
# at random for every sentence.
negative_word_tags = emotion_words['disgust'] + emotion_words['anger']
concept = [random.choice(protected_groups) + ' are ' + word_tag[0] + '.' for word_tag in negative_word_tags]
print('\n\n')
# TCAV_Magnitude of the "offensive" logit (class 1) with classifier-based CAVs.
logits, sensitivity, TCAV = get_preds_tcavs(
    model,
    tokenizer,
    desired_class=1,
    examples_set='random',
    concept_examples=concept,
    classifier_train=True,
    Magnitude=True,
    num_runs=10,
)




calculating cavs...


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


accs of cav classifier:  0.9931972789115646
accs of cav classifier:  0.9863945578231292
accs of cav classifier:  1.0
accs of cav classifier:  0.9931972789115646
accs of cav classifier:  0.9795918367346939
accs of cav classifier:  0.9659863945578231
accs of cav classifier:  0.9523809523809523
accs of cav classifier:  0.9795918367346939
accs of cav classifier:  1.0
accs of cav classifier:  0.9727891156462585
calculating logits and grads...
__________________________________ 
Mean and std of TCAV score for the concept: 
__________________________________ 
0.16619942827833056 0.02727653324395077


### TCAV<sub>Direction</sub>

In [15]:
# One concept sentence "<group> are <word>." per disgust/anger word; the group is picked
# at random for every sentence.
negative_word_tags = emotion_words['disgust'] + emotion_words['anger']
concept = [random.choice(protected_groups) + ' are ' + word_tag[0] + '.' for word_tag in negative_word_tags]
print('\n\n')
# TCAV_Direction (Magnitude=False) of the class-1 logit with classifier-based CAVs.
logits, sensitivity, TCAV = get_preds_tcavs(
    model,
    tokenizer,
    desired_class=1,
    examples_set='random',
    concept_examples=concept,
    classifier_train=True,
    Magnitude=False,
    num_runs=10,
)




calculating cavs...
accs of cav classifier:  0.9863945578231292
accs of cav classifier:  1.0
accs of cav classifier:  0.9795918367346939
accs of cav classifier:  0.8843537414965986
accs of cav classifier:  0.9727891156462585
accs of cav classifier:  0.9931972789115646
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  0.9795918367346939
calculating logits and grads...
__________________________________ 
Mean and std of TCAV score for the concept: 
__________________________________ 
1.0 0.0


# Section 5

## Template := "They are \<emotion_words\>"


## Emotion: Negative emotions with intensity higher than 0.7

---



In [16]:
# Prefix and suffix of the "They are <emotion_words>." template.
# NOTE(review): the concept sentences in the following cells are built from string
# literals rather than from these two names - confirm whether they are still needed.
template_1 = ["They are "]
template_2 = "."

In [17]:
# For every emotion, collect the (word, POS tag) pairs of lexicon words whose intensity
# is above 0.7 and whose tag is JJ, VBD or VBN (Penn Treebank: adjective, past-tense
# verb, past participle).
emotion_words = dict()

for item in ['anger','anticipation','disgust','fear','joy','sadness','surprise','trust']:
  data = pd.read_csv(data_DIR+item+"-NRC-Emotion-Intensity-Lexicon-v1.txt", sep ='\t')
  # .copy() so the new 'tag' column is added to an independent frame rather than to a
  # slice of the full table (which triggers pandas' SettingWithCopyWarning).
  data = data[data['Emotion-Intensity-Score']>0.7].copy()
  # `tagged` instead of `item`, so the emotion name of the outer loop is not shadowed.
  data['tag'] = [tagged[1] for tagged in nltk.pos_tag(data['English Word'].values)]
  has_wanted_tag = data['tag'].isin(['JJ','VBD','VBN'])
  words = data[has_wanted_tag]['English Word'].values
  tags = data[has_wanted_tag]['tag'].values
  emotion_words[item] = list(zip(words, tags))

### TCAV<sub>Magnitude</sub>

In [19]:
# TCAV_Magnitude per negative emotion; the mean and standard deviation over the runs
# are collected in parallel lists, in the same order as `emotions`.
emotions = []
mean_TCAV = []
std_TCAV = []
for emotion in ['anger', 'fear', 'disgust', 'sadness']:
  # one "They are <word>." sentence per word of this emotion
  concept = ["They are " + word_tag[0] + "." for word_tag in emotion_words[emotion]]
  print('\n\n')
  print(emotion, '\n')
  logits, sensitivity, TCAV = get_preds_tcavs(
      model,
      tokenizer,
      desired_class=1,
      examples_set='random',
      concept_examples=concept,
      classifier_train=True,
      Magnitude=True,
      num_runs=10,
  )
  emotions.append(emotion)
  mean_TCAV.append(np.mean(TCAV))
  std_TCAV.append(np.std(TCAV))




anger 

calculating cavs...
accs of cav classifier:  0.971830985915493
accs of cav classifier:  1.0
accs of cav classifier:  0.971830985915493
accs of cav classifier:  1.0
accs of cav classifier:  0.9577464788732394
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  0.9014084507042254
accs of cav classifier:  0.9859154929577465
accs of cav classifier:  1.0
calculating logits and grads...
__________________________________ 
Mean and std of TCAV score for the concept: 
__________________________________ 
0.24262771041461195 0.0709363230305687



fear 

calculating cavs...
accs of cav classifier:  0.987012987012987
accs of cav classifier:  0.987012987012987
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  0.974025974025974
accs of cav classifier:  0.987012987012987
accs of cav classifier:  1.0
accs of cav classifier:  0.961038961038961
accs of cav classifier:  1.0
calculating logits and grads

### TCAV<sub>Direction</sub>

In [20]:
# TCAV_Direction (Magnitude=False) per negative emotion; the mean and standard deviation
# over the runs are collected in parallel lists, in the same order as `emotions`.
emotions = []
mean_TCAV = []
std_TCAV = []
for emotion in ['anger', 'fear', 'disgust', 'sadness']:
  # one "They are <word>." sentence per word of this emotion
  concept = ["They are " + word_tag[0] + "." for word_tag in emotion_words[emotion]]
  print('\n\n')
  print(emotion, '\n')
  logits, sensitivity, TCAV = get_preds_tcavs(
      model,
      tokenizer,
      desired_class=1,
      examples_set='random',
      concept_examples=concept,
      classifier_train=True,
      Magnitude=False,
      num_runs=10,
  )
  emotions.append(emotion)
  mean_TCAV.append(np.mean(TCAV))
  std_TCAV.append(np.std(TCAV))




anger 

calculating cavs...
accs of cav classifier:  1.0
accs of cav classifier:  0.9436619718309859
accs of cav classifier:  1.0
accs of cav classifier:  0.7323943661971831
accs of cav classifier:  1.0
accs of cav classifier:  0.9436619718309859
accs of cav classifier:  0.9577464788732394
accs of cav classifier:  0.9859154929577465
accs of cav classifier:  1.0
accs of cav classifier:  1.0
calculating logits and grads...
__________________________________ 
Mean and std of TCAV score for the concept: 
__________________________________ 
1.0 0.0



fear 

calculating cavs...
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  0.974025974025974
accs of cav classifier:  1.0
accs of cav classifier:  0.961038961038961
accs of cav classifier:  0.961038961038961
accs of cav classifier:  0.961038961038961
accs of cav classifier:  1.0
accs of cav classifier:  1.0
accs of cav classifier:  0.974025974025974
calculating logits and grads...
_________________________