<a href="https://colab.research.google.com/github/JacopoMangiavacchi/SBERT-ZSC/blob/main/ZSC_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers



In [None]:
# Shared test fixture: one input sentence and the candidate topic labels
# that every zero-shot approach below is scored against.
sentence = "Who are you voting for in 2020?"
labels = [
  "business",
  "art & culture",
  "politics",
]

# Test HuggingFace Zero Shot Classification Pipeline

In [None]:
from transformers import pipeline

# Name the checkpoint explicitly instead of relying on the pipeline default,
# so the notebook keeps using the same model if the library default changes.
# 'facebook/bart-large-mnli' is the model the unpinned call resolved to.
classifier = pipeline('zero-shot-classification',
                      model='facebook/bart-large-mnli')

No model was supplied, defaulted to facebook/bart-large-mnli (https://huggingface.co/facebook/bart-large-mnli)


In [None]:
# Score the sentence against each candidate label with the zero-shot
# pipeline and display the returned labels/scores.
classes = classifier(sequences=sentence, candidate_labels=labels)
classes

{'labels': ['politics', 'business', 'art & culture'],
 'scores': [0.9604313373565674, 0.02018606849014759, 0.01938255876302719],
 'sequence': 'Who are you voting for in 2020?'}

# Test with simple BERT Sentence Embedding and cosine similarity

In [None]:
from transformers import AutoTokenizer, AutoModel
from torch.nn import functional as F
from scipy import spatial

# Sentence-BERT encoder used to embed both the sentence and the labels.
model = AutoModel.from_pretrained('deepset/sentence_bert')

# The tokenizer is taken from the stock 'bert-base-uncased' checkpoint rather
# than from the model's own checkpoint (alternative kept below for reference).
# NOTE(review): presumably both checkpoints share the same vocabulary - confirm.
# tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at deepset/sentence_bert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Tokenize the sentence and all labels as one batch. `padding=True` pads every
# row to the longest sequence in the batch; it replaces the deprecated
# `pad_to_max_length=True`, which behaves the same way when no max_length
# is given.
inputs = tokenizer([sentence] + labels,
                   return_tensors='pt',
                   padding=True)



In [None]:
import torch  # only `torch.nn.functional` has been imported up to this cell

input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Inference only: no autograd graph is needed.
with torch.no_grad():
  output = model(input_ids, attention_mask=attention_mask)[0]

# Mean-pool over the sequence dimension, counting real tokens only.
# The labels are padded to the length of the longest row, and a plain
# `.mean(dim=1)` would average the padding positions into each embedding.
mask = attention_mask.unsqueeze(-1).to(output.dtype)
pooled = (output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)

# Row 0 is the sentence; the remaining rows are the labels, in `labels` order.
sentence_rep = pooled[:1]
label_reps = pooled[1:]

In [None]:
# Cosine similarity between the sentence embedding and every label embedding,
# printed in `labels` order. Iterating over `label_reps` (instead of the
# hard-coded indices 0..2) keeps this cell correct for any number of labels.
for label_rep in label_reps:
  print(F.cosine_similarity(sentence_rep[0], label_rep, dim=0))

tensor(0.0045, grad_fn=<DivBackward0>)
tensor(-0.0274, grad_fn=<DivBackward0>)
tensor(0.2156, grad_fn=<DivBackward0>)


In [None]:
# Cross-check the torch result with SciPy on plain NumPy arrays.
sentence_embedding = sentence_rep.detach().numpy()
label_embedding = label_reps.detach().numpy()

# scipy returns a cosine *distance*; 1 - distance is the similarity.
# Loop over every label row rather than the hard-coded indices 0..2.
for label_vector in label_embedding:
  print(1 - spatial.distance.cosine(sentence_embedding[0], label_vector))


0.00452416529878974
-0.02739686518907547
0.2156151980161667


In [None]:
# Rank the labels by cosine similarity to the sentence, best match first.
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest.tolist():
  label, score = labels[ind], similarities[ind]
  print(f'label: {label} \t similarity: {score}')

label: politics 	 similarity: 0.2156151831150055
label: business 	 similarity: 0.004524169024080038
label: art & culture 	 similarity: -0.02739686332643032


In [None]:
# Show the raw similarities next to their softmax-normalised values.
# `dim=0` is passed explicitly: `similarities` is 1-D, and omitting `dim`
# triggers PyTorch's implicit-dimension deprecation warning.
similarities, F.softmax(similarities, dim=0)

  """Entry point for launching an IPython kernel.


(tensor([ 0.0045, -0.0274,  0.2156], grad_fn=<DivBackward0>),
 tensor([0.3121, 0.3023, 0.3855], grad_fn=<SoftmaxBackward>))

# Test augmenting labels with static embedding neighbors (GloVe) and BERT Sentence Embedding with cosine similarity

In [None]:
import torchtext.vocab

# Static GloVe word vectors ('6B' set, 100 dimensions), used below to look up
# nearest-neighbour words for each label.
glove = torchtext.vocab.GloVe(name='6B', dim=100)
vocab_size = len(glove.itos)
print(f"{vocab_size} words in dictionary")

400000 words in dictionary


In [None]:
import torch

def get_vector(embeddings, w):
  """Return the embedding vector stored for word `w`.

  `embeddings` must expose `stoi` (word -> row index) and `vectors`
  (indexable by that row index). A word missing from `stoi` propagates
  whatever error the `stoi[w]` lookup raises.
  """
  return embeddings.vectors[embeddings.stoi[w]]

def closest_words(embeddings, vector, n=10):
  """Return the `n` vocabulary words closest to `vector`.

  Closeness is the Euclidean (L2) distance between `vector` and each row of
  `embeddings.vectors`. The result is a list of `(word, distance)` tuples
  sorted by ascending distance; ties keep their `embeddings.itos` order.
  """
  # One vectorised pass over the whole embedding table instead of a separate
  # torch.dist call (and Python-level lookup) per vocabulary word.
  distances = (embeddings.vectors - vector).pow(2).sum(dim=1).sqrt().tolist()
  return sorted(zip(embeddings.itos, distances), key=lambda pair: pair[1])[:n]

In [None]:
# Sanity check: nearest GloVe neighbours of the word 'politics'.
politics_vector = get_vector(glove, 'politics')
closest_words(glove, politics_vector)

[('politics', 0.0),
 ('political', 3.8383750915527344),
 ('debate', 4.631179332733154),
 ('matters', 4.661602973937988),
 ('influence', 4.729617118835449),
 ('culture', 4.731587886810303),
 ('rather', 4.750455856323242),
 ('history', 4.752238750457764),
 ('politicians', 4.768784999847412),
 ('matter', 4.817280292510986)]

In [None]:
# Build one (word, distance) neighbour list per individual word.
# Compound labels such as 'art & culture' are split on ' & ', so each of
# their words contributes its own list.
labels_neighbours = []
for label in labels:
  for word in label.split(' & '):
    labels_neighbours.append(closest_words(glove, get_vector(glove, word)))
labels_neighbours

[[('business', 0.0),
  ('industry', 3.5567009449005127),
  ('businesses', 3.84977126121521),
  ('marketing', 3.870338201522827),
  ('corporate', 3.901237726211548),
  ('enterprise', 4.052821636199951),
  ('companies', 4.098732948303223),
  ('company', 4.115787982940674),
  ('well', 4.250703811645508),
  ('commercial', 4.251638889312744)],
 [('art', 0.0),
  ('arts', 3.688779592514038),
  ('museum', 3.934798240661621),
  ('sculpture', 4.103562355041504),
  ('works', 4.126135349273682),
  ('photography', 4.151274681091309),
  ('contemporary', 4.155360221862793),
  ('painting', 4.276235103607178),
  ('gallery', 4.385191440582275),
  ('collection', 4.4654622077941895)],
 [('culture', 0.0),
  ('cultural', 3.783661127090454),
  ('tradition', 4.208914279937744),
  ('traditions', 4.227012634277344),
  ('cultures', 4.243590831756592),
  ('civilization', 4.2488861083984375),
  ('society', 4.413925647735596),
  ('history', 4.420716285705566),
  ('religion', 4.51834774017334),
  ('context', 4.55044

In [None]:
# For every label, collect the neighbour words (distances dropped) of each of
# its ' & '-separated words into one flat list, so that `labels_neighbours`
# lines up index-for-index with `labels`.
labels_neighbours = [
  [neighbour
   for word in label.split(' & ')
   for neighbour, _ in closest_words(glove, get_vector(glove, word))]
  for label in labels
]

labels_neighbours

[['business',
  'industry',
  'businesses',
  'marketing',
  'corporate',
  'enterprise',
  'companies',
  'company',
  'well',
  'commercial'],
 ['art',
  'arts',
  'museum',
  'sculpture',
  'works',
  'photography',
  'contemporary',
  'painting',
  'gallery',
  'collection',
  'culture',
  'cultural',
  'tradition',
  'traditions',
  'cultures',
  'civilization',
  'society',
  'history',
  'religion',
  'context'],
 ['politics',
  'political',
  'debate',
  'matters',
  'influence',
  'culture',
  'rather',
  'history',
  'politicians',
  'matter']]

In [None]:
# Turn each label's neighbour words into one ' & '-separated pseudo-sentence.
labels_sentences = list(map(' & '.join, labels_neighbours))
labels_sentences

['business & industry & businesses & marketing & corporate & enterprise & companies & company & well & commercial',
 'art & arts & museum & sculpture & works & photography & contemporary & painting & gallery & collection & culture & cultural & tradition & traditions & cultures & civilization & society & history & religion & context',
 'politics & political & debate & matters & influence & culture & rather & history & politicians & matter']

In [None]:
# Encode the sentence together with the augmented label sentences.
# `padding=True` pads every row to the longest one in the batch and replaces
# the deprecated `pad_to_max_length=True`.
inputs = tokenizer([sentence] + labels_sentences,
                   return_tensors='pt',
                   padding=True)

input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Inference only: no autograd graph is needed.
with torch.no_grad():
  output = model(input_ids, attention_mask=attention_mask)[0]

# Mean-pool over the sequence dimension, counting real tokens only.
# The label sentences are longer than the input sentence here, so an unmasked
# mean would mix padding positions into the sentence embedding.
mask = attention_mask.unsqueeze(-1).to(output.dtype)
pooled = (output * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
sentence_rep = pooled[:1]
label_reps = pooled[1:]

# Rank the original labels by the similarity of their augmented sentence.
similarities = F.cosine_similarity(sentence_rep, label_reps)
closest = similarities.argsort(descending=True)
for ind in closest:
  print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')



label: politics 	 similarity: 0.19136422872543335
label: business 	 similarity: 0.01101242657750845
label: art & culture 	 similarity: -0.04034052789211273


# Test with BART MNLI Sequence Classification large (BART-LARGE-MNLI) with simple labels

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# NLI classifier and its tokenizer, both loaded from the same
# 'facebook/bart-large-mnli' checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-mnli')

In [None]:
sentence = 'Who are you voting for in 2020?'
labels = ['business', 'art & culture', 'politics']

print(f"{sentence}\n")
for label in labels:
  # Encode the sentence and the plain label as one sequence pair and run it
  # through the MNLI model.
  x = tokenizer.encode(sentence, label, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label)

Who are you voting for in 2020?

0.25931456685066223 business
0.06867887079715729 art & culture
0.7672102451324463 politics


# Test BART MNLI Sequence Classification with labels augmented by static embedding neighbors (GloVe)

In [None]:
print(f"{sentence}\n")
for label_sentence in labels_sentences:
  # Encode the sentence and the GloVe-augmented label sentence as one
  # sequence pair and run it through the MNLI model.
  x = tokenizer.encode(sentence, label_sentence, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label_sentence)

Who are you voting for in 2020?

0.7021744251251221 business & industry & businesses & marketing & corporate & enterprise & companies & company & well & commercial
0.9785778522491455 art & arts & museum & sculpture & works & photography & contemporary & painting & gallery & collection & culture & cultural & tradition & traditions & cultures & civilization & society & history & religion & context
0.919766902923584 politics & political & debate & matters & influence & culture & rather & history & politicians & matter


In [None]:
sentence = "Who are you voting for in 2020?"
labels = ['foreign policy', 'Europe', 'elections & vote', 'business & industry & businesses & marketing', '2020', 'outdoor recreation', 'politics & political & debate & matters & influence & culture & rather & history & politicians & matter', 'art & culture & arts & museum & sculpture & photography & painting & gallery']

print(f"{sentence}\n")
for label in labels:
  # Encode the sentence and the candidate label (plain or hand-augmented) as
  # one sequence pair and run it through the MNLI model.
  x = tokenizer.encode(sentence, label, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label)

Who are you voting for in 2020?

0.5547316074371338 foreign policy
0.03854397311806679 Europe
0.8834729194641113 elections & vote
0.04872368276119232 business & industry & businesses & marketing
0.9925204515457153 2020
0.16657975316047668 outdoor recreation
0.919766902923584 politics & political & debate & matters & influence & culture & rather & history & politicians & matter
0.13452966511249542 art & culture & arts & museum & sculpture & photography & painting & gallery


# Test with BERT MNLI Sequence Classification small (DistilBert-Uncased-MNLI) with simple labels

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Smaller NLI classifier and its tokenizer, both loaded from the same
# 'textattack/distilbert-base-uncased-MNLI' checkpoint.
model = AutoModelForSequenceClassification.from_pretrained('textattack/distilbert-base-uncased-MNLI')
tokenizer = AutoTokenizer.from_pretrained('textattack/distilbert-base-uncased-MNLI')

In [None]:
# Reset the fixture to the simple three-label setup for this model.
sentence = "Who are you voting for in 2020?"
labels = [
  "business",
  "art & culture",
  "politics",
]

In [None]:
print(f"{sentence}\n")
for label_sentence in labels:
  # Encode the sentence and the plain label as one sequence pair and run it
  # through the MNLI model.
  x = tokenizer.encode(sentence, label_sentence, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  # NOTE(review): this index order is assumed, not checked, for this
  # checkpoint - confirm against model.config.id2label.
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label_sentence)

Who are you voting for in 2020?

0.1496192067861557 business
0.6417831182479858 art & culture
0.8972325325012207 politics


In [None]:
sentence = "Who are you voting for in 2020?"
labels = ['foreign policy', 'Europe', 'elections & vote', 'business & industry & businesses & marketing', '2020', 'outdoor recreation', 'politics & political & debate & matters & influence & culture & rather & history & politicians & matter', 'art & culture & arts & museum & sculpture & photography & painting & gallery']

print(f"{sentence}\n")
for label in labels:
  # Encode the sentence and the candidate label (plain or hand-augmented) as
  # one sequence pair and run it through the MNLI model.
  x = tokenizer.encode(sentence, label, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  # NOTE(review): this index order is assumed, not checked, for this
  # checkpoint - confirm against model.config.id2label.
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label)

Who are you voting for in 2020?

0.8460844159126282 foreign policy
0.786884069442749 Europe
0.9221029877662659 elections & vote
0.9136582612991333 business & industry & businesses & marketing
0.6420761346817017 2020
0.5007476210594177 outdoor recreation
0.6367897987365723 politics & political & debate & matters & influence & culture & rather & history & politicians & matter
0.7028875946998596 art & culture & arts & museum & sculpture & photography & painting & gallery


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Second DistilBERT MNLI checkpoint for comparison; model and tokenizer are
# both loaded from 'ishan/distilbert-base-uncased-mnli'.
model = AutoModelForSequenceClassification.from_pretrained('ishan/distilbert-base-uncased-mnli')
tokenizer = AutoTokenizer.from_pretrained('ishan/distilbert-base-uncased-mnli')

Downloading:   0%|          | 0.00/639 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
# Reset the fixture to the simple three-label setup for this model.
sentence = "Who are you voting for in 2020?"
labels = [
  "business",
  "art & culture",
  "politics",
]

In [None]:
print(f"{sentence}\n")
for label_sentence in labels:
  # Encode the sentence and the plain label as one sequence pair and run it
  # through the MNLI model.
  x = tokenizer.encode(sentence, label_sentence, return_tensors='pt',
                       truncation=True)
  # Inference only: skip building an autograd graph.
  with torch.no_grad():
    logits = model(x)[0]

  # we throw away "neutral" (dim 1) and take the probability of
  # "entailment" (2) as the probability of the label being true
  # NOTE(review): this index order is assumed, not checked, for this
  # checkpoint - confirm against model.config.id2label.
  entail_contradiction_logits = logits[:,[0,2]]
  probs = entail_contradiction_logits.softmax(dim=1)
  prob_label_is_true = probs[:,1]
  print(prob_label_is_true.item(), label_sentence)

Who are you voting for in 2020?

0.09194833785295486 business
0.8147940635681152 art & culture
0.8844189643859863 politics


# Test with BART MNLI Sequence Classification (BART-LARGE-MNLI) with the mismatched 'bert-base-uncased' standard Tokenizer

In [None]:
# Deliberately pair the BART MNLI model with a tokenizer from a different
# checkpoint ('bert-base-uncased') to see whether the combination works.
# Alternative models tried in this slot are kept below for reference.
# model = AutoModelForSequenceClassification.from_pretrained('ishan/distilbert-base-uncased-mnli')
# model = AutoModelForSequenceClassification.from_pretrained('textattack/distilbert-base-uncased-MNLI')
model = AutoModelForSequenceClassification.from_pretrained('facebook/bart-large-mnli')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Reset the fixture to the simple three-label setup for this experiment.
sentence = "Who are you voting for in 2020?"
labels = [
  "business",
  "art & culture",
  "politics",
]

In [None]:
print(f"{sentence}\n")
# The tokenizer and the model come from different checkpoints in this
# experiment, and the forward pass fails with an IndexError. Catch and display
# that error so the failure is shown as the result of the experiment instead
# of aborting a Restart & Run All.
try:
  for label_sentence in labels:
    # Encode the sentence and the plain label as one sequence pair and run it
    # through the MNLI model.
    x = tokenizer.encode(sentence, label_sentence, return_tensors='pt',
                         truncation=True)
    # Inference only: skip building an autograd graph.
    with torch.no_grad():
      logits = model(x)[0]

    # we throw away "neutral" (dim 1) and take the probability of
    # "entailment" (2) as the probability of the label being true
    entail_contradiction_logits = logits[:,[0,2]]
    probs = entail_contradiction_logits.softmax(dim=1)
    prob_label_is_true = probs[:,1]
    print(prob_label_is_true.item(), label_sentence)
except IndexError as err:
  # NOTE(review): presumably the model expects special tokens that only its
  # own tokenizer emits - confirm the exact cause.
  print(f"IndexError: {err}")
  print("The model cannot consume inputs produced by the mismatched tokenizer.")

Who are you voting for in 2020?



IndexError: ignored