In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.1


In [2]:
!wget https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
!tar -xf semeval2017_task7.tar.xz
#!tar -xvf semeval2017_task7.tar.xz
#%cd semeval2017_task7/
#%cd ..
%ls

--2023-05-12 07:47:26--  https://alt.qcri.org/semeval2017/task7/data/uploads/semeval2017_task7.tar.xz
Resolving alt.qcri.org (alt.qcri.org)... 80.76.166.231
Connecting to alt.qcri.org (alt.qcri.org)|80.76.166.231|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 748424 (731K) [application/x-xz]
Saving to: ‘semeval2017_task7.tar.xz’


2023-05-12 07:47:29 (652 KB/s) - ‘semeval2017_task7.tar.xz’ saved [748424/748424]

[0m[01;34msample_data[0m/  [01;34msemeval2017_task7[0m/  semeval2017_task7.tar.xz


In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import numpy as np
import copy
import xml.etree.ElementTree as ET

In [4]:
# Run on the first CUDA device when one is visible, otherwise fall back to CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"

# Tokenizer and masked-language-model head come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained(
    "bert-base-uncased",
    output_hidden_states=True,
)
model = model.to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def _token_probabilities(encoded, mask_id, batch_size):
    """Score every non-special token of one encoded sentence.

    For each position between the first and last token (which the tokenizer
    adds as sentence start/end markers), the token is replaced by the mask
    token and the model's probability of the original token at that position
    is recorded.

    Args:
        encoded: mapping of tensors shaped (1, seq_len) as returned by
            ``tokenizer(sentence, return_tensors="pt")``; must contain
            ``'input_ids'``.
        mask_id: vocabulary id of the mask token.
        batch_size: number of masked copies sent through the model at once.

    Returns:
        list[float]: one probability per non-special token, in sentence order.
    """
    input_ids = encoded['input_ids']
    seq_len = input_ids.shape[1]
    # Positions 1 .. seq_len-2: skip the start and end markers.
    positions = torch.arange(1, max(seq_len - 1, 1), device=input_ids.device)

    probabilities = []
    for first in range(0, positions.numel(), batch_size):
        chunk = positions[first:first + batch_size]
        rows = torch.arange(chunk.numel(), device=input_ids.device)
        # One copy of the sentence per masked position; repeat() allocates new
        # tensors, so the original encoding is never modified.
        batch = {key: value.repeat(chunk.numel(), 1) for key, value in encoded.items()}
        batch['input_ids'][rows, chunk] = mask_id
        # Inference only: do not build autograd graphs.
        with torch.no_grad():
            logits = model(**batch).logits
        # Row r holds the sentence masked at chunk[r]; keep only that position.
        chunk_probs = torch.nn.functional.softmax(logits[rows, chunk], dim=-1)
        probabilities.extend(chunk_probs[rows, input_ids[0, chunk]].tolist())
    return probabilities


def _least_likely_word(pun, ratio, mask_id, batch_size):
    """Return the id of the least probable word in the tail of one pun.

    Args:
        pun: mapping ``word id -> word text`` in sentence order.
        ratio: fraction of the sentence (by word count) to skip from the start.
        mask_id: vocabulary id of the mask token.
        batch_size: forwarded to ``_token_probabilities``.

    Raises:
        ValueError: if ``pun`` contains no words.
    """
    word_ids = list(pun)
    if not word_ids:
        raise ValueError("cannot locate a pun word in a sentence with no words")
    # A missing text (None) is treated as an empty word instead of crashing.
    words = [pun[word_id] or '' for word_id in word_ids]

    # Number of subword tokens each word contributes. The scoring below relies
    # on these counts adding up to the token count of the joined sentence.
    token_counts = [
        len(tokenizer.tokenize((' ' if index else '') + word))
        for index, word in enumerate(words)
    ]

    encoded = tokenizer(' '.join(words), return_tensors="pt").to(device)
    token_probs = _token_probabilities(encoded, mask_id, batch_size)

    # A word's probability is the product of its subword probabilities; a word
    # that produced no tokens keeps probability 1.0.
    word_probs = np.ones((len(word_ids),), dtype=np.float64)
    cursor = 0
    for word_index, count in enumerate(token_counts):
        product = 1.0
        for _ in range(count):
            product *= token_probs[cursor]
            cursor += 1
        word_probs[word_index] = product

    # Only words from `ratio` of the way into the sentence onward are
    # candidates; the last word is always a candidate.
    start_index = min(len(word_ids) - 1, int(ratio * len(word_ids)))
    min_index = start_index + int(np.argmin(word_probs[start_index:]))
    return word_ids[min_index]


def bert(puns, ratio=0, batch_size=16):
    """Locate the pun word of each sentence with a masked language model.

    Every token is masked in turn and the model's probability of the original
    token is recorded; the word with the lowest probability (searching only
    from ``ratio`` of the way into the sentence onward) is returned.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        puns: iterable of dicts ``{text id: {word id: word text}}``. One
            result is produced for every inner word mapping.
        ratio: fraction of each sentence (by word count) to skip from the
            start before looking for the least probable word.
        batch_size: how many masked copies of a sentence are evaluated per
            forward pass. Batching may change probabilities by floating-point
            rounding compared with one-at-a-time evaluation.

    Returns:
        list: the selected word id for each pun, in input order.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or a pun has no words.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")
    # Ask the tokenizer for the mask id instead of hard-coding 103.
    mask_id = tokenizer.mask_token_id
    res = []
    for entry in tqdm(puns):
        for pun in entry.values():
            res.append(_least_likely_word(pun, ratio, mask_id, batch_size))
    return res

In [6]:
f = 'semeval2017_task7/data/test/subtask2-homographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# One single-entry dict per <text> element: {text id: {word id: word text}}.
puns = [
    {text.attrib['id']: {word.attrib['id']: word.text for word in text}}
    for text in myroot.findall('./text')
]

print(puns[0])

{'hom_1': {'hom_1_1': 'They', 'hom_1_2': 'hid', 'hom_1_3': 'from', 'hom_1_4': 'the', 'hom_1_5': 'gunman', 'hom_1_6': 'in', 'hom_1_7': 'a', 'hom_1_8': 'sauna', 'hom_1_9': 'where', 'hom_1_10': 'they', 'hom_1_11': 'could', 'hom_1_12': 'sweat', 'hom_1_13': 'it', 'hom_1_14': 'out', 'hom_1_15': '.'}}


In [7]:
# Gold answers: keep the second tab-separated field of every line.
gold = []
with open('semeval2017_task7/data/test/subtask2-homographic-test.gold', 'r') as fin:
    gold.extend(line.strip().split('\t')[1] for line in fin)
print(gold[:5])

['hom_1_12', 'hom_2_9', 'hom_3_7', 'hom_4_5', 'hom_5_15']


In [8]:
# Quick check on three puns before the full run.
bert(puns[30:33], ratio=0.6)

100%|██████████| 3/3 [00:13<00:00,  4.53s/it]


['hom_39_16', 'hom_41_16', 'hom_43_8']

In [9]:
# Score every homographic pun; with ratio=0.6 only words from 60% of the way
# into each sentence onward are candidates.
homographic_predictions = bert(puns, ratio=0.6)
print("accuracy_score:", accuracy_score(gold, homographic_predictions))

100%|██████████| 1607/1607 [1:00:57<00:00,  2.28s/it]

accuracy_score: 0.5028002489110143





In [10]:
f = 'semeval2017_task7/data/test/subtask2-heterographic-test.xml'

mytree = ET.parse(f)
myroot = mytree.getroot()

# One single-entry dict per <text> element: {text id: {word id: word text}}.
puns = [
    {text.attrib['id']: {word.attrib['id']: word.text for word in text}}
    for text in myroot.findall('./text')
]

print(puns[0])

{'het_1': {'het_1_1': "'", 'het_1_2': "'", 'het_1_3': 'I', 'het_1_4': "'", 'het_1_5': 'm', 'het_1_6': 'halfway', 'het_1_7': 'up', 'het_1_8': 'a', 'het_1_9': 'mountain', 'het_1_10': ',', 'het_1_11': "'", 'het_1_12': "'", 'het_1_13': 'Tom', 'het_1_14': 'alleged', 'het_1_15': '.'}}


In [11]:
# Gold answers: keep the second tab-separated field of every line.
gold = []
with open('semeval2017_task7/data/test/subtask2-heterographic-test.gold', 'r') as fin:
    gold.extend(line.strip().split('\t')[1] for line in fin)
print(gold[:5])

['het_1_14', 'het_2_13', 'het_4_11', 'het_5_5', 'het_7_6']


In [12]:
# Score every heterographic pun; with ratio=0.6 only words from 60% of the way
# into each sentence onward are candidates.
heterographic_predictions = bert(puns, ratio=0.6)
print("accuracy_score:", accuracy_score(gold, heterographic_predictions))

100%|██████████| 1271/1271 [53:54<00:00,  2.54s/it]

accuracy_score: 0.6081825334382376





In [13]:
# Demo: a single word can be split into several subword pieces; continuation
# pieces carry a "##" prefix.
tokenizer.tokenize("pretrain")

['pre', '##train']

In [14]:
# Demo: the uncased tokenizer lower-cases its input before splitting it into
# subword pieces.
tokenizer.tokenize("ChatGPT")

['chat', '##gp', '##t']