# Prepare the environment

In [5]:
!pip install transformers
!pip install datasets
!pip install --upgrade accelerate

Collecting transformers
  Using cached transformers-4.29.1-py3-none-any.whl (7.1 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Using cached tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
Collecting huggingface-hub<1.0,>=0.14.1
  Using cached huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Collecting packaging>=20.0
  Using cached packaging-23.1-py3-none-any.whl (48 kB)
Installing collected packages: tokenizers, packaging, huggingface-hub, transformers
  Attempting uninstall: packaging
    Found existing installation: packaging 20.1
    Uninstalling packaging-20.1:
      Successfully uninstalled packaging-20.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytest-astropy 0.8.0 requires pytest-cov>=2.0, which is not installed.
pytest-astropy 0.8.0 requires pytest-filter-subpackage>=0.1, which is not installed.
sag

In [6]:
import json
import numpy as np
import transformers
import random
from datasets import Dataset
from transformers import DataCollatorWithPadding
from transformers import Trainer
from transformers import TrainingArguments

In [7]:
semcor_samples = "semcor_samples_4-samples.json"

In [8]:
with open(semcor_samples, 'r') as f:
    full_samples = json.load(f)

In [9]:
uniq_sent_idx = list(set(full_samples['index']))

In [10]:
random.Random(4).shuffle(uniq_sent_idx)
train_idx = uniq_sent_idx[:24138]
val_idx = uniq_sent_idx[24138:24138+5173]
test_idx = uniq_sent_idx[24138+5173:24138+5173+5173]

In [11]:
len(train_idx), len(val_idx), len(test_idx)

(24138, 5173, 5173)

# Test for baseline

In [12]:
test_samples = {
    'sentence': [],
    'hypothesis': [],
    'label': []
}

test_idx = set(test_idx)

for i in range(len(full_samples['sentence'])):
    curr_idx = full_samples['index'][i]
    
    if curr_idx in test_idx:
        test_samples['sentence'].append(full_samples['sentence'][i])
        test_samples['hypothesis'].append(full_samples['hypothesis'][i])
        test_samples['label'].append(full_samples['label'][i])

In [13]:
test_samples['hypothesis'][0:4]

['The meaning of [take] is to carry out.',
 'The meaning of [take] is to take something or somebody with oneself somewhere.',
 'The meaning of [take] is to be stricken by an illness, fall victim to an illness.',
 'The meaning of [take] is to develop a habit.']

In [14]:
classifier = transformers.pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0)

In [15]:
answers = []
for i in range(len(test_samples['sentence'])):
  if i % 4 == 0:
    sequence_to_classify = test_samples['sentence'][i]

    candidate_labels = test_samples['hypothesis'][i:i+4]

    sen = sequence_to_classify.split(" [")

    try:
      sen = sen[1].split("] ")
    except(IndexError):
      break

    wword = sen[0]

    hypothesis_template = "{}"

    output = classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

    label_list = []
    for label, score in zip(output['labels'], output['scores']):
        label_list.append(label)
    answers.append(label_list[0])



In [16]:
print(answers)
print(type(answers))
print(len(answers))

['The meaning of [take] is to carry out.', 'The meaning of [steps] is any maneuver made as part of progress toward a goal.', 'The meaning of [said] is to express in words.', 'The meaning of [realize] is to be fully aware or cognizant of.', 'The meaning of [distribution] is the act of distributing or spreading or apportioning.', 'The meaning of [program] is a series of steps to be carried out or goals to be accomplished.', 'The meaning of [matters] is some situation or event that is thought about.', 'The meaning of [filed] is to record in a public office or in a court of law.', 'The meaning of [suit] is a comprehensive term for any proceeding in a court of law whereby an individual seeks a legal remedy.', 'The meaning of [charged] is to make an accusatory claim.', 'The meaning of [mental] is of or relating to the mind.', 'The meaning of [have] is to have or possess, either in a concrete or an abstract sense.', 'The meaning of [became] is to enter or assume a certain state or condition.'

In [17]:
print(candidate_labels)
print(hypothesis_template)

['The meaning of [Extension] is act of expanding in scope; making more widely available.', 'The meaning of [Extension] is an addition to the length of something.', 'The meaning of [Extension] is the most direct or specific meaning of a word or expression; the class of objects that an expression refers to.', 'The meaning of [Extension] is an additional telephone set that is connected to the same telephone line.']
{}


In [18]:
correct_ref = []
for j in range(len(test_samples['hypothesis'])):
  if j % 4 == 0:
    print(test_samples['hypothesis'][j])
    correct_ref.append(test_samples['hypothesis'][j])

  if j > 466:
    break

The meaning of [take] is to carry out.
The meaning of [steps] is any maneuver made as part of progress toward a goal.
The meaning of [said] is to express in words.
The meaning of [realize] is to be fully aware or cognizant of.
The meaning of [distribution] is the spatial or geographic property of being scattered about over a range, area, or volume.
The meaning of [program] is a series of steps to be carried out or goals to be accomplished.
The meaning of [matters] is some situation or event that is thought about.
The meaning of [filed] is to record in a public office or in a court of law.
The meaning of [suit] is a comprehensive term for any proceeding in a court of law whereby an individual seeks a legal remedy.
The meaning of [charged] is to file a formal charge against.
The meaning of [mental] is of or relating to the mind.
The meaning of [have] is to have or possess, either in a concrete or an abstract sense.
The meaning of [became] is to undergo a change or development.
The meanin

In [19]:
print(correct_ref)
print(type(correct_ref))
print(len(correct_ref))

['The meaning of [take] is to carry out.', 'The meaning of [steps] is any maneuver made as part of progress toward a goal.', 'The meaning of [said] is to express in words.', 'The meaning of [realize] is to be fully aware or cognizant of.', 'The meaning of [distribution] is the spatial or geographic property of being scattered about over a range, area, or volume.', 'The meaning of [program] is a series of steps to be carried out or goals to be accomplished.', 'The meaning of [matters] is some situation or event that is thought about.', 'The meaning of [filed] is to record in a public office or in a court of law.', 'The meaning of [suit] is a comprehensive term for any proceeding in a court of law whereby an individual seeks a legal remedy.', 'The meaning of [charged] is to file a formal charge against.', 'The meaning of [mental] is of or relating to the mind.', 'The meaning of [have] is to have or possess, either in a concrete or an abstract sense.', 'The meaning of [became] is to under

In [20]:
print(test_samples['hypothesis'][:10])

['The meaning of [take] is to carry out.', 'The meaning of [take] is to take something or somebody with oneself somewhere.', 'The meaning of [take] is to be stricken by an illness, fall victim to an illness.', 'The meaning of [take] is to develop a habit.', 'The meaning of [steps] is any maneuver made as part of progress toward a goal.', "The meaning of [steps] is a solid block joined to the beams in which the heel of a ship's mast or capstan is fixed.", 'The meaning of [steps] is support consisting of a place to rest the foot while ascending or descending a stairway.', 'The meaning of [steps] is a musical interval of two semitones.', 'The meaning of [said] is to express in words.', 'The meaning of [said] is to give instructions to or direct somebody to do something with authority.']


In [21]:
correct = 0
wrong = 0

for n in range(len(correct_ref)):
  if correct_ref[n] == answers[n]:
    print(correct_ref[n])
    print(answers[n])
    correct += 1
  else:
    print("🟢", correct_ref[n])
    print("❌", answers[n])
    print("")
    wrong += 1

The meaning of [take] is to carry out.
The meaning of [take] is to carry out.
The meaning of [steps] is any maneuver made as part of progress toward a goal.
The meaning of [steps] is any maneuver made as part of progress toward a goal.
The meaning of [said] is to express in words.
The meaning of [said] is to express in words.
The meaning of [realize] is to be fully aware or cognizant of.
The meaning of [realize] is to be fully aware or cognizant of.
🟢 The meaning of [distribution] is the spatial or geographic property of being scattered about over a range, area, or volume.
❌ The meaning of [distribution] is the act of distributing or spreading or apportioning.

The meaning of [program] is a series of steps to be carried out or goals to be accomplished.
The meaning of [program] is a series of steps to be carried out or goals to be accomplished.
The meaning of [matters] is some situation or event that is thought about.
The meaning of [matters] is some situation or event that is thought a

In [22]:
print("Correct:", correct)
print("Wrong  :", wrong)

Correct: 66
Wrong  : 51


## Test results before fine-tuning

In [23]:
# accuracy calculation

accuracy = (correct / (correct + wrong)) * 100
# print(accuracy)
accuracy = str(round(accuracy, 2))

print("")
print("----------- TEST RESULT -----------")
print("Model    : Facebook Bart Large Mnli")
print("State    : Before fine-tuning")
print("Accuracy : " + accuracy + "%")


----------- TEST RESULT -----------
Model    : Facebook Bart Large Mnli
State    : Before fine-tuning
Accuracy : 56.41%


# Fine-tuning

In [24]:
train_samples = {
    'sentence': [],
    'hypothesis': [],
    'label': []
}

train_samples01 = {
    'sentence': [],
    'hypothesis': [],
    'label': []
}

train_idx = set(train_idx)

for i in range(len(full_samples['sentence'])):
    curr_idx = full_samples['index'][i]
    
    if curr_idx in train_idx:
        train_samples01['sentence'].append(full_samples['sentence'][i])
        train_samples01['hypothesis'].append(full_samples['hypothesis'][i])
        train_samples01['label'].append(full_samples['label'][i])

        if len(train_samples01['sentence']) > 40000:
            train_samples['sentence'].append(full_samples['sentence'][i])
            train_samples['hypothesis'].append(full_samples['hypothesis'][i])
            train_samples['label'].append(full_samples['label'][i])
            
        if len(train_samples['sentence']) > 70000:
            break

In [25]:
len(train_samples['sentence'])
train_samples['sentence'][:8]

["Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
 "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
 "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
 "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
 'Commodity loans, operating capital loans, and [facility] loans.',
 'Commodity loans, operating capital loans, and [facility] loans.',
 'Commodity loans, operating capital loans, and [facility] loans.',
 'Commodity loans, operating capital loans, and [facility] loans.']

In [26]:
len(train_samples['hypothesis'])
train_samples['hypothesis'][:8]

['The meaning of [made] is to give certain properties to something.',
 'The meaning of [made] is to compel or make somebody or something to act in a certain way.',
 'The meaning of [made] is to create or design, often in a certain way.',
 'The meaning of [made] is to give rise to; cause to happen or occur, not always intentionally.',
 'The meaning of [facility] is a building or place that provides a particular service or is used for a particular industry.',
 'The meaning of [facility] is a natural effortlessness; ; --Jane Austen.',
 'The meaning of [facility] is skillful performance or ability without difficulty.',
 'The meaning of [facility] is something designed and created to serve a particular function and to afford a particular convenience or service.']

In [27]:
len(train_samples['label'])
train_samples['label'][:8]

[2, 0, 0, 0, 2, 0, 0, 0]

In [28]:
samples = train_samples

In [29]:
raw_datasets = Dataset.from_dict(samples)

In [30]:
raw_datasets[:10]

{'sentence': ["Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
  "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
  "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
  "Three distinct classes of loans are [made] available to farmers' cooperatives by the Banks for Cooperatives:",
  'Commodity loans, operating capital loans, and [facility] loans.',
  'Commodity loans, operating capital loans, and [facility] loans.',
  'Commodity loans, operating capital loans, and [facility] loans.',
  'Commodity loans, operating capital loans, and [facility] loans.',
  'To [be] eligible to borrow from a Bank for Cooperatives, a cooperative must be an association in which farmers act together in processing and marketing farm products, purchasing farm supplies, or furnishing farm business services, and must meet the requ

In [31]:
def tokenize_function(example):
  return classifier.tokenizer(example['sentence'], example['hypothesis'], truncation=True)

tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=classifier.tokenizer)

Map:   0%|          | 0/70001 [00:00<?, ? examples/s]

In [32]:
training_args = TrainingArguments("test-trainer")

trainer = Trainer(
    classifier.model, # โมเดลที่ผ่านการเพิ่ม classifier head
    training_args, # parameter จาก hugging face
    train_dataset=tokenized_datasets,
    data_collator=data_collator, # ตัวสร้าง padding
    tokenizer=classifier.tokenizer,
)

## Training

In [33]:
trainer.train()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.4644
1000,0.3786
1500,0.3484
2000,0.3249
2500,0.2575
3000,0.2245
3500,0.2112
4000,0.2023
4500,0.1736
5000,0.1134




TrainOutput(global_step=6564, training_loss=0.23076320544457304, metrics={'train_runtime': 4671.7504, 'train_samples_per_second': 44.952, 'train_steps_per_second': 1.405, 'total_flos': 4.574634705334184e+16, 'train_loss': 0.23076320544457304, 'epoch': 3.0})

In [34]:
classifier.save_pretrained('./simple_trained_wsd_pipeline')

In [35]:
!ls -lh ./simple_trained_wsd_pipeline

total 1.6G
-rw-r--r-- 1 root root 1.3K May 15 21:17 config.json
-rw-r--r-- 1 root root 446K May 15 21:18 merges.txt
-rw-r--r-- 1 root root 1.6G May 15 21:18 pytorch_model.bin
-rw-r--r-- 1 root root  280 May 15 21:18 special_tokens_map.json
-rw-r--r-- 1 root root 2.1M May 15 21:18 tokenizer.json
-rw-r--r-- 1 root root  349 May 15 21:18 tokenizer_config.json
-rw-r--r-- 1 root root 780K May 15 21:18 vocab.json


# Re-test the model

In [44]:
# !pip install transformers
# import transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [45]:
loaded_classifier = transformers.pipeline("zero-shot-classification", model="./simple_trained_wsd_pipeline", device=0)

In [46]:
answers_trained = []
for i in range(len(test_samples['sentence'])):
  if i % 4 == 0:
    sequence_to_classify = test_samples['sentence'][i]

    candidate_labels = test_samples['hypothesis'][i:i+4]

    sen = sequence_to_classify.split(" [")

    try:
      sen = sen[1].split("] ")
    except(IndexError):
      break

    wword = sen[0]

    hypothesis_template = "{}"

    output = loaded_classifier(sequence_to_classify, candidate_labels, hypothesis_template=hypothesis_template)

    label_list = []
    for label, score in zip(output['labels'], output['scores']):
        label_list.append(label)
    answers_trained.append(label_list[0])



In [47]:
print(answers_trained)
print(type(answers_trained))
print(len(answers_trained))

['The meaning of [take] is to carry out.', 'The meaning of [steps] is any maneuver made as part of progress toward a goal.', 'The meaning of [said] is to express in words.', 'The meaning of [realize] is to perceive (an idea or situation) mentally.', 'The meaning of [distribution] is the act of distributing or spreading or apportioning.', 'The meaning of [program] is a series of steps to be carried out or goals to be accomplished.', 'The meaning of [matters] is some situation or event that is thought about.', 'The meaning of [filed] is to record in a public office or in a court of law.', 'The meaning of [suit] is a comprehensive term for any proceeding in a court of law whereby an individual seeks a legal remedy.', 'The meaning of [charged] is to file a formal charge against.', 'The meaning of [mental] is involving the mind or an intellectual process.', 'The meaning of [have] is to have or possess, either in a concrete or an abstract sense.', 'The meaning of [became] is to undergo a cha

In [48]:
correct_trained = 0
wrong_trained = 0

for n in range(len(correct_ref)):
  if correct_ref[n] == answers_trained[n]:
    print(correct_ref[n])
    print(answers_trained[n])
    correct_trained += 1
  else:
    print("🟢", correct_ref[n])
    print("❌", answers_trained[n])
    print("")
    wrong_trained += 1

The meaning of [take] is to carry out.
The meaning of [take] is to carry out.
The meaning of [steps] is any maneuver made as part of progress toward a goal.
The meaning of [steps] is any maneuver made as part of progress toward a goal.
The meaning of [said] is to express in words.
The meaning of [said] is to express in words.
🟢 The meaning of [realize] is to be fully aware or cognizant of.
❌ The meaning of [realize] is to perceive (an idea or situation) mentally.

🟢 The meaning of [distribution] is the spatial or geographic property of being scattered about over a range, area, or volume.
❌ The meaning of [distribution] is the act of distributing or spreading or apportioning.

The meaning of [program] is a series of steps to be carried out or goals to be accomplished.
The meaning of [program] is a series of steps to be carried out or goals to be accomplished.
The meaning of [matters] is some situation or event that is thought about.
The meaning of [matters] is some situation or event th

In [49]:
print("Correct:", correct)
print("Wrong  :", wrong)

Correct: 66
Wrong  : 51


## Test result after fine-tuning

In [50]:
# accuracy calculation

accuracy_trained = (correct_trained / (correct_trained + wrong_trained)) * 100
# print(accuracy_trained)
accuracy_trained = str(round(accuracy_trained, 2))

print("")
print("----------- TEST RESULT -----------")
print("Model    : ./simple_trained_wsd_pipeline")
print("State    : After fine-tuning")
print("Accuracy : " + accuracy_trained + "%")


----------- TEST RESULT -----------
Model    : ./simple_trained_wsd_pipeline
State    : After fine-tuning
Accuracy : 75.21%
