# Fine-Tuning, Cross-Validation and PBT algorithm for BERT

Note: the PBT algorithm took a while to execute on Colab Pro.

All file paths have been commented out; every cell in which a path needs to be replaced is preceded by a large text cell saying so.

In [None]:
#older version of pickle installed due to compatability issues with Ray PBT algorithm
!pip install pickle5
import pickle5 as pickle

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.12-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (256 kB)
[K     |████████████████████████████████| 256 kB 14.7 MB/s 
[?25hInstalling collected packages: pickle5
Successfully installed pickle5-0.0.12


In [None]:
!pip install datasets seqeval transformers "ray[tune]" scipy sklearn torch

In [None]:
import torch
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset, load_metric
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from sklearn.model_selection import train_test_split
import json
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def _char_classes(sentence, entities, class_index):
    """Return, for every character of ``sentence``, the class index covering it (or None)."""
    char_class = [None] * len(sentence)
    for entity in entities:
        start, end, ent = entity[0], entity[1], entity[2]
        if ent not in class_index:
            raise ValueError("entity label {!r} is not listed in json['classes']".format(ent))
        for i in range(start, end):
            # Spaces are never marked, so they keep acting as word separators.
            if sentence[i] != ' ':
                char_class[i] = class_index[ent]
    return char_class


def _word_tags(sentence, char_class, classes):
    """Tag every item of ``sentence.split(" ")`` with 'B-<class>' or 'O'."""
    tags = []
    pos = 0
    for word in sentence.split(" "):
        present = [c for c in char_class[pos:pos + len(word)] if c is not None]
        # A word touched by several classes takes the one with the lowest index.
        tags.append('B-' + classes[min(present)] if present else 'O')
        pos += len(word) + 1  # skip the word and the single space that follows it
    return tags


def _begin_to_inside(tags):
    """Turn a 'B-X' that directly follows a 'B-X' / 'I-X' into 'I-X' (in place)."""
    # Start at 1: the first word has no predecessor, so it is never an 'I-' tag.
    for i in range(1, len(tags)):
        previous, current = tags[i - 1], tags[i]
        if current != 'O' and previous != 'O' and previous[2:] == current[2:]:
            tags[i] = 'I-' + current[2:]
    return tags


def tag_creator1(json):
    """Convert character-span entity annotations into word-level BIO tags.

    Parameters
    ----------
    json : dict
        Parsed annotation file (the parameter name shadows the ``json`` module
        inside this function only). Two keys are read:
        ``json['classes']`` - list of entity class names;
        ``json['annotations']`` - sequence of ``[sentence, {'entities': [[start, end, label], ...]}]``
        items, where ``start``/``end`` are character offsets and ``end`` is exclusive.

    Returns
    -------
    list of list of str
        One list per sentence, with exactly one tag per item of
        ``sentence.split(" ")``: ``'O'``, ``'B-<class>'`` or ``'I-<class>'``.

    Raises
    ------
    ValueError
        If an entity label is not present in ``json['classes']``.
    IndexError
        If an entity span runs past the end of its sentence.

    Notes
    -----
    * A word is tagged as soon as any of its non-space characters lies inside
      an entity span; when spans overlap, the later span wins per character.
    * Consecutive words of the same class form one B/I run, even when they
      come from two separate annotated spans.
    * Works for any number of classes and does not rely on notebook globals.
    """
    classes = json['classes']
    class_index = {}
    for position, name in enumerate(classes):
        class_index.setdefault(name, position)  # first occurrence wins on duplicates

    tagslist = []
    for annotation in json['annotations']:
        sentence, entdict = annotation[0], annotation[1]
        char_class = _char_classes(sentence, entdict['entities'], class_index)
        tagslist.append(_begin_to_inside(_word_tags(sentence, char_class, classes)))
    return tagslist


In [None]:
# Entity classes in label-id order: class k maps to id 2k+1 ('B-') and 2k+2 ('I-'); 'O' is 0.
_TAG_CLASSES = (
    'OLS GLM',
    'MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)',
    'BAYES FACTOR, BAYESIAN ESTIMATION',
    'NON-PARAMETRIC TESTS',
    'ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)',
    'SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA',
    'OTHER',
    'CHI-SQUARE, FISHER EXACT',
)
_TAG_IDS = {'O': 0}
for _position, _cls in enumerate(_TAG_CLASSES):
    _TAG_IDS['B-' + _cls] = 2 * _position + 1
    _TAG_IDS['I-' + _cls] = 2 * _position + 2


def tags_convert2(tag_list):
  """Map BIO tag strings to their integer label ids.

  Parameters
  ----------
  tag_list : iterable of str
      Tags such as ``'O'``, ``'B-OLS GLM'`` or ``'I-OTHER'``.

  Returns
  -------
  list of int
      One id per input tag, in the same order (``'O'`` -> 0, then B/I pairs
      1-16 in the class order of ``_TAG_CLASSES``).

  Raises
  ------
  ValueError
      If a tag is not in the table. Unknown tags used to be dropped silently,
      which shortened the output and shifted every later label off its word.
  """
  out = []
  for tag in tag_list:
    if tag not in _TAG_IDS:
      raise ValueError("unknown tag {!r}".format(tag))
    out.append(_TAG_IDS[tag])
  return out

# Pre-Processing

# **REPLACE DIRECTORY HERE**

In [None]:
# Replace each "PATH/TO/..." placeholder with the location of that JSON file from the zip archive.
# The files may be assigned in any order, but the pd.read_json cells further down must use the
# same order, because sentences and tags are later paired by position.
# `with` closes every file as soon as it has been parsed.
with open("PATH/TO/anno_mart.json", encoding="utf-8") as f:  # e.g. "/content/drive/MyDrive/Colab Notebooks/anno_mart.json"
    TRAIN_DATA = json.load(f)
with open("PATH/TO/annotations_18_08_2022.json", encoding="utf-8") as g:  # e.g. "/content/drive/MyDrive/Colab Notebooks/annotations_18_08_2022.json"
    TRAIN_DATA1 = json.load(g)
with open("PATH/TO/annotations_22_08_2022.json", encoding="utf-8") as h:  # e.g. "/content/drive/MyDrive/Colab Notebooks/annotations_22_08_2022.json"
    TRAIN_DATA2 = json.load(h)
with open("PATH/TO/annotations_27_08_2022.json", encoding="utf-8") as i:  # e.g. "/content/drive/MyDrive/Colab Notebooks/annotations_27_08_2022.json"
    TRAIN_DATA3 = json.load(i)

In [None]:
# Pair every entity class of the first annotation file with its position in the class list.
coded_entity_types = [(code, name) for code, name in enumerate(TRAIN_DATA['classes'])]
# The non-entity tag 'O' is given the fixed code 8.
O = [(8, 'O')]
coded_with_O = [*coded_entity_types, *O]

In [None]:
# ('@<code>', class name) pairs: the short '@n' markers stand in for the entity classes.
at_ent_tups = [('@' + str(code), name) for code, name in coded_entity_types]

at_ent_tups

[('@0', 'OLS GLM'),
 ('@1', 'MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)'),
 ('@2', 'BAYES FACTOR, BAYESIAN ESTIMATION'),
 ('@3', 'NON-PARAMETRIC TESTS'),
 ('@4',
  'ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)'),
 ('@5', 'SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA'),
 ('@6', 'OTHER'),
 ('@7', 'CHI-SQUARE, FISHER EXACT')]

In [None]:
# Label vocabulary: 'O' first, then a 'B-'/'I-' pair per entity class in class order,
# so the position of a label in this list is its integer id.
label_list = ["O"]
for _, class_name in coded_entity_types:
  label_list.extend(("B-" + class_name, "I-" + class_name))
label_list

['O',
 'B-OLS GLM',
 'I-OLS GLM',
 'B-MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)',
 'I-MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)',
 'B-BAYES FACTOR, BAYESIAN ESTIMATION',
 'I-BAYES FACTOR, BAYESIAN ESTIMATION',
 'B-NON-PARAMETRIC TESTS',
 'I-NON-PARAMETRIC TESTS',
 'B-ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)',
 'I-ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)',
 'B-SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA',
 'I-SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA',
 'B-OTHER',
 'I-OTHER',
 'B-CHI-SQUARE, FISHER EXACT',
 'I-CHI-SQUARE, FISHER EXACT']

In [None]:
# (id, label) pairs for the whole label vocabulary.
full_ent_codes = [(label_id, tag) for label_id, tag in enumerate(label_list)]
full_ent_codes

[(0, 'O'),
 (1, 'B-OLS GLM'),
 (2, 'I-OLS GLM'),
 (3, 'B-MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)'),
 (4, 'I-MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)'),
 (5, 'B-BAYES FACTOR, BAYESIAN ESTIMATION'),
 (6, 'I-BAYES FACTOR, BAYESIAN ESTIMATION'),
 (7, 'B-NON-PARAMETRIC TESTS'),
 (8, 'I-NON-PARAMETRIC TESTS'),
 (9,
  'B-ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)'),
 (10,
  'I-ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)'),
 (11, 'B-SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA'),
 (12, 'I-SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA'),
 (13, 'B-OTHER'),
 (14, 'I-OTHER'),
 (15, 'B-CHI-SQUARE, FISHER EXACT'),
 (16, 'I-CHI-SQUARE, FISHER EXACT')]

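# **REPLACE DIRECTORY HERE** - for each dataframe, assign the path of one of the JSON files. The order is free, but it must be the same order in which the files were loaded into `TRAIN_DATA` ... `TRAIN_DATA3` above, because sentences and tags are later paired by position.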

In [None]:
# Replace the placeholder with the path of the file that was loaded into TRAIN_DATA.
# Example: "/content/drive/MyDrive/Colab Notebooks/anno_mart.json"
df = pd.read_json("PATH/TO/anno_mart.json", orient='index')
# Row 1 of the index-oriented frame holds the [sentence, {'entities': [...]}] pairs.
df = df.iloc[1]
sentences = [pair[0] for pair in df]
entity_dicts = [pair[1] for pair in df]

new = {"sentences": sentences, "entities": entity_dicts}

df = pd.DataFrame(new)
df.head()

Unnamed: 0,sentences,entities
0,To explore longitudinal trends in cognitive ab...,"{'entities': [[126, 157, 'MIXED EFFECTS, MLMS,..."
1,Regression slopes for all groups did not reach...,"{'entities': [[0, 17, 'OLS GLM']]}"
2,"Within these analyses, the percent of particip...","{'entities': [[210, 226, 'CHI-SQUARE, FISHER E..."
3,Statistical differences across outcome groups ...,"{'entities': [[68, 104, 'OLS GLM'], [109, 141,..."
4,Because the population distribution of RVC is ...,"{'entities': [[81, 100, 'NON-PARAMETRIC TESTS']]}"


In [None]:
# Replace the placeholder with the path of the file that was loaded into TRAIN_DATA1.
# Example: "/content/drive/MyDrive/Colab Notebooks/annotations_18_08_2022.json"
dfx = pd.read_json("PATH/TO/annotations_18_08_2022.json", orient='index')
# Row 1 of the index-oriented frame holds the [sentence, {'entities': [...]}] pairs.
dfx = dfx.iloc[1]
sentences = [pair[0] for pair in dfx]
entity_dicts = [pair[1] for pair in dfx]

new = {"sentences": sentences, "entities": entity_dicts}

dfx = pd.DataFrame(new)
dfx.tail()

Unnamed: 0,sentences,entities
98,"For any other MHP, the logistic regression mod...","{'entities': [[23, 48, 'MIXED EFFECTS, MLMS, G..."
99,Cox regression was used to compare the risk of...,"{'entities': [[0, 14, 'MIXED EFFECTS, MLMS, GR..."
100,The differences between this group and the sta...,"{'entities': [[115, 136, 'NON-PARAMETRIC TESTS..."
101,"This subscale included three items namely, “su...",{'entities': []}
102,Descriptive analysis involved means and standa...,"{'entities': [[84, 112, 'OLS GLM']]}"


In [None]:
# Replace the placeholder with the path of the file that was loaded into TRAIN_DATA2.
# Example: "/content/drive/MyDrive/Colab Notebooks/annotations_22_08_2022.json"
dfy = pd.read_json("PATH/TO/annotations_22_08_2022.json", orient='index')
# Row 1 of the index-oriented frame holds the [sentence, {'entities': [...]}] pairs.
dfy = dfy.iloc[1]
sentences = [pair[0] for pair in dfy]
entity_dicts = [pair[1] for pair in dfy]

new = {"sentences": sentences, "entities": entity_dicts}

dfy = pd.DataFrame(new)
dfy.tail()

Unnamed: 0,sentences,entities
87,We followed an iterative process to build the ...,"{'entities': [[46, 66, 'MIXED EFFECTS, MLMS, G..."
88,Adolescent sex was included in all hierarchica...,"{'entities': [[35, 67, 'OLS GLM'], [72, 104, '..."
89,The pre-treatment score for the outcome variab...,"{'entities': [[72, 91, 'OLS GLM'], [126, 150, ..."
90,Logistic regression analyses were conducted to...,"{'entities': [[0, 28, 'MIXED EFFECTS, MLMS, GR..."
91,A chi-square analysis revealed that the differ...,"{'entities': [[2, 21, 'CHI-SQUARE, FISHER EXAC..."


In [None]:
# Replace the placeholder with the path of the file that was loaded into TRAIN_DATA3.
# Example: "/content/drive/MyDrive/Colab Notebooks/annotations_27_08_2022.json"
dfz = pd.read_json("PATH/TO/annotations_27_08_2022.json", orient='index')
# Row 1 of the index-oriented frame holds the [sentence, {'entities': [...]}] pairs.
dfz = dfz.iloc[1]
sentences = [pair[0] for pair in dfz]
entity_dicts = [pair[1] for pair in dfz]

new = {"sentences": sentences, "entities": entity_dicts}

dfz = pd.DataFrame(new)
dfz.tail()

Unnamed: 0,sentences,entities
297,"n the current study, a new 2-factor analysis o...","{'entities': [[27, 44, 'SCALE DEVELOPMENT, FAC..."
298,"A series of analyses of variance, discriminant...","{'entities': [[12, 32, 'OLS GLM'], [34, 64, 'S..."
299,"Moreover, results of factor and discriminant f...","{'entities': [[32, 62, 'SCALE DEVELOPMENT, FAC..."
300,All symptom items for both disorders were ente...,"{'entities': [[57, 72, 'SCALE DEVELOPMENT, FAC..."
301,"Similarly, when entered into a factor analysis...","{'entities': [[31, 46, 'SCALE DEVELOPMENT, FAC..."


In [None]:
# Stack the four per-file frames into one, renumbering the rows 0..N-1.
# NOTE: `df` is overwritten, so running this cell twice appends dfx/dfy/dfz a second time.
per_file_frames = [df, dfx, dfy, dfz]
df = pd.concat(per_file_frames, ignore_index=True)
df

Unnamed: 0,sentences,entities
0,To explore longitudinal trends in cognitive ab...,"{'entities': [[126, 157, 'MIXED EFFECTS, MLMS,..."
1,Regression slopes for all groups did not reach...,"{'entities': [[0, 17, 'OLS GLM']]}"
2,"Within these analyses, the percent of particip...","{'entities': [[210, 226, 'CHI-SQUARE, FISHER E..."
3,Statistical differences across outcome groups ...,"{'entities': [[68, 104, 'OLS GLM'], [109, 141,..."
4,Because the population distribution of RVC is ...,"{'entities': [[81, 100, 'NON-PARAMETRIC TESTS']]}"
...,...,...
609,"n the current study, a new 2-factor analysis o...","{'entities': [[27, 44, 'SCALE DEVELOPMENT, FAC..."
610,"A series of analyses of variance, discriminant...","{'entities': [[12, 32, 'OLS GLM'], [34, 64, 'S..."
611,"Moreover, results of factor and discriminant f...","{'entities': [[32, 62, 'SCALE DEVELOPMENT, FAC..."
612,All symptom items for both disorders were ente...,"{'entities': [[57, 72, 'SCALE DEVELOPMENT, FAC..."


In [None]:
# Word-level BIO tags for each annotation file ...
tags = tag_creator1(TRAIN_DATA)
tagsx = tag_creator1(TRAIN_DATA1)
tagsy = tag_creator1(TRAIN_DATA2)
tagsz = tag_creator1(TRAIN_DATA3)
# ... converted to integer ids and flattened into one list, file after file.
tags1 = [
  tags_convert2(sentence_tags)
  for file_tags in (tags, tagsx, tagsy, tagsz)
  for sentence_tags in file_tags
]

In [None]:
# Sentence text -> list of tag ids, paired by position (row k of df with tags1[k]).
# If a sentence occurs more than once, the tag list of its last occurrence is kept.
# Built with dict(zip(...)) so that no loop variable overwrites the notebook-level `tags`.
sentence_taglist_dict = dict(zip(df['sentences'], tags1))

In [None]:
def label(row):
  """Return the list of tag ids recorded for the sentence ``row``.

  The sentence is looked up directly in the notebook-level
  ``sentence_taglist_dict`` (one hash lookup instead of scanning every
  sentence of ``df`` for each call). Returns ``None`` when the sentence has
  no entry.
  """
  return sentence_taglist_dict.get(row)

In [None]:
# Attach the tag ids by position: tags1 was built file after file in the same order as the rows
# of df, so row k takes tags1[k]. A lookup keyed by sentence text would hand every duplicate
# sentence the tag list of its last occurrence.
assert len(tags1) == len(df), "tags1 and df must describe the same sentences in the same order"
df['Coded'] = pd.Series(tags1, index=df.index)

In [None]:
def _split_on_spaces(sentence):
  """Split on single spaces, the same rule the tags were produced with."""
  return sentence.split(" ")

# NOTE: the column is replaced in place, so this cell can only be run once per fresh `df`.
df['sentences'] = df['sentences'].map(_split_on_spaces)
df

Unnamed: 0,sentences,entities,Coded
0,"[To, explore, longitudinal, trends, in, cognit...","{'entities': [[126, 157, 'MIXED EFFECTS, MLMS,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Regression, slopes, for, all, groups, did, no...","{'entities': [[0, 17, 'OLS GLM']]}","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Within, these, analyses,, the, percent, of, p...","{'entities': [[210, 226, 'CHI-SQUARE, FISHER E...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Statistical, differences, across, outcome, gr...","{'entities': [[68, 104, 'OLS GLM'], [109, 141,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 15,..."
4,"[Because, the, population, distribution, of, R...","{'entities': [[81, 100, 'NON-PARAMETRIC TESTS']]}","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, ..."
...,...,...,...
609,"[n, the, current, study,, a, new, 2-factor, an...","{'entities': [[27, 44, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0..."
610,"[A, series, of, analyses, of, variance,, discr...","{'entities': [[12, 32, 'OLS GLM'], [34, 64, 'S...","[0, 0, 0, 1, 2, 2, 11, 12, 12, 0, 11, 12, 0, 0..."
611,"[Moreover,, results, of, factor, and, discrimi...","{'entities': [[32, 62, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 11, 12, 12, 0, 0, 0, 0, 0, 0, ..."
612,"[All, symptom, items, for, both, disorders, we...","{'entities': [[57, 72, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0..."


In [None]:
# Display the last rows of df to check the end of the table.
df.tail()

Unnamed: 0,sentences,entities,Coded
609,"[n, the, current, study,, a, new, 2-factor, an...","{'entities': [[27, 44, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0..."
610,"[A, series, of, analyses, of, variance,, discr...","{'entities': [[12, 32, 'OLS GLM'], [34, 64, 'S...","[0, 0, 0, 1, 2, 2, 11, 12, 12, 0, 11, 12, 0, 0..."
611,"[Moreover,, results, of, factor, and, discrimi...","{'entities': [[32, 62, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 11, 12, 12, 0, 0, 0, 0, 0, 0, ..."
612,"[All, symptom, items, for, both, disorders, we...","{'entities': [[57, 72, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0..."
613,"[Similarly,, when, entered, into, a, factor, a...","{'entities': [[31, 46, 'SCALE DEVELOPMENT, FAC...","[0, 0, 0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0, 0..."


In [None]:
# The raw span annotations are no longer needed once the 'Coded' column exists.
df = df.drop(columns=['entities'])

In [None]:
# 'Sentences' is the column name the tokenisation function reads.
df = df.rename({"sentences": "Sentences"}, axis="columns")
df

Unnamed: 0,Sentences,Coded
0,"[To, explore, longitudinal, trends, in, cognit...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Regression, slopes, for, all, groups, did, no...","[1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Within, these, analyses,, the, percent, of, p...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Statistical, differences, across, outcome, gr...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 15,..."
4,"[Because, the, population, distribution, of, R...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, ..."
...,...,...
609,"[n, the, current, study,, a, new, 2-factor, an...","[0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0, 0, 0, 0, 0..."
610,"[A, series, of, analyses, of, variance,, discr...","[0, 0, 0, 1, 2, 2, 11, 12, 12, 0, 11, 12, 0, 0..."
611,"[Moreover,, results, of, factor, and, discrimi...","[0, 0, 0, 0, 0, 11, 12, 12, 0, 0, 0, 0, 0, 0, ..."
612,"[All, symptom, items, for, both, disorders, we...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 11, 12, 0, 0, 0..."


In [None]:
#tds = Dataset.from_pandas(train)
#tsds = Dataset.from_pandas(test)
#vds = Dataset.from_pandas(validate)
#ds = DatasetDict()
#ds['train'] = tds
#ds['test'] = tsds
#ds['validate'] = vds

In [None]:
# 80/20 train/test split. The seed is fixed so that a fresh run reproduces the same split
# (and therefore comparable metrics); without it every run shuffles differently.
ds = Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=42)
ds

DatasetDict({
    train: Dataset({
        features: ['Sentences', 'Coded'],
        num_rows: 491
    })
    test: Dataset({
        features: ['Sentences', 'Coded'],
        num_rows: 123
    })
})

In [None]:
# Pretrained cased BERT: its tokenizer, plus the encoder with a new token-classification head
# that has one output per entry of label_list.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    num_labels=len(label_list),
)

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Downloading tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/426k [00:00<?, ?B/s]

In [None]:
# True: every sub-token of a word carries the word's label.
# False: only the first sub-token does, the others get -100.
label_all_tokens = True

def tokenize_and_align_labels(examples):
    """Tokenise pre-split sentences and expand the word labels to token labels.

    Reads ``examples["Sentences"]`` (lists of words) and ``examples["Coded"]``
    (one label id per word), and uses the notebook-level ``tokenizer`` and
    ``label_all_tokens``. Returns the tokenizer output with an added
    ``"labels"`` entry holding one label per token; tokens without a word id
    (special tokens) get -100.
    """
    tokenized_inputs = tokenizer(examples["Sentences"], truncation=True, is_split_into_words=True)

    labels = []
    for batch_idx, word_labels in enumerate(examples["Coded"]):
        label_ids = []
        last_word = None
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_idx):
            if word_idx is None:
                # No source word (special token): -100 so the loss ignores it.
                label_ids.append(-100)
            elif word_idx == last_word and not label_all_tokens:
                # Continuation piece of a word whose first piece is already labelled.
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_idx])
            last_word = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_datasets = ds.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
batch_size = 8

# Baseline training configuration; checkpoints and logs go to ./bert-finetune.
args = TrainingArguments(
    output_dir="bert-finetune",
    evaluation_strategy="epoch",   # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=20,
    weight_decay=0.01,
    logging_steps=1,               # log the training loss at every step
)

# Collates tokenised examples into batches for token classification.
data_collator = DataCollatorForTokenClassification(tokenizer)

# seqeval metric, used by compute_metrics.
metric = load_metric("seqeval")

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
def compute_metrics(p):
    """Compute overall seqeval precision / recall / F1 / accuracy.

    ``p`` unpacks into ``(predictions, labels)``. Predictions are reduced with
    argmax over axis 2; positions whose label is -100 are left out. Ids are
    translated to tag strings through the notebook-level ``label_list`` and
    scored with the notebook-level ``metric``.
    """
    scores, gold = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold):
        # Keep only the positions that carry a real label.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=1)
    return {name: results["overall_" + name] for name in ("precision", "recall", "f1", "accuracy")}

In [None]:
def model_init():
    """Load a new bert-base-cased token classifier with one output per entry of ``label_list``."""
    num_labels = len(label_list)
    return AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=num_labels)

In [None]:
# Trainer used for the hyperparameter search. Only `model_init` is passed: the search needs it
# to build a new model for every trial, and passing a `model` as well is deprecated (Trainer
# warns that it requires either `model` or `model_init`, not both).
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
#trainer.train()



# Population Based Training (PBT)

In [None]:
import ray
from ray import tune
from ray.tune.schedulers import PopulationBasedTraining
from ray.tune import CLIReporter

In [None]:
# Starting hyperparameters for every trial: fixed batch sizes, sampled number of epochs.
tune_config = dict(
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=tune.choice([8, 10, 12, 15]),
    #"max_steps": 1 if smoke_test else -1,  # Used for smoke test.
)

In [None]:
# Hyperparameters the scheduler is allowed to perturb, with the values it may draw from.
pbt_mutations = {
    "weight_decay": tune.uniform(0.0, 0.3),
    "learning_rate": tune.uniform(1e-5, 5e-5),
    "per_device_train_batch_size": [16, 32, 64],
}

# PBT scheduler: ranks trials by eval_f1 (higher is better), perturbing every 5 training iterations.
scheduler = PopulationBasedTraining(
    time_attr="training_iteration",
    metric="eval_f1",
    mode="max",
    perturbation_interval=5,
    hyperparam_mutations=pbt_mutations,
)

In [None]:
# Console progress table: short column headers for the hyperparameters, plus the tracked metrics.
reported_parameters = {
    "weight_decay": "w_decay",
    "learning_rate": "lr",
    "per_device_train_batch_size": "train_bs/gpu",
    "num_train_epochs": "num_epochs",
}
reported_metrics = ["eval_f1", "eval_loss", "epoch"]

reporter = CLIReporter(
    parameter_columns=reported_parameters,
    metric_columns=reported_metrics,
)

In [None]:
import multiprocessing

# Informational only: `cores` is not referenced again in the visible part of this notebook.
# The per-trial CPU budget is set separately in the hyperparameter search call.
cores = multiprocessing.cpu_count() # Count the number of cores in a computer
cores

4

In [None]:
# Population-based hyperparameter search through Ray Tune.
# `direction` and `compute_objective` are set explicitly: the defaults minimise the sum of all
# evaluation metrics, which would report the worst trial as the best run. The objective is now
# eval_f1, the same quantity the PBT scheduler ranks trials by, and it is maximised.
hpsearch = trainer.hyperparameter_search(
    hp_space=lambda _: tune_config,
    compute_objective=lambda metrics: metrics["eval_f1"],
    direction="maximize",
    backend="ray",
    n_trials=3,
    resources_per_trial={"cpu": 2, "gpu": 1},
    scheduler=scheduler,
    keep_checkpoints_num=1,
    checkpoint_score_attr="training_iteration",
    progress_reporter=reporter,
    reuse_actors=True,
    # Replace with your own results directory.
    local_dir="/content/drive/MyDrive/Colab Notebooks/",
    name="tune_transformer_pbt_BERT_1",
    log_to_file=True,
)

In [None]:
# Display the selected run: its id, objective value and hyperparameters.
hpsearch

BestRun(run_id='eda22_00001', objective=2.947333066645549, hyperparameters={'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 15, 'weight_decay': 0.17959754525911098, 'learning_rate': 1.624074561769746e-05})

BestRun(run_id='b7cde_00001', objective=2.6809938291042568, hyperparameters={'per_device_train_batch_size': 32, 'per_device_eval_batch_size': 32, 'num_train_epochs': 10, 'weight_decay': 0.1790550473839461, 'learning_rate': 2.783331011414365e-05})

# PBT Results

# Cross-Validation

In [None]:
#4-Fold pseudo CV: four independent 80/20 random splits (seeds 1-4) rather than disjoint folds
ds1, ds2, ds3, ds4 = (
    Dataset.from_pandas(df).train_test_split(test_size=0.2, seed=split_seed)
    for split_seed in (1, 2, 3, 4)
)
# Tokenise each split and align the word labels to tokens.
tds1, tds2, tds3, tds4 = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (ds1, ds2, ds3, ds4)
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
# Training configuration shared by all cross-validation runs.
# NOTE(review): these learning-rate / weight-decay values differ from the BestRun outputs shown
# above - presumably they come from another search run; confirm.
args = TrainingArguments(
    output_dir="bert-finetune",
    evaluation_strategy="epoch",
    learning_rate=3.887995089067299e-05,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=15,
    weight_decay=0.01692347370813,
    logging_steps=1,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def _train_and_score_fold(fold):
    """Fine-tune a new bert-base-cased model on fold["train"] and score it on fold["test"].

    Uses the notebook-level ``args``, ``data_collator``, ``tokenizer``,
    ``compute_metrics``, ``label_list`` and ``metric``. Returns
    ``(trainer, results)`` where ``results`` is the seqeval result dict
    (per-class and overall scores) for the test split.
    """
    fold_model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(label_list))
    fold_trainer = Trainer(
        fold_model,
        args,
        train_dataset=fold["train"],
        eval_dataset=fold["test"],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )
    fold_trainer.train()

    # Kept from the original cell: logs the final evaluation metrics of the fold.
    fold_trainer.evaluate()

    scores, gold, _ = fold_trainer.predict(fold["test"])
    predicted_ids = np.argmax(scores, axis=2)

    # Drop the positions labelled -100 and translate ids to tag strings.
    fold_predictions = [
        [label_list[p] for (p, l) in zip(predicted_row, gold_row) if l != -100]
        for predicted_row, gold_row in zip(predicted_ids, gold)
    ]
    fold_labels = [
        [label_list[l] for (p, l) in zip(predicted_row, gold_row) if l != -100]
        for predicted_row, gold_row in zip(predicted_ids, gold)
    ]
    return fold_trainer, metric.compute(predictions=fold_predictions, references=fold_labels, zero_division=1)


# Folds 1-4: the same procedure on each split, with a new model every time.
fold_results = []
for fold in (tds1, tds2, tds3, tds4):
    trainer, fold_result = _train_and_score_fold(fold)
    print(fold_result)
    fold_results.append(fold_result)

results_fold1, results_fold2, results_fold3, results_fold4 = fold_results
# As before, `model` ends up referring to the model trained on the last fold.
model = trainer.model

In [None]:
# Print the full seqeval result dict of every fold, one per line.
for fold_result in (results_fold1, results_fold2, results_fold3, results_fold4):
    print(fold_result)

{'BAYES FACTOR, BAYESIAN ESTIMATION': {'precision': 0.6222222222222222, 'recall': 0.7368421052631579, 'f1': 0.6746987951807228, 'number': 38}, 'CHI-SQUARE, FISHER EXACT': {'precision': 0.3333333333333333, 'recall': 0.5, 'f1': 0.4, 'number': 18}, 'MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)': {'precision': 0.7105263157894737, 'recall': 0.5625, 'f1': 0.627906976744186, 'number': 48}, 'NON-PARAMETRIC TESTS': {'precision': 0.6333333333333333, 'recall': 0.6785714285714286, 'f1': 0.6551724137931035, 'number': 28}, 'OLS GLM': {'precision': 0.5963302752293578, 'recall': 0.6074766355140186, 'f1': 0.6018518518518519, 'number': 107}, 'OTHER': {'precision': 1.0, 'recall': 0.0, 'f1': 0.0, 'number': 2}, 'ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)': {'precision': 0.5263157894736842, 'recall': 0.8333333333333334, 'f1': 0.6451612903225806, 'number': 24}, 'SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA': {'precision': 0.7058823529411765, 'recall': 0.72, 'f1': 0.

In [None]:
# Mean and (population) standard deviation of the overall scores across the four folds.
folds = [results_fold1, results_fold2, results_fold3, results_fold4]
CV_F1S = [fold['overall_f1'] for fold in folds]
CV_P = [fold['overall_precision'] for fold in folds]
CV_R = [fold['overall_recall'] for fold in folds]

for score_name, fold_scores in (("F1", CV_F1S), ("Precision", CV_P), ("Recall", CV_R)):
    print("Mean {} across folds: {}".format(score_name, np.mean(fold_scores)))
    print("{} Std across folds: {}".format(score_name, np.std(fold_scores)))

Mean F1 across folds: 0.6573959134876934
F1 Std across folds: 0.03420293054496009
Mean Precision across folds: 0.614629591627967
Precision Std across folds: 0.048248536276856734
Mean Recall across folds: 0.7097212878751025
Recall Std across folds: 0.041376035637473094


# Final Model

In [None]:
# Hyperparameters for the final fine-tuning run.
# NOTE(review): the learning rate and weight decay look like values selected by the
# PBT search mentioned in the notebook title — confirm against that search's best config.
final_hyperparameters = dict(
    evaluation_strategy="epoch",            # evaluate once per epoch
    learning_rate=3.887995089067299e-05,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=20,
    weight_decay=0.01692347370813,
    logging_steps=1,                        # log the training loss at every step
)

# Checkpoints and logs are written under the "bert-finetune" output directory.
args = TrainingArguments(output_dir="bert-finetune", **final_hyperparameters)

In [None]:
# Build the trainer for the final model: train on the "train" split and evaluate on
# the "test" split using the shared collator, tokenizer and metric function.
# NOTE(review): `model` is passed directly (no `model_init`), so whatever state the
# `model` variable holds at this point is what gets trained — confirm it is a freshly
# initialised model rather than one left over from an earlier cell.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the returned TrainOutput is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 491
  Num Epochs = 20
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 320


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4514,0.520835,1.0,0.0,0.0,0.897786
2,0.3057,0.330222,0.252964,0.22695,0.239252,0.919626
3,0.373,0.264402,0.328767,0.425532,0.370943,0.927351
4,0.0954,0.221452,0.450617,0.51773,0.481848,0.940722
5,0.0319,0.193571,0.507289,0.617021,0.5568,0.95231
6,0.1064,0.190736,0.550847,0.691489,0.613208,0.951567
7,0.0097,0.207786,0.598746,0.677305,0.635607,0.954836
8,0.0309,0.189667,0.591716,0.70922,0.645161,0.957361
9,0.0277,0.211334,0.563889,0.719858,0.632399,0.95231
10,0.0221,0.229223,0.585227,0.730496,0.649842,0.954242


The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 123
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 123
  Batch size = 32
The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
 

TrainOutput(global_step=320, training_loss=0.12153879325378511, metrics={'train_runtime': 230.618, 'train_samples_per_second': 42.581, 'train_steps_per_second': 1.388, 'total_flos': 638783669157000.0, 'train_loss': 0.12153879325378511, 'epoch': 20.0})

In [None]:
# Run an evaluation pass on the trainer's eval dataset (the return value is not kept).
trainer.evaluate()

# Raw model outputs and label ids for the test split; the third element is unused.
predictions, labels, _ = trainer.predict(tokenized_datasets["test"])
# Pick the highest-scoring class id along the last axis.
predictions = np.argmax(predictions, axis=2)

# Convert ids to label names, dropping every position whose label id is -100
# (presumably special/padding positions — confirm against the tokenisation step).
true_predictions = []
true_labels = []
for predicted_ids, gold_ids in zip(predictions, labels):
    kept_pairs = [(p, l) for p, l in zip(predicted_ids, gold_ids) if l != -100]
    true_predictions.append([label_list[p] for p, _l in kept_pairs])
    true_labels.append([label_list[l] for _p, l in kept_pairs])

# Score the filtered label sequences and print the resulting metrics dict.
results = metric.compute(predictions=true_predictions, references=true_labels, zero_division=1)
print(results)

The following columns in the evaluation set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 123
  Batch size = 32


The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: Coded, Sentences. If Coded, Sentences are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 123
  Batch size = 32


{'BAYES FACTOR, BAYESIAN ESTIMATION': {'precision': 0.6612903225806451, 'recall': 0.8367346938775511, 'f1': 0.7387387387387386, 'number': 49}, 'CHI-SQUARE, FISHER EXACT': {'precision': 0.3333333333333333, 'recall': 0.5263157894736842, 'f1': 0.4081632653061224, 'number': 19}, 'NON-PARAMETRIC TESTS': {'precision': 0.6451612903225806, 'recall': 0.7142857142857143, 'f1': 0.6779661016949152, 'number': 28}, 'OLS GLM': {'precision': 0.6060606060606061, 'recall': 0.6896551724137931, 'f1': 0.6451612903225807, 'number': 145}, 'ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)': {'precision': 0.5294117647058824, 'recall': 0.75, 'f1': 0.6206896551724139, 'number': 12}, 'SCALE DEVELOPMENT, FACTOR ANALYSIS, PCA': {'precision': 0.6428571428571429, 'recall': 0.9310344827586207, 'f1': 0.7605633802816901, 'number': 29}, 'overall_precision': 0.5965417867435159, 'overall_recall': 0.7340425531914894, 'overall_f1': 0.65818759936407, 'overall_accuracy': 0.9539444361907592}


In [None]:
# Display the final metrics dict as this cell's output.
# NOTE(review): the saved output of this cell lists classes (e.g. 'OTHER') and entity
# counts that differ from the dict printed by the previous cell, so it appears to come
# from a different run — re-run the notebook top to bottom to refresh it.
results

{'BAYES FACTOR, BAYESIAN ESTIMATION': {'precision': 0.6216216216216216,
  'recall': 0.6571428571428571,
  'f1': 0.6388888888888888,
  'number': 35},
 'CHI-SQUARE, FISHER EXACT': {'precision': 0.6666666666666666,
  'recall': 0.6363636363636364,
  'f1': 0.6511627906976744,
  'number': 22},
 'MIXED EFFECTS, MLMS, GROWTH MODELS, GENERALISED LMS (INC LOGREG)': {'precision': 0.6428571428571429,
  'recall': 0.6923076923076923,
  'f1': 0.6666666666666666,
  'number': 39},
 'NON-PARAMETRIC TESTS': {'precision': 0.7804878048780488,
  'recall': 0.8888888888888888,
  'f1': 0.8311688311688312,
  'number': 36},
 'OLS GLM': {'precision': 0.58,
  'recall': 0.5742574257425742,
  'f1': 0.5771144278606964,
  'number': 101},
 'OTHER': {'precision': 1.0, 'recall': 0.0, 'f1': 0.0, 'number': 2},
 'ROBUST MODELS (BOOTSTRAPPED OLS, HCSES, M-ESTIMATORS, ROBUST TRIMMING)': {'precision': 0.3409090909090909,
  'recall': 0.6521739130434783,
  'f1': 0.44776119402985076,
  'number': 23},
 'SCALE DEVELOPMENT, FACTOR A

In [None]:
# Persist the fine-tuned model together with its tokenizer so the directory is
# self-contained and can be reloaded on its own with `from_pretrained` / `pipeline`.
# Replace this path with your own output directory.
final_model_dir = "/content/drive/MyDrive/Colab Notebooks/scibert_cased-finetune"

# Tokenizer first: its return value is then not the cell's last expression, so the
# cell still produces no output (model.save_pretrained returns None).
tokenizer.save_pretrained(final_model_dir)
model.save_pretrained(final_model_dir)