In [None]:
!pip install transformers accelerate seqeval datasets>=1.8.0 torch>=1.3

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# tokenizer
from transformers import AutoTokenizer, DistilBertTokenizerFast
# sequence tagging model + training-related
from transformers import DistilBertForTokenClassification, Trainer, TrainingArguments
import numpy as np
import pandas as pd
import torch
import json
import sys
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the data from Google drive

In [None]:
# mount Google Drive
from google.colab import drive
import os

drive.mount("/content/drive", force_remount=True)

# data location
#parent_dir = "/content/drive/MyDrive"
parent_dir = "/content/drive/MyDrive/"
mk_data_path = lambda filename: os.path.join(parent_dir, filename)
#data_path = f"{parent_dir}/BERT_data_final.xlsx"
data_path = mk_data_path("BERT_data_final.xlsx")

# once finished, flush changes to files to make them visible (assuming we write to Google Drive)
# drive.flush_and_unmount()

Mounted at /content/drive


In [None]:
# mount Google Drive
from google.colab import drive
import os

drive.mount("/content/drive", force_remount=True)

# data location
#parent_dir = "/content/drive/MyDrive"
parent_dir = "/content/drive/MyDrive/"
mk_data_path = lambda filename: os.path.join(parent_dir, filename)
#data_path = f"{parent_dir}/BERT_data_final.xlsx"
data_path = mk_data_path("final_changed_data.xlsx")

# once finished, flush changes to files to make them visible (assuming we write to Google Drive)
# drive.flush_and_unmount()

Mounted at /content/drive


# Tokenizer setup

BERT relies on a wordpiece tokenization strategy that makes use of a set of special tokens used for prediction.

In [None]:
model_name = "distilbert-base-uncased" # "distilbert-base-uncased"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# convenience function for wordpiece tokenization of a list of tokens
tokenize = lambda ds: tokenizer(ds, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
RANDOM_SEED = 42

Downloading (â€¦)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (â€¦)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (â€¦)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (â€¦)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
# special tokens
tokenizer.special_tokens_map

{'unk_token': '[UNK]',
 'sep_token': '[SEP]',
 'pad_token': '[PAD]',
 'cls_token': '[CLS]',
 'mask_token': '[MASK]'}

In [None]:
#data

In [None]:
data = pd.read_excel(data_path)
data.head(4)

Unnamed: 0,Causal-relation text,BIO Code,"{""words"":Causal-relation text,""ner"":BIO Code}",BIO Code_changed,BIO Code-changed-NER
0,"[""When "",""a "",""policyholder "",""or "",""insured ""...","[""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""I-C"",""I-C"",""...","{""words"":[""When "",""a "",""policyholder "",""or "",""...","[""O"",""O"",""C"",""C"",""C"",""C"",""C"",""C"",""C"",""O"",""O"",""...","[""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-E..."
1,"[""During "",""2020 "",""in "",""response "",""to "",""th...","[""O"",""O"",""O"",""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""...","{""words"":[""During "",""2020 "",""in "",""response "",...","[""O"",""O"",""O"",""O"",""O"",""C"",""C"",""C"",""C"",""C"",""C"",""...","[""O"",""O"",""O"",""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-..."
2,"[""Prolonged "",""periods "",""of "",""low "",""interes...","[""B-C"",""I-C"",""I-C"",""I-C"",""I-C"",""I-C"",""O"",""B-CT...","{""words"":[""Prolonged "",""periods "",""of "",""low ""...","[""C"",""C"",""C"",""C"",""C"",""C"",""O"",""CT"",""E"",""E"",""O"",...","[""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""..."
3,"[""Conversely "",""a "",""rise "",""in "",""interest "",...","[""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""O"",""B-CT"",""O...","{""words"":[""Conversely "",""a "",""rise "",""in "",""in...","[""O"",""O"",""C"",""C"",""C"",""C"",""O"",""CT"",""O"",""O"",""O"",...","[""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-E..."


In [None]:
#7-label

In [None]:
# load data and rename relevant columns to X (input) and y (output)
dataset = data[["Causal-relation text", "BIO Code"]].rename(columns={"Causal-relation text": "X", "BIO Code": "y"})
# load each row's data as a list of strings
X_raw, y_raw = [json.loads(tokens) for tokens in dataset.X.values], [json.loads(labels) for labels in dataset.y.values]

In [None]:
#4-label

In [None]:
# load data and rename relevant columns to X (input) and y (output)
dataset = data[["Causal-relation text", "BIO Code_changed"]].rename(columns={"Causal-relation text": "X", "BIO Code_changed": "y"})
# load each row's data as a list of strings
X_raw, y_raw = [json.loads(tokens) for tokens in dataset.X.values], [json.loads(labels) for labels in dataset.y.values]

In [None]:
unique_tags = set(label for labels in y_raw for label in labels)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [None]:
print(id2tag)

{0: 'O', 1: 'C', 2: 'CT', 3: 'E'}


Split the data into train, dev using train_test_split. Further divide the train data into train and test data using k-fold cross validation.

In [None]:
# Our data is split into sentences


# FIXME: we also need test data!
X_train_raw, X_dev_raw, y_train_raw, y_dev_raw = train_test_split(X_raw, y_raw, test_size=.2, random_state=RANDOM_SEED)

In [None]:
X_train_final = np.array(X_train_raw)
y_train_final = np.array(y_train_raw)

  X_train_final = np.array(X_train_raw)
  y_train_final = np.array(y_train_raw)


In [None]:
print(np.shape(X_dev_raw))
print(np.shape(y_dev_raw))
print(np.shape(X_train_raw))


(447,)
(447,)
(1787,)


  result = asarray(a).shape


In [None]:
import os
k=2
seed = 1
kf = KFold(n_splits=k,random_state=seed, shuffle=True)
metrics=[]
i=0
for train_index, val_index in kf.split(X_train_final,y_train_final):
  train_texts, test_texts =X_train_final[train_index].tolist(),X_train_final[val_index].tolist()
  train_labels, test_labels = y_train_final[train_index].tolist(),y_train_final[val_index].tolist()

In [None]:
print(type(test_labels))

<class 'list'>


In [None]:
X_train_raw

[['Further',
  'disintermediation',
  'has',
  'continued',
  'because',
  'of',
  'historically',
  'low',
  'interest',
  'rate',
  'environment',
  'and',
  'bank',
  'deleveraging'],
 ['Employee',
  'compensation',
  'and',
  'benefits',
  'increased',
  'as',
  'a',
  'result',
  'of',
  'higher',
  'average',
  'salaries',
  'a',
  'larger',
  'headcount',
  'base',
  'and',
  'a',
  'voluntary',
  'early',
  'retirement',
  'program',
  'which',
  'was',
  'initiated',
  'in',
  'the',
  'fourth',
  'quarter',
  'of',
  '2020'],
 ['Our',
  'business',
  'could',
  'be',
  'harmed',
  'because',
  'of',
  'our',
  'potential',
  'exposure',
  'to',
  'asbestos',
  'and',
  'environmental',
  'claims',
  'and',
  'related',
  'litigation'],
 ['In',
  'December',
  '2016',
  'we',
  'were',
  'issued',
  'a',
  '6',
  'million',
  'fine',
  'by',
  'the',
  'SFSA',
  'as',
  'a',
  'result',
  'of',
  'findings',
  'in',
  'connection',
  'with',
  'its',
  'investigation',
  'The'

In [None]:
# ensure our data is well-formed:
for ds in [X_train_raw, X_train_raw]:
  for row in ds:
    #print(row)
    assert all(len(tok) > 0 for tok in row)

In [None]:
# ensure our data is well-formed:
for ds in [test_texts, test_texts]:
  for row in ds:
    assert all(len(tok) > 0 for tok in row)

In [None]:
# ensure our data is well-formed:
for ds in [train_texts, train_texts]:
  for row in ds:
    assert all(len(tok) > 0 for tok in row)

In [None]:
def encode_tags(tags, encodings, tag2id):
    """
    See https://huggingface.co/transformers/custom_datasets.html#tok-ner

    >If the tokenizer splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
    >One way to handle this is to only train on the tag labels for the first subtoken of a split token.
    >We can do this in ðŸ¤— Transformers by setting the labels we wish to ignore to -100.
    >In the example above, if the label for @HuggingFace is 3 (indexing `B-corporation`),
    >we would set the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
type(train_texts)

list

In [None]:
print(type(train_labels))

<class 'list'>


In [None]:
train_labels

[['O',
  'B-E',
  'I-E',
  'I-E',
  'B-CT',
  'I-CT',
  'B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C'],
 ['O',
  'O',
  'O',
  'O',
  'B-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'B-CT',
  'I-CT',
  'I-CT',
  'O',
  'B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C'],
 ['B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-CT',
  'B-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E'],
 ['B-CT',
  'B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'O',
  'B-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E',
  'I-E'],
 ['B-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'I-C',
  'B-CT',
  'B-

In [None]:
# encoded X
X_train = tokenize(train_texts)
X_dev = tokenize(X_dev_raw)
X_test= tokenize(test_texts)

y_train = encode_tags(tags=train_labels, encodings=X_train, tag2id=tag2id)
y_dev = encode_tags(tags=y_dev_raw, encodings=X_dev, tag2id=tag2id)
y_test = encode_tags(tags=test_labels, encodings=X_test, tag2id=tag2id)


for X in [X_train, X_dev, X_test]:
  X.pop("offset_mapping")

In [None]:
type(X_test)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """
    Pytorch dataset for easy batching
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_dev, y_dev)
test_dataset = CustomDataset(X_test,y_test)

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=y_train)

  metric = load_metric("accuracy")


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [None]:
training_args = TrainingArguments(
    #output_dir=mk_data_path("results"),   # output directory
    output_dir = '/content/DistilBertforTokenClassification',
    num_train_epochs=3,                   # total number of training epochs
    per_device_train_batch_size=16,       # batch size per device during training
    per_device_eval_batch_size=64,        # batch size for evaluation
    warmup_steps=500,                     # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                    # strength of weight decay
    #logging_dir=mk_data_path("logs"),     # directory for storing logs
    logging_dir= '/content/logs',
    logging_steps=10,
    evaluation_strategy = "epoch",
    #report_to = "wandb"
)

model = DistilBertForTokenClassification.from_pretrained(model_name, num_labels=len(unique_tags))

trainer = Trainer(
    model=model,                          # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    #compute_metrics = compute_metrics
)

trainer.train()
# pred = trainer.predict(test_dataset)
# preds = pred.predictions.argmax(-1)
# indices = val_index
#final_pred = pd.Series(preds).tolist()

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

Epoch,Training Loss,Validation Loss
1,1.2158,1.162124
2,0.9913,0.945847
3,0.6876,0.499029


TrainOutput(global_step=168, training_loss=1.0441427032152812, metrics={'train_runtime': 46.9355, 'train_samples_per_second': 57.142, 'train_steps_per_second': 3.579, 'total_flos': 132093529694064.0, 'train_loss': 1.0441427032152812, 'epoch': 3.0})

In [None]:
trainer.save_model('/content/DistilBertforTokenClassification')

In [None]:
!zip -r '/content/DistilBertforTokenClassification.zip' '/content/DistilBertforTokenClassification'

  adding: content/DistilBertforTokenClassification/ (stored 0%)
  adding: content/DistilBertforTokenClassification/config.json (deflated 49%)
  adding: content/DistilBertforTokenClassification/pytorch_model.bin (deflated 8%)
  adding: content/DistilBertforTokenClassification/training_args.bin (deflated 49%)


In [None]:
# raw_pred, _, _ = trainer.predict(test_dataset)
# # Preprocess raw predictions
# y_pred = np.argmax(raw_pred, axis=1)

***** Running Prediction *****
  Num examples = 893
  Batch size = 64


In [None]:
#Predictions from the model

In [None]:
preds

array([[4, 4, 4, ..., 3, 3, 3],
       [4, 0, 4, ..., 3, 3, 3],
       [0, 0, 4, ..., 4, 4, 4],
       ...,
       [4, 3, 3, ..., 4, 4, 4],
       [4, 0, 3, ..., 4, 4, 4],
       [4, 0, 4, ..., 4, 4, 4]])

In [None]:
preds.shape

(893, 133)

In [None]:
np.shape(y_test)

(893, 133)

In [None]:
#classification report for the each sentence in the test data.

In [None]:
from sklearn.metrics import classification_report
# Performance BERT Metrics
for i,j in zip(preds,y_test):
  print(classification_report(i, j, zero_division = 1))

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
           0       1.00      0.30      0.47        46
           1       1.00      0.33      0.49        64
           2       1.00      0.06      0.11        18
           3       1.00      0.33      0.50         3
           4       1.00      1.00      1.00         2
           5       0.00      1.00      0.00         0
           6       0.00      1.00      0.00         0

    accuracy                           0.29       133
   macro avg       0.62      0.63      0.32       133
weighted avg       1.00      0.29      0.44       133

              precision    recall  f1-score   support

        -100       0.00      1.00      0.00         0
           0       1.00      0.27      0.42        30
           1       1.00      0.06      0.12        80
           2       0.25      0.15      0.19        13
           3       1.00      1.00      1.00         1
           4       1.00      1.00      1.00         1
           5  

In [None]:
#flatten the list of the predictions 

In [None]:
flat_list = [item for sublist in preds for item in sublist]

In [None]:
np.shape(flat_list)

(118769,)

In [None]:
#flatten test data

In [None]:
flat_list1 = [item1 for sublist1 in y_test for item1 in sublist1]

In [None]:
np.shape(flat_list1)

(118769,)

In [None]:
flat_list_testtexts = [(ind,item1) for sublist1 in test_texts for ind,item1 in enumerate(sublist1)]

In [None]:
flat_list_testtexts_frame = pd.DataFrame(flat_list_testtexts)

In [None]:
flat_list_testtexts_frame

Unnamed: 0,0,1
0,0,Employee
1,1,compensation
2,2,and
3,3,benefits
4,4,increased
...,...,...
29669,26,to
29670,27,the
29671,28,increased
29672,29,capital


In [None]:
flat_list_testtexts_frame.columns = ['index','texts']

In [None]:
flat_list_testtexts_frame

Unnamed: 0,index,texts
0,0,Employee
1,1,compensation
2,2,and
3,3,benefits
4,4,increased
...,...,...
29669,26,to
29670,27,the
29671,28,increased
29672,29,capital


In [None]:
df1 = flat_list_testtexts_frame[flat_list_testtexts_frame.texts != ' ']

In [None]:
df1

Unnamed: 0,index,texts
0,0,Employee
1,1,compensation
2,2,and
3,3,benefits
4,4,increased
...,...,...
29669,26,to
29670,27,the
29671,28,increased
29672,29,capital


In [None]:
df1.to_excel('/content/text.xlsx')

In [None]:
#Overall accuracy for the entire test data. Flatten the list and compare predicted with the gold label.

In [None]:
print(id2tag)

{0: 'O', 1: 'B-E', 2: 'B-CT', 3: 'I-C', 4: 'I-E', 5: 'I-CT', 6: 'B-C'}


In [None]:
#targetname = {'B-E','B-C','I-C','O','I-CT','I-E','B-CT'}
targetname = {'O', 'B-E', 'B-CT','I-C', 'I-E', 'I-CT', 'B-C'}

In [None]:
#no determiners
print(id2tag)

{0: 'B-C', 1: 'I-E', 2: 'B-CT', 3: 'I-CT', 4: 'B-E', 5: 'I-C', 6: 'O'}


In [None]:
targetname = ['O', 'B-E', 'B-CT','I-C', 'I-E', 'I-CT', 'B-C']

In [None]:
type(test_texts)

list

In [None]:
lis_preds_frame = pd.DataFrame(flat_list)
lis_test_labels_frame = pd.DataFrame(flat_list1)

In [None]:
frame_combine_test = pd.concat([lis_preds_frame,lis_test_labels_frame],axis=1)

In [None]:
frame_combine_test.columns = ['preds','gold_data']

In [None]:
frame_combine_test

Unnamed: 0,preds,gold_data
0,4,-100
1,4,1
2,4,-100
3,4,-100
4,4,-100
...,...,...
118764,4,-100
118765,4,-100
118766,4,-100
118767,4,-100


In [None]:
#remove the word-piece tokenized(PAD) perdictions

In [None]:
frame_combine_test_new = frame_combine_test[frame_combine_test.gold_data != -100]

In [None]:
frame_combine_test_new

Unnamed: 0,preds,gold_data
1,4,1
5,4,4
6,4,4
7,4,4
8,4,4
...,...,...
118667,5,5
118668,5,0
118669,6,6
118670,3,3


In [None]:
targetname = {'B-E', 'O',  'I-C',  'B-C', 'I-CT', 'I-E', 'B-CT'}

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true=frame_combine_test_new['gold_data'], y_pred=frame_combine_test_new['preds'], zero_division = 1,target_names = list(targetname))) #id2tag.keys()

              precision    recall  f1-score   support

         B-E       0.67      0.06      0.10       986
        B-CT       0.89      0.85      0.87       914
         I-E       0.72      0.94      0.82     10330
         I-C       0.76      0.83      0.79     10340
        I-CT       0.76      0.95      0.84       866
           O       0.59      0.27      0.37      5216
         B-C       0.68      0.27      0.39      1022

    accuracy                           0.73     29674
   macro avg       0.73      0.60      0.60     29674
weighted avg       0.72      0.73      0.70     29674



In [None]:
df1.to_excel('/content/text.xlsx')
frame_combine_test_new.to_excel('/content/preds_gold.xlsx')

In [None]:
####################################################################################

In [None]:
#####################################################################################################################################

In [None]:
data

Unnamed: 0,Causal-relation text,BIO Code,"{""words"":Causal-relation text,""ner"":BIO Code}",BIO Code_changed,BIO Code-changed-NER
0,"[""when "",""a "",""policyholder "",""or "",""insured ""...","[""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""I-C"",""I-C"",""...","{""words"":[""When "",""a "",""policyholder "",""or "",""...","[""O"",""O"",""C"",""C"",""C"",""C"",""C"",""C"",""C"",""O"",""O"",""...","[""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-E..."
1,"[""during "",""2020 "",""in "",""response "",""to "",""th...","[""O"",""O"",""O"",""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""...","{""words"":[""During "",""2020 "",""in "",""response "",...","[""O"",""O"",""O"",""O"",""O"",""C"",""C"",""C"",""C"",""C"",""C"",""...","[""O"",""O"",""O"",""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-..."
2,"[""prolonged "",""periods "",""of "",""low "",""interes...","[""B-C"",""I-C"",""I-C"",""I-C"",""I-C"",""I-C"",""O"",""B-CT...","{""words"":[""Prolonged "",""periods "",""of "",""low ""...","[""C"",""C"",""C"",""C"",""C"",""C"",""O"",""CT"",""E"",""E"",""O"",...","[""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""..."
3,"[""conversely "",""a "",""rise "",""in "",""interest "",...","[""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""O"",""B-CT"",""O...","{""words"":[""Conversely "",""a "",""rise "",""in "",""in...","[""O"",""O"",""C"",""C"",""C"",""C"",""O"",""CT"",""O"",""O"",""O"",...","[""O"",""O"",""B-ENTITY"",""B-ENTITY"",""B-ENTITY"",""B-E..."
4,"[""further "",""because "",""of "",""the "",""concentra...","[""O"",""B-CT"",""O"",""O"",""B-C"",""I-C"",""I-C"",""I-C"",""I...","{""words"":[""Further "",""because "",""of "",""the "",""...","[""O"",""CT"",""O"",""O"",""C"",""C"",""C"",""C"",""C"",""C"",""C"",...","[""O"",""B-ENTITY"",""O"",""O"",""B-ENTITY"",""B-ENTITY"",..."
...,...,...,...,...,...
2229,"[""the "",""increase "",""in "",""adjusted "",""ppnr "",...","[""O"",""B-E"",""I-E"",""I-E"",""I-E"",""I-E"",""I-E"",""I-E""...","{""words"":[""The "",""increase "",""in "",""adjusted ""...","[""O"",""E"",""E"",""E"",""E"",""E"",""E"",""E"",""E"",""E"",""E"",""...","[""O"",""B-ENTITY"",""I-ENTITY"",""I-ENTITY"",""I-ENTIT..."
2230,"[""modest "",""noninterest "",""income "",""growth "",...","[""B-E"",""I-E"",""I-E"",""I-E"",""B-CT"",""I-CT"",""B-C"",""...","{""words"":[""Modest "",""noninterest "",""income "",""...","[""E"",""E"",""E"",""E"",""CT"",""CT"",""C"",""C"",""C"",""C"",""C""...","[""B-ENTITY"",""I-ENTITY"",""I-ENTITY"",""I-ENTITY"",""..."
2231,"[""wealth "",""management "",""and "",""trust "",""fees...","[""B-E"",""I-E"",""I-E"",""I-E"",""I-E"",""I-E"",""B-CT"",""I...","{""words"":[""Wealth "",""management "",""and "",""trus...","[""E"",""E"",""E"",""E"",""E"",""E"",""CT"",""CT"",""CT"",""O"",""C...","[""B-ENTITY"",""I-ENTITY"",""I-ENTITY"",""I-ENTITY"",""..."
2232,"[""the "",""2019 "",""provision "",""increased "","" "",...","[""O"",""O"",""B-E"",""I-E"",""B-CT"",""I-CT"",""B-C"",""I-C""...","{""words"":[""The "",""2019 "",""provision "",""increas...","[""O"",""O"",""E"",""E"",""CT"",""CT"",""C"",""C"",""C"",""C"",""C""...","[""O"",""O"",""B-ENTITY"",""I-ENTITY"",""B-ENTITY"",""B-E..."


In [None]:
# load data and rename relevant columns to X (input) and y (output)
dataset = data[["Causal-relation text", "BIO Code_changed"]].rename(columns={"Causal-relation text": "X", "BIO Code_changed": "y"})
# load each row's data as a list of strings
X_raw, y_raw = [json.loads(tokens) for tokens in dataset.X.values], [json.loads(labels) for labels in dataset.y.values]

In [None]:
unique_tags = set(label for labels in y_raw for label in labels)
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [None]:
tag2id

{'C': 2, 'CT': 3, 'E': 1, 'O': 0}

In [None]:
model_name = "distilbert-base-cased" # "distilbert-base-uncased"
#tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
# convenience function for wordpiece tokenization of a list of tokens
tokenize = lambda ds: tokenizer(ds, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
RANDOM_SEED = 42

loading file https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791
loading file https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6
loading file https://huggingface.co/distilbert-base-cased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/distilbert-base-cased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfad

In [None]:
# Our data is split into sentences


# FIXME: we also need test data!
X_train_raw, X_dev_raw, y_train_raw, y_dev_raw = train_test_split(X_raw, y_raw, test_size=.2, random_state=RANDOM_SEED)

In [None]:
X_train_final = np.array(X_train_raw)
y_train_final = np.array(y_train_raw)

  """Entry point for launching an IPython kernel.
  


In [None]:
import os
k=2
seed = 1
kf = KFold(n_splits=k,random_state=seed, shuffle=True)
metrics=[]
i=0
for train_index, val_index in kf.split(X_train_final,y_train_final):
  train_texts, test_texts =X_train_final[train_index].tolist(),X_train_final[val_index].tolist()
  train_labels, test_labels = y_train_final[train_index].tolist(),y_train_final[val_index].tolist()

In [None]:
# ensure our data is well-formed:
for ds in [X_train_raw, X_train_raw]:
  for row in ds:
    assert all(len(tok) > 0 for tok in row)

In [None]:
# ensure our data is well-formed:
for ds in [test_texts, test_texts]:
  for row in ds:
    assert all(len(tok) > 0 for tok in row)

In [None]:
# ensure our data is well-formed:
for ds in [train_texts, train_texts]:
  for row in ds:
    assert all(len(tok) > 0 for tok in row)

In [None]:
def encode_tags(tags, encodings, tag2id):
    """
    See https://huggingface.co/transformers/custom_datasets.html#tok-ner

    >If the tokenizer splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
    >One way to handle this is to only train on the tag labels for the first subtoken of a split token.
    >We can do this in ðŸ¤— Transformers by setting the labels we wish to ignore to -100.
    >In the example above, if the label for @HuggingFace is 3 (indexing `B-corporation`),
    >we would set the labels of ['@', 'hugging', '##face'] to [3, -100, -100].
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

In [None]:
# encoded X
X_train = tokenize(train_texts)
X_dev = tokenize(X_dev_raw)
X_test= tokenize(test_texts)

y_train = encode_tags(tags=train_labels, encodings=X_train, tag2id=tag2id)
y_dev = encode_tags(tags=y_dev_raw, encodings=X_dev, tag2id=tag2id)
y_test = encode_tags(tags=test_labels, encodings=X_test, tag2id=tag2id)


for X in [X_train, X_dev, X_test]:
  X.pop("offset_mapping")

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """
    Pytorch dataset for easy batching
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = CustomDataset(X_train, y_train)
val_dataset = CustomDataset(X_dev, y_dev)
test_dataset = CustomDataset(X_test,y_test)

In [None]:
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
training_args = TrainingArguments(
    output_dir=mk_data_path("results"),   # output directory
    num_train_epochs=3,                   # total number of training epochs
    per_device_train_batch_size=16,       # batch size per device during training
    per_device_eval_batch_size=64,        # batch size for evaluation
    warmup_steps=500,                     # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                    # strength of weight decay
    logging_dir=mk_data_path("logs"),     # directory for storing logs
    logging_steps=10,
)

model = DistilBertForTokenClassification.from_pretrained(model_name, num_labels=len(unique_tags))

trainer = Trainer(
    model=model,                          # the instantiated ðŸ¤— Transformers model to be trained
    args=training_args,                   # training arguments, defined above
    train_dataset=train_dataset,          # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
)

trainer.train()
pred = trainer.predict(test_dataset)
preds = pred.predictions.argmax(-1)
indices = val_index
#final_pred = pd.Series(preds).tolist()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-cased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  

Step,Training Loss
10,1.4148
20,1.3769
30,1.319
40,1.2597
50,1.2075
60,1.142
70,1.1113
80,1.0572
90,1.0672
100,0.9131




Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Prediction *****
  Num examples = 893
  Batch size = 64


In [None]:
preds.shape

(893, 134)

In [None]:
preds

array([[1, 1, 1, ..., 2, 2, 2],
       [1, 1, 1, ..., 2, 2, 2],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 2, 2, ..., 1, 1, 1],
       [1, 2, 2, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [None]:
flat_list = [item for sublist in preds for item in sublist]

In [None]:
np.shape(flat_list)

(119662,)

In [None]:
flat_list1 = [item1 for sublist1 in y_test for item1 in sublist1]

In [None]:
np.shape(flat_list1)

(119662,)

In [None]:
lis_preds_frame = pd.DataFrame(flat_list)
lis_test_labels_frame = pd.DataFrame(flat_list1)


frame_combine_test = pd.concat([lis_preds_frame,lis_test_labels_frame],axis=1)

frame_combine_test.columns = ['preds','gold_data']

In [None]:
frame_combine_test

Unnamed: 0,preds,gold_data
0,1,-100
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
119657,1,-100
119658,1,-100
119659,1,-100
119660,1,-100


In [None]:
frame_combine_test_new = frame_combine_test[frame_combine_test.gold_data != -100]

In [None]:
frame_combine_test_new

Unnamed: 0,preds,gold_data
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
...,...,...
119559,3,3
119560,2,0
119561,2,2
119562,2,2


In [None]:
id2tag

{0: 'O', 1: 'E', 2: 'C', 3: 'CT'}

In [None]:
#targetname = {'C', 'CT', 'E', 'O'}
targetname = {'O', 'E', 'C', 'CT'}

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_true=frame_combine_test_new['gold_data'], y_pred=frame_combine_test_new['preds'], zero_division = 1,target_names = list(targetname))) #id2tag.keys()

              precision    recall  f1-score   support

           O       0.69      0.21      0.32      5216
           C       0.78      0.93      0.85     11316
          CT       0.77      0.86      0.81     11362
           E       0.88      0.94      0.91      1780

    accuracy                           0.78     29674
   macro avg       0.78      0.74      0.73     29674
weighted avg       0.77      0.78      0.75     29674



In [None]:
frame_combine_test_new.to_excel('/content/reduced_labels.xlsx')