In [1]:
!pip install transformers nltk datasets rouge_score accelerate -U

Collecting transformers
  Downloading transformers-4.41.1-py3-none-any.whl (9.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m85.3 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m71.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m58.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.20,>=0.19
  Downloading tokenizers-0.19.1

In [2]:
import pandas as pd
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import numpy as np
import nltk
import datasets
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, load_metric
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [3]:
import pandas as pd
import random

def create_data_frame(file_path):
    with open(file_path, "r") as file:
        data = file.readlines()

    relations = []
    sentences = []

    for line in data:
        parts = line.strip().split(" ")
        relation = parts[1]
        sentence = " ".join(parts[3:])
        relations.append(relation)
        sentences.append(sentence)

    return pd.DataFrame({"Relation": relations, "Sentence": sentences})

augmentations = {
    "Cause-Effect": ["reason-result", "consequence", "led to", "outcome of", "causation", "impact"],
    "Component-Whole": ["part-whole", "element", "make up", "composed of", "constituent", "component parts"],
    "Content-Container": ["substance-holding", "material", "contains", "stored in", "contents", "containerized"],
    "Entity-Destination": ["object-place", "target", "headed to", "destination of", "entity's final stop", "endpoint"],
    "Entity-Origin": ["source-entity", "origin", "came from", "origin of", "entity's starting point", "birthplace of"],
    "Instrument-Agency": ["tool-agent", "means", "used by", "agency behind", "instrumentality", "mechanism"],
    "Member-Collection": ["individual-group", "part", "belongs to", "collection of", "member", "set of"],
    "Message-Topic": ["communication-subject", "theme", "talks about", "topic of", "message content", "subject matter"],
    "Product-Producer": ["outcome-maker", "creation", "made by", "producer of", "product's origin", "manufacturer"]
}

def augment_data(data):
    augmented_data = []

    for index, row in data.iterrows():
        relation = row['Relation'].split('(')[0]
        sentence = row['Sentence']
        
        if relation not in augmentations:
            continue

        augmentations_for_relation = augmentations.get(relation)

        for augmentation in augmentations_for_relation:
            augmented_data.append({"Sentence": sentence, "Relation": augmentation + "(e2,e1)"})
    
    return pd.DataFrame(augmented_data)

def add_augmentations(data):
    subset_size = int(0.5 * len(data))
    subset_data = train_data.sample(n=subset_size, random_state=42)
    augmented_subset_data = augment_data(subset_data)
    combined_train_data = pd.concat([data, augmented_subset_data])
    shuffled_train_data = combined_train_data.sample(frac=1, random_state=42)
    return shuffled_train_data

train_data = create_data_frame("../data/train_attn_sp.txt")
test_data = create_data_frame("../data/test_attn_sp.txt")
validation_data = create_data_frame("../data/val_attn_sp.txt")

train_data = add_augmentations(train_data)
validation_data = add_augmentations(validation_data)



In [4]:
converted_train_data = Dataset.from_pandas(train_data)
converted_test_data = Dataset.from_pandas(test_data)
converted_val_data = Dataset.from_pandas(validation_data)

In [5]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["Sentence"], batch["Relation"]
    source_tokenized = tokenizer(
        source, padding="max_length", truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, padding="max_length", truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [6]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base').to('cuda')

encoder_max_length = 128
decoder_max_length = 32



config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [7]:
batch_train_data = converted_train_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_train_data.column_names,
)

Map:   0%|          | 0/25040 [00:00<?, ? examples/s]

In [8]:
batch_validation_data = converted_val_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_train_data.column_names,
)

Map:   0%|          | 0/1356 [00:00<?, ? examples/s]

In [10]:
batch_test_data = converted_test_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_test_data.column_names,
)

Map:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [11]:
nltk.download("punkt", quiet=True)
metric = datasets.load_metric("rouge")

def postprocess_text(preds, labels):
    if preds is None or labels is None:
        print("Warning: Either preds or labels is None.")
        return [], []

    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

  metric = datasets.load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [18]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir="../models/augmentations_50",
    num_train_epochs=20,
    # save_strategy = 'epoch',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    warmup_ratio=0.2,
    weight_decay=1e-3,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=200,
    report_to = 'none',
    save_total_limit=1,
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=batch_train_data,
    eval_dataset=batch_validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4228,0.344225,56.5007,27.5263,47.1198,47.1946,12.778
2,0.2632,0.230546,60.7372,36.267,53.7372,53.7736,12.8761
3,0.2266,0.204333,62.6789,38.8424,55.9392,55.8665,12.7869
4,0.203,0.201893,63.639,37.8978,57.5275,57.5455,12.7625
5,0.195,0.185693,69.9681,44.3818,63.5318,63.5808,12.8001
6,0.1849,0.185865,68.95,44.1359,62.912,62.993,12.694
7,0.1807,0.193562,67.9122,41.7168,62.169,62.1661,12.6143
8,0.1772,0.187993,69.2077,45.0634,63.8168,63.8203,12.5811
9,0.1737,0.192781,67.5602,39.9457,61.322,61.4084,12.2817
10,0.1703,0.195633,66.6783,41.6742,61.3406,61.3559,12.5627


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=62600, training_loss=0.23833673565532454, metrics={'train_runtime': 8636.5306, 'train_samples_per_second': 57.986, 'train_steps_per_second': 7.248, 'total_flos': 3.8169502285824e+16, 'train_loss': 0.23833673565532454, 'epoch': 20.0})

In [21]:
def get_f1(key, prediction, none_id):
    correct_by_relation = ((key == prediction) & (prediction != none_id)).astype(np.int32).sum()
    guessed_by_relation = (prediction != none_id).astype(np.int32).sum()
    gold_by_relation = (key != none_id).astype(np.int32).sum()

    prec_micro = 1.0
    if guessed_by_relation > 0:
        prec_micro = float(correct_by_relation) / float(guessed_by_relation)
    recall_micro = 1.0
    if gold_by_relation > 0:
        recall_micro = float(correct_by_relation) / float(gold_by_relation)
    f1_micro = 0.0
    if prec_micro + recall_micro > 0.0:
        f1_micro = 2.0 * prec_micro * recall_micro / (prec_micro + recall_micro)
    return prec_micro, recall_micro, f1_micro

In [22]:
inv_augmentations = {i: k for k, v in augmentations.items() for i in v}

In [31]:
def get_score():
    
    pred = trainer.predict(
            batch_test_data, metric_key_prefix="predict", max_length=32, num_beams=4
        )

    label_seq2seq = []
    pred_seq2seq = []
    
    print('start generate pred_seq2seq')

    for k, d in tqdm(enumerate(batch_test_data)):
        
        tt = d['labels']
        temp_label = tokenizer.decode(tt[:np.sum(np.array(tt) != -100)], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        temp_pred = tokenizer.decode(pred[0][k], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        label_seq2seq.append(temp_label)
        
        if temp_pred.split("(")[0] in inv_augmentations:
            pred_seq2seq.append(inv_augmentations.get(temp_pred))
        elif temp_pred.split("(")[0] in augmentations:
            pred_seq2seq.append(temp_pred)
        else:
            pred_seq2seq.append("Other")
        
    print('*****finish predict*****')
    def func(x):
        if x in label_seq2seq:
            return x
        else:
            return 'Other'
        
    pred_seq2seq = [func(x) for x in pred_seq2seq]
    
    df = pd.DataFrame()

    df['label'] = label_seq2seq
    df['pred'] = pred_seq2seq
    print('*****finish df*****')
    
    lb = LabelEncoder()
    lb.fit(list(df['label']))
    label_lb = lb.transform(list(df['label']))
    pred_lb = lb.transform(list(df['pred']))

    print('*****finish encode*****')

    P, R, F1 = get_f1(label_lb, pred_lb, lb.transform(['Other'])[0])
    
    return P, R, F1

In [32]:
P, R, F1 = get_score()
print('{}:, P:{}, R:{}, F1 :{} \n'.format('score', P, R, F1))

start generate pred_seq2seq


2717it [00:00, 4456.17it/s]

*****finish predict*****
*****finish df*****
*****finish encode*****
score:, P:0.806697108066971, R:0.7026071586389748, F1 :0.7510628247520076 






In [33]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline
trained_model_path = "../models/augmentations_50/checkpoint-62500/"
model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

In [None]:
model.push_to_hub("Mazin100/augmentation_50_relation_extractoin", model_path=trained_model_path)
tokenizer.push_to_hub("Mazin100/augmentation_50_relation_extractoin", model_path=trained_model_path)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]