In [19]:
!pip install transformers nltk datasets rouge_score accelerate -U

[0m

In [1]:
import pandas as pd
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import numpy as np
import nltk
import datasets
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, load_metric
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [2]:
import pandas as pd
import random

def create_data_frame(file_path):
    with open(file_path, "r") as file:
        data = file.readlines()

    relations = []
    sentences = []

    for line in data:
        parts = line.strip().split(" ")
        relation = parts[1]
        sentence = " ".join(parts[3:])
        relations.append(relation)
        sentences.append(sentence)

    return pd.DataFrame({"Relation": relations, "Sentence": sentences})

augmentations = {
    "Cause-Effect": ["reason-result", "consequence", "led to", "outcome of", "causation", "impact"],
    "Component-Whole": ["part-whole", "element", "make up", "composed of", "constituent", "component parts"],
    "Content-Container": ["substance-holding", "material", "contains", "stored in", "contents", "containerized"],
    "Entity-Destination": ["object-place", "target", "headed to", "destination of", "entity's final stop", "endpoint"],
    "Entity-Origin": ["source-entity", "origin", "came from", "origin of", "entity's starting point", "birthplace of"],
    "Instrument-Agency": ["tool-agent", "means", "used by", "agency behind", "instrumentality", "mechanism"],
    "Member-Collection": ["individual-group", "part", "belongs to", "collection of", "member", "set of"],
    "Message-Topic": ["communication-subject", "theme", "talks about", "topic of", "message content", "subject matter"],
    "Product-Producer": ["outcome-maker", "creation", "made by", "producer of", "product's origin", "manufacturer"]
}

def augment_data(data):
    augmented_data = []

    for index, row in data.iterrows():
        relation = row['Relation'].split('(')[0]
        sentence = row['Sentence']
        
        if relation not in augmentations:
            continue

        augmentations_for_relation = augmentations.get(relation)

        for augmentation in augmentations_for_relation:
            augmented_data.append({"Sentence": sentence, "Relation": augmentation + "(e2,e1)"})
    
    return pd.DataFrame(augmented_data)

def add_augmentations(data):
    subset_size = int(0.25 * len(data))
    subset_data = train_data.sample(n=subset_size, random_state=42)
    augmented_subset_data = augment_data(subset_data)
    combined_train_data = pd.concat([data, augmented_subset_data])
    shuffled_train_data = combined_train_data.sample(frac=1, random_state=42)
    return shuffled_train_data.reset_index(drop=True)

train_data = create_data_frame("./data/train_attn_sp.txt")
test_data = create_data_frame("./data/test_attn_sp.txt")
validation_data = create_data_frame("data/val_attn_sp.txt") 

In [3]:
train_data

Unnamed: 0,Relation,Sentence
0,"Component-Whole(e2,e1)",system as described above has its greatest app...
1,Other,E1_START child E1_END was carefully wrapped an...
2,"Instrument-Agency(e2,e1)",E1_START author E1_END of a keygen uses a E2_S...
3,Other,misty E1_START ridge E1_END uprises from the E...
4,"Member-Collection(e1,e2)",E1_START student E1_END E2_START association E...
...,...,...
7203,"Product-Producer(e1,e2)",E1_START streaks E1_END are from a passing E2_...
7204,Other,V8 E1_START engine E1_END mated with a manual ...
7205,Other,the E1_START notice E1_END is sent by E2_START...
7206,"Entity-Destination(e1,e2)","test this , we placed a kitchen E1_START match..."


In [4]:
augmented_train_data = add_augmentations(train_data)
augmented_validation_data = add_augmentations(validation_data)

In [5]:
augmented_train_data

Unnamed: 0,Relation,Sentence
0,Other,", E1_START varieties E1_END of this sort were ..."
1,"Product-Producer(e2,e1)",steering E1_START committee E1_END developed t...
2,"entity's starting point(e2,e1)",ugly E1_START duckling E1_END runs away from t...
3,Other,E1_START nun E1_END fulfills his fantasy with ...
4,"constituent(e2,e1)",marked the E1_START handle E1_END of the E2_ST...
...,...,...
16155,"part-whole(e2,e1)",E1_START treatment E1_END includes an aromathe...
16156,"Message-Topic(e1,e2)",E1_START script E1_END was written describing ...
16157,Other,study found that people who took the E1_START ...
16158,"creation(e2,e1)",remaining E1_START settlers E1_END dug a 10 mi...


In [6]:
converted_train_data = Dataset.from_pandas(augmented_train_data)
converted_val_data = Dataset.from_pandas(augmented_validation_data)
converted_test_data = Dataset.from_pandas(test_data)

In [7]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    source, target = batch["Sentence"], batch["Relation"]
    source_tokenized = tokenizer(
        source, truncation=True, max_length=max_source_length
    )
    target_tokenized = tokenizer(
        target, truncation=True, max_length=max_target_length
    )

    batch = {k: v for k, v in source_tokenized.items()}
    batch["labels"] = [
        [-100 if token == tokenizer.pad_token_id else token for token in l]
        for l in target_tokenized["input_ids"]
    ]
    return batch

In [8]:
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

encoder_max_length = 128
decoder_max_length = 32



In [9]:
batch_train_data = converted_train_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_train_data.column_names,
)

Map:   0%|          | 0/16160 [00:00<?, ? examples/s]

In [10]:
batch_validation_data = converted_val_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_train_data.column_names,
)

Map:   0%|          | 0/1788 [00:00<?, ? examples/s]

In [11]:
batch_test_data = converted_test_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_train_data.column_names,
)

Map:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [12]:
nltk.download("punkt", quiet=True)
metric = datasets.load_metric("rouge")

def postprocess_text(preds, labels):
    if preds is None or labels is None:
        print("Warning: Either preds or labels is None.")
        return [], []

    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels

  metric = datasets.load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [14]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./models/augmentations_25",
    num_train_epochs=20,
    # save_strategy = 'steps',
    evaluation_strategy='epoch',
    learning_rate=1e-5,
    warmup_ratio=0.2,
    weight_decay=1e-3,
    predict_with_generate=True,
    logging_dir="logs",
    logging_steps=200,
    report_to = 'none',
    save_total_limit=1,
)


data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=batch_train_data,
    eval_dataset=batch_validation_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4884,0.351408,56.5324,27.9728,46.983,46.9732,13.4457
2,0.2897,0.238618,64.4565,37.4317,55.6377,55.7089,13.1997
3,0.2273,0.19774,66.517,42.9894,59.9,59.8766,13.3949
4,0.2074,0.180968,69.015,44.5138,62.2825,62.2906,13.2925
5,0.1839,0.176751,69.6453,45.1683,63.0191,63.0863,13.2534
6,0.1752,0.172329,69.594,46.8322,63.5596,63.5888,13.1706
7,0.1692,0.164314,69.2898,46.2165,63.9023,63.9004,12.8954
8,0.1579,0.158531,69.1129,49.6252,65.1568,65.1066,12.9799
9,0.1556,0.163082,68.9412,50.0025,65.8216,65.8231,12.8535
10,0.1495,0.161528,68.2329,51.8879,66.6074,66.6166,12.8367


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=40400, training_loss=0.23724334126651878, metrics={'train_runtime': 4806.9603, 'train_samples_per_second': 67.236, 'train_steps_per_second': 8.404, 'total_flos': 1.075558916210688e+16, 'train_loss': 0.23724334126651878, 'epoch': 20.0})

In [16]:
def get_f1(key, prediction, none_id):
    correct_by_relation = ((key == prediction) & (prediction != none_id)).astype(np.int32).sum()
    guessed_by_relation = (prediction != none_id).astype(np.int32).sum()
    gold_by_relation = (key != none_id).astype(np.int32).sum()

    prec_micro = 1.0
    if guessed_by_relation > 0:
        prec_micro = float(correct_by_relation) / float(guessed_by_relation)
    recall_micro = 1.0
    if gold_by_relation > 0:
        recall_micro = float(correct_by_relation) / float(gold_by_relation)
    f1_micro = 0.0
    if prec_micro + recall_micro > 0.0:
        f1_micro = 2.0 * prec_micro * recall_micro / (prec_micro + recall_micro)
    return prec_micro, recall_micro, f1_micro

In [17]:
inv_augmentations = {i: k for k, v in augmentations.items() for i in v}

In [18]:
def get_score():
    
    pred = trainer.predict(
            batch_test_data, metric_key_prefix="predict", max_length=32, num_beams=4
        )

    label_seq2seq = []
    pred_seq2seq = []
    
    print('start generate pred_seq2seq')

    for k, d in tqdm(enumerate(batch_test_data)):
        
        tt = d['labels']
        temp_label = tokenizer.decode(tt[:np.sum(np.array(tt) != -100)], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        temp_pred = tokenizer.decode(pred[0][k], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        
        label_seq2seq.append(temp_label)
        
        if temp_pred.split("(")[0] in inv_augmentations:
            pred_seq2seq.append(inv_augmentations.get(temp_pred))
        elif temp_pred.split("(")[0] in augmentations:
            pred_seq2seq.append(temp_pred)
        else:
            pred_seq2seq.append("Other")
    
    print('*****finish predict*****')
    def func(x):
        if x in label_seq2seq:
            return x
        else:
            return 'Other'
        
    pred_seq2seq = [func(x) for x in pred_seq2seq]
    
    df = pd.DataFrame()

    df['label'] = label_seq2seq
    df['pred'] = pred_seq2seq
    print('*****finish df*****')
    
    lb = LabelEncoder()
    lb.fit(list(df['label']))
    label_lb = lb.transform(list(df['label']))
    pred_lb = lb.transform(list(df['pred']))

    print('*****finish encode*****')

    P, R, F1 = get_f1(label_lb, pred_lb, lb.transform(['Other'])[0])
    
    return P, R, F1

In [19]:
P, R, F1 = get_score()
print('{}:, P:{}, R:{}, F1 :{} \n'.format('score', P, R, F1))

start generate pred_seq2seq


2717it [00:00, 6730.84it/s]

*****finish predict*****
*****finish df*****
*****finish encode*****
score:, P:0.8273703041144902, R:0.8174988952717631, F1 :0.8224049788841965 






In [5]:
trained_model_path = "../models/augmentations_25/checkpoint-40000/"
model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_path)
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)

In [2]:
model.push_to_hub("Mazin100/augmentation_25_relation_extractoin", model_path=trained_model_path)
tokenizer.push_to_hub("Mazin100/augmentation_25_relation_extractoin", model_path=trained_model_path)

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mazin100/augmentation_25_relation_extractoin/commit/943b0261b3230a1099e7bf7e643f36965426449c', commit_message='Upload tokenizer', commit_description='', oid='943b0261b3230a1099e7bf7e643f36965426449c', pr_url=None, pr_revision=None, pr_num=None)