In [12]:
!pip install transformers nltk datasets rouge_score accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[0m

In [3]:
import pandas as pd
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import numpy as np
import nltk
import datasets
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset, load_metric
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset

In [21]:
def create_data_frame(file_path, encoding="utf-8"):
    """Parse a space-separated relation file into a two-column DataFrame.

    Every non-blank line is split on single spaces. Field 1 becomes the
    relation label and fields 3 onward are re-joined with spaces to form the
    sentence; fields 0 and 2 are ignored.

    Args:
        file_path: Path of the text file to read.
        encoding: Text encoding used to open the file (defaults to UTF-8 so
            the result does not depend on the platform locale).

    Returns:
        pandas.DataFrame with columns "Relation" and "Sentence", one row per
        non-blank input line, in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    relations = []
    sentences = []

    # Stream the file line by line instead of materialising it first.
    with open(file_path, "r", encoding=encoding) as file:
        for line in file:
            stripped = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record; skip them rather than failing on parts[1].
            if not stripped:
                continue
            parts = stripped.split(" ")
            relations.append(parts[1])
            sentences.append(" ".join(parts[3:]))

    return pd.DataFrame({"Relation": relations, "Sentence": sentences})

# Parse each split file into its own DataFrame (train, test, validation order).
train_data, test_data, validation_data = (
    create_data_frame(split_file)
    for split_file in (
        "./data/train_attn_sp.txt",
        "./data/test_attn_sp.txt",
        "./data/val_attn_sp.txt",
    )
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,Relation,Sentence
0,"Component-Whole(e2,e1)",system as described above has its greatest app...
1,Other,E1_START child E1_END was carefully wrapped an...
2,"Instrument-Agency(e2,e1)",E1_START author E1_END of a keygen uses a E2_S...
3,Other,misty E1_START ridge E1_END uprises from the E...
4,"Member-Collection(e1,e2)",E1_START student E1_END E2_START association E...


In [5]:
# Wrap every pandas split in a `datasets.Dataset` (train, test, validation order).
converted_train_data, converted_test_data, converted_val_data = map(
    Dataset.from_pandas,
    (train_data, test_data, validation_data),
)

In [6]:
def batch_tokenize_preprocess(batch, tokenizer, max_source_length, max_target_length):
    """Tokenize a batch of sentences and relation labels for seq2seq training.

    Args:
        batch: Mapping with a "Sentence" entry (model inputs) and a "Relation"
            entry (target texts).
        tokenizer: Callable tokenizer exposing `pad_token_id`; it is invoked
            with `padding="max_length"` and `truncation=True`.
        max_source_length: Padded/truncated length for the sentences.
        max_target_length: Padded/truncated length for the relation labels.

    Returns:
        dict holding every field the tokenizer produced for the sentences,
        plus "labels": the target token ids with each pad token id replaced
        by -100.
    """
    sentences = batch["Sentence"]
    relations = batch["Relation"]

    def _encode(texts, limit):
        # Same fixed-length padding/truncation policy for both sides.
        return tokenizer(texts, padding="max_length", truncation=True, max_length=limit)

    encoded_sources = _encode(sentences, max_source_length)
    encoded_targets = _encode(relations, max_target_length)

    model_inputs = dict(encoded_sources.items())

    # Swap padding positions in the targets for -100.
    masked_labels = []
    for target_ids in encoded_targets["input_ids"]:
        row = []
        for token_id in target_ids:
            row.append(-100 if token_id == tokenizer.pad_token_id else token_id)
        masked_labels.append(row)
    model_inputs["labels"] = masked_labels

    return model_inputs

In [7]:
# The tokenizer and the seq2seq model are loaded from the same pretrained checkpoint.
pretrained_checkpoint = 'facebook/bart-base'
tokenizer = AutoTokenizer.from_pretrained(pretrained_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_checkpoint)



In [8]:
# Token-length caps: sentences (encoder side) and relation labels (decoder side).
encoder_max_length, decoder_max_length = 128, 32

In [9]:
def _tokenize_train_batch(examples):
    """Tokenize one batch of training rows with the shared tokenizer and length caps."""
    return batch_tokenize_preprocess(
        examples, tokenizer, encoder_max_length, decoder_max_length
    )


# Tokenize the training split in batches, dropping its original columns.
batch_train_data = converted_train_data.map(
    _tokenize_train_batch,
    remove_columns=converted_train_data.column_names,
    batched=True,
)

Map:   0%|          | 0/7208 [00:00<?, ? examples/s]

In [10]:
# Tokenize the validation split in batches.
# The columns to drop are read from the validation dataset itself, so this
# cell does not rely on the training split having the same column layout.
batch_validation_data = converted_val_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_val_data.column_names,
)

Map:   0%|          | 0/792 [00:00<?, ? examples/s]

In [11]:
# Tokenize the test split in batches.
# The columns to drop are read from the test dataset itself, so this cell
# does not rely on the training split having the same column layout.
batch_test_data = converted_test_data.map(
    lambda batch: batch_tokenize_preprocess(
        batch, tokenizer, encoder_max_length, decoder_max_length
    ),
    batched=True,
    remove_columns=converted_test_data.column_names,
)

Map:   0%|          | 0/2717 [00:00<?, ? examples/s]

In [12]:
# Sentence-tokenizer data used by `nltk.sent_tokenize` in the ROUGE post-processing.
nltk.download("punkt", quiet=True)

# Load the ROUGE metric. The metric script is fetched and executed locally;
# opting in explicitly with `trust_remote_code=True` silences the FutureWarning
# emitted by `datasets` and matches what upcoming releases require.
metric = datasets.load_metric("rouge", trust_remote_code=True)

def postprocess_text(preds, labels):
    """Normalise decoded predictions and references before ROUGE scoring.

    Each text is stripped of surrounding whitespace and then re-joined with
    one sentence per line (sentences found via `nltk.sent_tokenize`).

    Args:
        preds: Iterable of decoded prediction strings, or None.
        labels: Iterable of decoded reference strings, or None.

    Returns:
        Tuple `(preds, labels)` of lists of processed strings. If either
        argument is None, a warning is printed and `([], [])` is returned.
    """
    if preds is None or labels is None:
        print("Warning: Either preds or labels is None.")
        return [], []

    def _strip_all(texts):
        return [text.strip() for text in texts]

    def _sentence_per_line(texts):
        # rougeLSum expects newline after each sentence
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = _strip_all(preds)
    stripped_labels = _strip_all(labels)

    return _sentence_per_line(stripped_preds), _sentence_per_line(stripped_labels)

  metric = datasets.load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [13]:
def compute_metrics(eval_preds):
    """Compute ROUGE scores and mean generation length for an evaluation pass.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. `preds` may be
            a tuple, in which case its first element is used.

    Returns:
        dict mapping each ROUGE key to its mid F-measure (scaled by 100) plus
        "gen_len", the mean number of non-pad tokens per prediction; all
        values are rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100 in the predictions as we can't decode them. Predictions can
    # be padded with -100 just like the labels; mapping those positions to the
    # pad token keeps decoding safe and keeps them out of the gen_len count.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)

    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Extract a few results from ROUGE
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Generation length = number of non-pad tokens in each prediction.
    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds
    ]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [14]:
# Hyper-parameters and bookkeeping for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Output / logging locations and reporting.
    output_dir="./0_no_augmentaions",
    logging_dir="logs",
    logging_steps=200,
    report_to="none",
    # Optimisation schedule.
    num_train_epochs=20,
    learning_rate=1e-5,
    warmup_ratio=0.2,
    weight_decay=1e-3,
    # Evaluate and save once per epoch; save_total_limit caps retained checkpoints.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    # Use generation during evaluation so compute_metrics receives generated ids.
    predict_with_generate=True,
)

# Collates tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Trainer wired to the tokenized train/validation splits and the ROUGE metrics.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=batch_train_data,
    eval_dataset=batch_validation_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Run fine-tuning with the settings in `training_args`; the returned
# TrainOutput is displayed as the cell's result.
trainer.train()

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.3614,0.225624,58.5701,34.638,50.726,50.726,13.6641
2,0.1761,0.123649,71.9697,60.6061,69.1604,69.1604,13.2336
3,0.0983,0.074503,79.3561,72.9798,78.4091,78.4091,13.1806
4,0.0779,0.068664,81.0606,75.1263,80.303,80.2399,13.2045
5,0.0641,0.060616,82.9297,71.633,82.0797,82.1677,12.4558
6,0.0487,0.067385,84.2623,74.0025,83.5205,83.5723,12.5682
7,0.034,0.062774,86.1742,75.4125,85.4595,85.4482,12.5404
8,0.0379,0.063019,85.5805,75.1557,85.089,85.1243,12.4369
9,0.0261,0.074421,87.0055,76.3468,86.532,86.5583,12.524
10,0.019,0.073447,86.798,75.202,86.2697,86.2659,12.3636


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams

TrainOutput(global_step=18020, training_loss=0.12972239907107397, metrics={'train_runtime': 2651.3384, 'train_samples_per_second': 54.373, 'train_steps_per_second': 6.797, 'total_flos': 1.09874509774848e+16, 'train_loss': 0.12972239907107397, 'epoch': 20.0})

In [16]:
def get_f1(key, prediction, none_id):
    """Micro precision, recall and F1 that ignore the `none_id` class.

    Args:
        key: NumPy array of gold label ids.
        prediction: NumPy array of predicted label ids, same shape as `key`.
        none_id: Label id treated as "no relation"; positions predicted as
            `none_id` are not counted as guesses and gold `none_id` positions
            are not counted as targets.

    Returns:
        Tuple `(precision, recall, f1)` of floats. Precision defaults to 1.0
        when nothing was guessed, recall to 1.0 when there is no gold
        relation, and F1 to 0.0 when precision + recall is 0.
    """
    guessed_mask = prediction != none_id
    gold_mask = key != none_id
    hit_mask = (key == prediction) & guessed_mask

    n_correct = int(hit_mask.sum())
    n_guessed = int(guessed_mask.sum())
    n_gold = int(gold_mask.sum())

    prec_micro = float(n_correct) / float(n_guessed) if n_guessed > 0 else 1.0
    recall_micro = float(n_correct) / float(n_gold) if n_gold > 0 else 1.0

    if prec_micro + recall_micro > 0.0:
        f1_micro = 2.0 * prec_micro * recall_micro / (prec_micro + recall_micro)
    else:
        f1_micro = 0.0

    return prec_micro, recall_micro, f1_micro

In [17]:
def get_score():
    """Score the trained model on the test split with micro P/R/F1.

    Generates predictions for `batch_test_data`, decodes both the gold labels
    and the generated sequences to text, maps every prediction that is not one
    of the gold label strings to 'Other', and evaluates with `get_f1` using
    'Other' as the ignored class.

    Returns:
        Tuple `(P, R, F1)` as returned by `get_f1`.
    """
    pred = trainer.predict(
        batch_test_data, metric_key_prefix="predict", max_length=32, num_beams=4
    )

    predictions = pred[0]
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Generated ids can be padded with -100, which the tokenizer cannot decode.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)

    label_seq2seq = []
    pred_seq2seq = []

    print('start generate pred_seq2seq')

    # Wrapping the dataset (not the enumerate) lets tqdm show a total.
    for k, d in enumerate(tqdm(batch_test_data)):
        tt = d['labels']
        # Labels are right-padded with -100; keep only the leading real tokens.
        n_label_tokens = np.sum(np.array(tt) != -100)
        temp_label = tokenizer.decode(tt[:n_label_tokens], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        temp_pred = tokenizer.decode(predictions[k], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        label_seq2seq.append(temp_label)
        pred_seq2seq.append(temp_pred)

    print('*****finish predict*****')

    # Set lookup: O(1) per prediction instead of scanning the whole label list.
    known_labels = set(label_seq2seq)
    pred_seq2seq = [x if x in known_labels else 'Other' for x in pred_seq2seq]

    df = pd.DataFrame({'label': label_seq2seq, 'pred': pred_seq2seq})
    print('*****finish df*****')

    lb = LabelEncoder()
    # Always include 'Other' so the fallback class can be encoded even when
    # it never occurs among the gold labels.
    lb.fit(list(df['label']) + ['Other'])
    label_lb = lb.transform(list(df['label']))
    pred_lb = lb.transform(list(df['pred']))

    print('*****finish encode*****')

    P, R, F1 = get_f1(label_lb, pred_lb, lb.transform(['Other'])[0])

    return P, R, F1

In [18]:
# Evaluate on the test split and report micro precision / recall / F1
# (the 'Other' class is excluded inside get_score via get_f1).
P, R, F1 = get_score()
print('{}:, P:{}, R:{}, F1 :{} \n'.format('score', P, R, F1))

start generate pred_seq2seq


2717it [00:00, 4942.89it/s]

*****finish predict*****
*****finish df*****
*****finish encode*****
score:, P:0.855893377759267, R:0.9080866106937693, F1 :0.8812178387650085 






In [10]:
# Authenticate with the Hugging Face Hub before pushing; in this notebook
# `login()` rendered a widget, so no token is hardcoded here.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Reload the fine-tuned checkpoint from disk and publish its weights to the Hub.
trained_model_path = "../models/no_augmentations/checkpoint-18020/"
model = AutoModelForSeq2SeqLM.from_pretrained(trained_model_path)
# `push_to_hub` uploads the in-memory model loaded above; it has no
# `model_path` parameter, so that stray keyword has been dropped.
model.push_to_hub("Mazin100/no_augmentation_relation_extractoin")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mazin100/no_augmentation_relation_extractoin/commit/3dda9a74d94ca2f98a98a2e805125a1fe3d3b6ec', commit_message='Upload BartForConditionalGeneration', commit_description='', oid='3dda9a74d94ca2f98a98a2e805125a1fe3d3b6ec', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
# Reload the tokenizer saved with the checkpoint and publish it to the same Hub repo.
# NOTE: relies on `trained_model_path` from the model-upload cell.
tokenizer = AutoTokenizer.from_pretrained(trained_model_path)
# `push_to_hub` uploads the in-memory tokenizer; it has no `model_path`
# parameter, so that stray keyword has been dropped.
tokenizer.push_to_hub("Mazin100/no_augmentation_relation_extractoin")

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Mazin100/no_augmentation_relation_extractoin/commit/ec7923c6c43b1a5c1055e8b85ffca6afa8649b54', commit_message='Upload tokenizer', commit_description='', oid='ec7923c6c43b1a5c1055e8b85ffca6afa8649b54', pr_url=None, pr_revision=None, pr_num=None)