In [3]:
import torch
import os
import evaluate
import pandas as pd
import numpy as np

# Report whether CUDA is usable, then choose the torch device that the rest
# of the notebook moves the model to.
cuda_ready = torch.cuda.is_available()
if not cuda_ready:
    print("GPU is not enabled.")
else:
    print("GPU is enabled.")
    print("device count: {}, current device: {}".format(torch.cuda.device_count(), torch.cuda.current_device()))

device = torch.device("cuda") if cuda_ready else torch.device("cpu")

from datasets import load_dataset, DatasetDict, Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer
from nltk.tokenize import RegexpTokenizer


GPU is enabled.
device count: 1, current device: 0


In [2]:
# Identifier of the pretrained checkpoint that every from_pretrained() call uses.
model_checkpoint = 'google/flan-t5-large'
# Text after the last "/" of the checkpoint id; reused in output directory names.
model_code = model_checkpoint.rsplit("/", 1)[-1]
# Scorer object loaded by name through the `evaluate` library.
metric = evaluate.load("meteor")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jwilder\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\jwilder\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jwilder\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Keyword overrides applied on top of the checkpoint's stored configuration.
# NOTE(review): these names look like text-generation settings (length limit,
# beam search) — confirm they are picked up where intended.
config_overrides = {
    "max_length": 128,
    "length_penalty": 0.6,
    "no_repeat_ngram_size": 3,
    "num_beams": 16,
}
config = AutoConfig.from_pretrained(model_checkpoint, **config_overrides)

# Load the seq2seq weights with that config, then move them to the chosen device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint, config=config)
model = model.to(device)

# Collator that is later handed to the trainer; it returns PyTorch tensors.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="pt")

print(model_checkpoint)

google/flan-t5-large


In [None]:
# Read the training and development CSV files into DataFrames.
train_data, val_data = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train-eng.csv", "data/dev-eng.csv")
)

In [5]:
# Show the "post" text stored under index label 0.
print(train_data["post"].loc[0])

Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC Lieutenant Retired General Asif Mumtaz appointed as Chairman Pakistan Medical Commission PMC None


In [6]:
def preprocess_data(df, seed=None):
    """Wrap every post in the claim-extraction prompt and shuffle the rows.

    Args:
        df: DataFrame with a "post" column. It is left unmodified.
        seed: Optional value forwarded to ``DataFrame.sample(random_state=...)``
            so the shuffle can be reproduced. The default ``None`` keeps the
            shuffle unseeded.

    Returns:
        A new DataFrame with the same columns as ``df``, the prompt applied to
        every value of "post", rows in shuffled order and a fresh 0..n-1 index.
    """

    def build_prompt(post):
        # f-string formatting converts non-string values (e.g. NaN) with str().
        return (
            "Please read the following social media post and extract the claim made within it. "
            "Normalize the claim by rephrasing it in a clear and concise manner.\n\n"
            f"Post: {post}\n\nExtracted Claim:"
        )

    # Build the prompted column from `df` itself. The previous loop iterated
    # over the global `train_data` and wrote into the row objects yielded by
    # iterrows(), which pandas does not guarantee to write back, so the frame
    # returned here could come back without the prompt.
    prompted = df.assign(post=df["post"].map(build_prompt))
    return prompted.sample(frac=1, random_state=seed).reset_index(drop=True)

In [7]:
# Run the prompt/shuffle step on each split, one statement at a time so the
# training frame is rebound before the validation frame is processed.
train_data = preprocess_data(train_data)
val_data = preprocess_data(val_data)

# Convert both frames to `Dataset` objects and group them by split name.
ds = DatasetDict(
    {
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in (("train", train_data), ("validation", val_data))
    }
)

In [8]:
# Show the "post" text stored under index label 2 after preprocessing.
print(train_data["post"].loc[2])

Please read the following social media post and extract the claim made within it. Normalize the claim by rephrasing it in a clear and concise manner.

Post: Dr. Kafeel is released and reached home,
ignore the rumors

Extracted Claim:


In [9]:
def tokenize_sample_data(data):
    """Tokenize the "post" and "normalized claim" fields of `data`.

    Posts are truncated to at most 1024 tokens and claims to at most 128.

    Returns:
        Dict with "input_ids" and "attention_mask" taken from the tokenized
        posts and "labels" holding the token ids of the tokenized claims.
    """
    encoded_posts = tokenizer(data["post"], max_length=1024, truncation=True)
    encoded_claims = tokenizer(data["normalized claim"], max_length=128, truncation=True)

    features = {key: encoded_posts[key] for key in ("input_ids", "attention_mask")}
    features["labels"] = encoded_claims["input_ids"]
    return features


# Tokenize every split and drop the raw text columns.
# `batch_size` is left at the library default instead of 1: with batched=True
# a batch size of 1 calls the tokenizer once per row, which removes the benefit
# of batching. tokenize_sample_data tokenizes without padding, so each
# example's encoding does not depend on which other rows share its batch.
tokenized_ds = ds.map(
    tokenize_sample_data,
    remove_columns=["normalized claim", "post"],
    batched=True,
)


def tokenize_sentence(arg):
    """Encode `arg` with the tokenizer and return the matching token strings."""
    token_ids = tokenizer(arg)["input_ids"]
    return tokenizer.convert_ids_to_tokens(token_ids)

def metrics_func(eval_arg):
    """Decode predictions and labels and score them with the loaded metric.

    Args:
        eval_arg: Pair ``(predictions, label_ids)`` of token-id arrays. The
            predictions may also arrive wrapped in a tuple, in which case the
            first element is used.

    Returns:
        Whatever ``metric.compute`` returns for the decoded texts.
    """
    preds, labels = eval_arg
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks ignored positions and is not a valid token id, so swap it for
    # the pad token before decoding. This is done for the predictions too, in
    # case they were padded with -100 upstream.
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    # Convert token ids to plain text.
    text_preds = [p.strip() for p in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    text_labels = [l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    # Score the decoded text directly. The previous version appended the
    # Japanese full stop "。" and re-split sentences on Japanese punctuation,
    # which only adds a stray token to English text. It also passed a
    # `tokenizer=` argument, which the METEOR metric in `evaluate` does not
    # accept (its options are alpha, beta and gamma).
    return metric.compute(predictions=text_preds, references=text_labels)

Map:   0%|          | 0/11374 [00:00<?, ? examples/s]

Map:   0%|          | 0/1171 [00:00<?, ? examples/s]

In [10]:
# Training hyper-parameters. One sample per device step with 16 accumulation
# steps means each optimizer update covers 16 samples per device.
# NOTE(review): no evaluation strategy is set here, so `eval_steps` appears to
# have no effect (the logged table only shows training loss). Confirm whether
# periodic evaluation was intended; `metrics_func` would run at that point.
training_args = Seq2SeqTrainingArguments(
    output_dir=f"saved-models-{model_code}",
    num_train_epochs=10,
    learning_rate=3e-4,
    lr_scheduler_type="linear",
    warmup_ratio=0.1,
    optim="adamw_torch",
    weight_decay=0.01,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=16,
    eval_steps=100,
    predict_with_generate=True,
    generation_max_length=128,
    logging_steps=10,
    push_to_hub=False,
)

# `processing_class` replaces the deprecated `tokenizer` keyword of the
# Trainer classes (transformers >= 4.46), which otherwise emits a
# FutureWarning at this call.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=metrics_func,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    processing_class=tokenizer,
)

trainer.train()

os.makedirs(f"{model_code}/finetuned_{model_code}", exist_ok=True)

# If the trained model is wrapped (exposes `.module`), save the inner model.
model_to_save = getattr(trainer.model, "module", trainer.model)
model_to_save.save_pretrained(f"./{model_code}/finetuned_{model_code}")

print("Training done")

  trainer = Seq2SeqTrainer(
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
10,2.9912
20,2.7745
30,2.5491
40,2.718
50,2.5743
60,2.5583
70,2.3792
80,2.4775
90,2.3121
100,2.4717




Training done


In [5]:
# Load the fine-tuned weights from the directory written after training, and
# the tokenizer from the base checkpoint.
finetuned_dir = f"./{model_code}/finetuned_{model_code}"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(finetuned_dir)

In [None]:
input_text = "Ukrainian citizens destroy 2 Russian tanks with molotov cocktails(Petrol, Gasoline) in Kiev. They are using the guerrilla war tactics to counter russian army in their capital. #Ukraine #UkraineUnderAttack #Kiev #RussiaUkraineWar #Kyiv Ukrainian citizens destroy 2 Russian tanks with molotov cocktails(Petrol, Gasoline) in Kiev. They are using the guerrilla war tactics to counter russian army in their capital. #Ukraine #UkraineUnderAttack #Kiev #RussiaUkraineWar #Kyiv Ukrainian citizens destroy 2 Russian tanks with molotov cocktails(Petrol, Gasoline) in Kiev. They are using the guerrilla war tactics to counter russian army in their capital. #Ukraine #UkraineUnderAttack #Kiev #RussiaUkraineWar #Kyiv None"

# Wrap the post in the same instruction text that preprocess_data builds for
# the training posts, so inference sees the input format used in fine-tuning.
# Keep this string in sync with preprocess_data.
prompt_text = (
    "Please read the following social media post and extract the claim made within it. "
    "Normalize the claim by rephrasing it in a clear and concise manner.\n\n"
    f"Post: {input_text}\n\nExtracted Claim:"
)

# Tokenize with the 1024-token input limit used in tokenize_sample_data (the
# previous limit of 128 cut long posts short), then move the tensors to the
# device the model lives on.
inputs = tokenizer(prompt_text, return_tensors="pt", padding=True, truncation=True, max_length=1024)
inputs = inputs.to(model.device)

model.eval()  # Set the model to evaluation mode
with torch.no_grad():  # Disable gradient calculation
    # Unpacking `inputs` passes the attention mask along with the input ids.
    generated_ids = model.generate(**inputs, max_length=128, num_beams=5, early_stopping=True)

# Decode the first (only) generated sequence, dropping special tokens.
output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

# Print the Output
print(f"Generated Output: {output_text}")

Generated Output: Video shows Ukrainians destroying Russian tanks amid ongoing Ukraine-Russia conflict
