# UGP project

#### Author - Michał Dulski - 434678
#### Dataset - ag-news

## Installing Dependencies

In [76]:
%pip install 'transformers[torch]==4.34.1' tokenizers==0.14.1 sentencepiece==0.1.99 datasets==2.14.7 evaluate==0.4.1 sacrebleu==2.3.2 scikit-learn==1.3.1 protobuf==3.20.3

46947.78s - pydevd: Sending message related to process being replaced timed-out after 5 seconds


Note: you may need to restart the kernel to use updated packages.


## Dataset preparation

In [55]:
#!/usr/bin/env python3

import json
import logging
from pathlib import Path

from datasets import load_dataset

LOGGER = logging.getLogger(__name__)

LABEL_MAP = {
    0: "World",
    1: "Sports",
    2: "Business",
    3: "Sci/Tech"
}

# ag-news train split size: 120k
# ag-news test split size: ~7.6k
# The training split can be reduced to 5k, but the test split has to be split
# into ~3.8k validation and ~3.8k test; therefore we reduce the training
# split to 5k and reduce the validation and test splits to 3k each.

def reduce_and_save_train_dataset(file_path: Path, data_to_save: list[dict]) -> None:
    """Write a reduced subset of the training records next to ``file_path``.

    Records are grouped by label, which may be either the integer class id
    (0-3) or the matching ``LABEL_MAP`` name. The first 1250 records of each
    class are kept, in class order, and written as JSON lines to
    ``<stem>-5k.json`` in the directory of ``file_path``. Records whose label
    matches none of the four classes are left out.

    Args:
        file_path: Path of the full split file; only its parent directory and
            stem are used to build the output path.
        data_to_save: Records that each have a ``"label"`` key.
    """
    per_class_limit = 1250  # 5000 / 4
    buckets = {class_id: [] for class_id in (0, 1, 2, 3)}

    for record in data_to_save:
        record_label = record["label"]
        for class_id, bucket in buckets.items():
            # Accept both the raw integer id and its translated name.
            if record_label == class_id or record_label == LABEL_MAP[class_id]:
                bucket.append(record)
                break

    subset = [
        record
        for bucket in buckets.values()
        for record in bucket[:per_class_limit]
    ]

    target_path = file_path.parent / f"{file_path.stem}-5k.json"
    with open(target_path, "wt") as sink:
        for record in subset:
            sink.write(f"{json.dumps(record)}\n")

def save_train_with_label_mapping(original_path: Path, data_to_save: list[dict]) -> None:
    """Write the training records with text labels, plus their reduced variant.

    Each record's ``"label"`` is translated through ``LABEL_MAP`` and the
    result is written as JSON lines to ``s2s-<name>`` next to
    ``original_path``. The translated records are then passed to
    ``reduce_and_save_train_dataset`` together with the new file path.

    The input records are not modified: translation happens on shallow
    copies, so the caller's data keeps its original labels and the function
    can be called again on the same list.

    Args:
        original_path: Path of the original split file; the output is placed
            in the same directory with an ``s2s-`` name prefix.
        data_to_save: Records whose ``"label"`` is a key of ``LABEL_MAP``.

    Raises:
        KeyError: If a record's label is not a key of ``LABEL_MAP``.
    """
    file_path = original_path.parent / f"s2s-{original_path.name}"

    # Log a summary instead of the records themselves; dumping the whole
    # split into a single log message is not useful.
    LOGGER.info("Writing %d label-mapped records to %s", len(data_to_save), file_path)

    # ``{**record, "label": ...}`` keeps the key order of the source record,
    # so the serialized lines look the same as with in-place replacement.
    translated = [
        {**data_line, "label": LABEL_MAP[data_line["label"]]}
        for data_line in data_to_save
    ]

    with open(file_path, "wt", encoding="utf-8") as f_write:
        for data_line in translated:
            f_write.write(f"{json.dumps(data_line)}\n")

    reduce_and_save_train_dataset(file_path, translated)

def reduce_and_save_valid_test_dataset(file_path: Path, data_to_save: list[dict]) -> None:
    """Write a reduced subset of a validation/test split next to ``file_path``.

    Every record is assigned to one of the four classes by its label, given
    either as the integer id (0-3) or as the corresponding ``LABEL_MAP``
    name. At most 750 records per class are kept, class 0 first, and the
    result is written as JSON lines to ``<stem>-3k.json`` beside
    ``file_path``. Records matching no class are dropped.

    Args:
        file_path: Path of the full split file; its parent directory and stem
            determine the output location.
        data_to_save: Records that each have a ``"label"`` key.
    """
    rows_per_class = 750  # 3000 / 4
    class_ids = (0, 1, 2, 3)
    grouped = [[] for _ in class_ids]

    for row in data_to_save:
        row_label = row["label"]
        # First class whose id or name equals the label, if any.
        matching = next(
            (cid for cid in class_ids if row_label == cid or row_label == LABEL_MAP[cid]),
            None,
        )
        if matching is not None:
            grouped[matching].append(row)

    trimmed = []
    for rows in grouped:
        trimmed += rows[:rows_per_class]

    out_path = file_path.parent / f"{file_path.stem}-3k.json"
    with open(out_path, "wt") as handle:
        handle.writelines(json.dumps(row) + "\n" for row in trimmed)

def save_valid_test_with_label_mapping(original_path: Path, data_to_save: list[dict]) -> None:
    """Write a validation/test split with text labels, plus its reduced variant.

    Each record's ``"label"`` is translated through ``LABEL_MAP`` and the
    result is written as JSON lines to ``s2s-<name>`` next to
    ``original_path``. The translated records are then passed to
    ``reduce_and_save_valid_test_dataset`` together with the new file path.

    The input records are not modified: translation happens on shallow
    copies, so the caller's data keeps its original labels and the function
    can be called again on the same list.

    Args:
        original_path: Path of the original split file; the output is placed
            in the same directory with an ``s2s-`` name prefix.
        data_to_save: Records whose ``"label"`` is a key of ``LABEL_MAP``.

    Raises:
        KeyError: If a record's label is not a key of ``LABEL_MAP``.
    """
    file_path = original_path.parent / f"s2s-{original_path.name}"

    # ``{**record, "label": ...}`` keeps the key order of the source record,
    # so the serialized lines look the same as with in-place replacement.
    translated = [
        {**data_line, "label": LABEL_MAP[data_line["label"]]}
        for data_line in data_to_save
    ]

    with open(file_path, "wt", encoding="utf-8") as f_write:
        for data_line in translated:
            f_write.write(f"{json.dumps(data_line)}\n")

    reduce_and_save_valid_test_dataset(file_path, translated)


def main() -> None:
    """Download ag_news and write every dataset variant under ``data/``.

    The dataset's train split is used as the training data. Its test split is
    grouped by class and each class is cut in half: the first halves form the
    validation split and the second halves form the test split. For each of
    the three splits the full JSON-lines file is written first, then the
    reduced and label-mapped (``s2s-``) variants are produced by the helper
    functions.
    """
    news_dataset = load_dataset("ag_news")

    save_path = Path("data/")
    save_train_path = save_path / "train.json"
    save_valid_path = save_path / "valid.json"
    save_test_path = save_path / "test.json"

    # exist_ok avoids the check-then-create race of a separate exists() test.
    save_path.mkdir(parents=True, exist_ok=True)

    data_train = [
        {"label": int(row["label"]), "text": row["text"]}
        for row in news_dataset["train"]
    ]
    held_out = [
        {"label": int(row["label"]), "text": row["text"]}
        for row in news_dataset["test"]
    ]

    # Group the held-out records by class so each class is halved separately.
    per_class = {class_id: [] for class_id in LABEL_MAP}
    for record in held_out:
        # Records with a label outside LABEL_MAP are skipped.
        if record["label"] in per_class:
            per_class[record["label"]].append(record)

    data_valid, data_test = [], []
    for class_records in per_class.values():
        midpoint = len(class_records) // 2
        data_valid.extend(class_records[:midpoint])
        data_test.extend(class_records[midpoint:])

    for file_path, data_to_save in [
        (save_train_path, data_train),
        (save_valid_path, data_valid),
        (save_test_path, data_test),
    ]:
        with open(file_path, "wt", encoding="utf-8") as f_write:
            for data_line in data_to_save:
                f_write.write(f"{json.dumps(data_line)}\n")

        # The training split is reduced to 5k, the other two to 3k.
        if file_path == save_train_path:
            reduce_and_save_train_dataset(file_path, data_to_save)
            save_train_with_label_mapping(file_path, data_to_save)
        else:
            reduce_and_save_valid_test_dataset(file_path, data_to_save)
            save_valid_test_with_label_mapping(file_path, data_to_save)

# Run the dataset preparation only when this code is executed directly.
if __name__ == "__main__":
    main()

## GPU Info

In [46]:
!nvidia-smi

UsageError: Line magic function `%nvidia-smi` not found.


# Custom model and classification head can be found in custom_model.py file

# Encoder - RoBERTa
## Modifications

Model:
- custom classification head
    - dense layers
    - recurrent LSTM layer
    - multihead attention layer
    - normalization layer
    - ELU activation function
- frozen 2nd, 5th and 7th layers

Training arguments:
- --warmup_steps 60 # batch size 8, steps per epoch 5000/8=625, 625*10%=62.5, rounded down to 60
- --eval_steps 625 # evaluate model after each epoch

The model scheme, the frozen weights, and whether the hidden state was used are all reported in the training history (run_glue.py lines 421, 441-446).

In [23]:
!python run_glue.py \
  --weight_decay 1e-5 \
  --warmup_steps 60 \
  --cache_dir .cache_training \
  --model_name_or_path roberta-base \
  --custom_model roberta_custom \
  --train_file data/train-5k.json  \
  --validation_file data/valid-3k.json \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --save_strategy steps \
  --save_steps 625 \
  --save_total_limit 5 \
  --logging_strategy steps \
  --logging_steps 50 \
  --eval_steps 625 \
  --evaluation_strategy steps \
  --metric_for_best_model 'accuracy' \
  --greater_is_better 'True' \
  --load_best_model_at_end 'True' \
  --report_to 'none' \
  --output_dir out/ag-news/roberta_custom

01/14/2024 19:21:24 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=625,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_leng

# Decoder GPT2
## Modifications

Model:
- custom classification head
    - new dense layer
    - normalization layer
    - PReLU activation function
- frozen 2nd, 4th and 7th layers

Training arguments:
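- --warmup_steps 60 # batch size 8, steps per epoch 5000/8=625, 625*10%=62.5, rounded down to 60
- --eval_steps 625 # evaluate model after each epoch (note: the command below actually passes --eval_steps 1000 and --save_steps 1000, which is more than the 625 steps of the single training epoch)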

The model scheme, the frozen weights, and whether the hidden state was used are all reported in the training history (run_glue.py lines 421, 441-446).

In [24]:
!python run_glue.py \
  --weight_decay 1e-5 \
  --warmup_steps 60 \
  --cache_dir .cache_training \
  --model_name_or_path gpt2 \
  --custom_model gpt2_custom \
  --train_file data/train-5k.json  \
  --validation_file data/valid-3k.json \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --do_train \
  --do_eval \
  --max_seq_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 1 \
  --save_strategy steps \
  --save_steps 1000 \
  --save_total_limit 5 \
  --logging_strategy steps \
  --logging_steps 50 \
  --eval_steps 1000 \
  --evaluation_strategy steps \
  --metric_for_best_model 'accuracy' \
  --greater_is_better 'True' \
  --load_best_model_at_end 'True' \
  --report_to 'none' \
  --output_dir out/ag-news/gpt2_custom

01/14/2024 19:28:36 - INFO - __main__ - Training/evaluation parameters TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=1000,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=True,
group_by_len

# Encoder-Decoder T5
## Modifications

Model:
- frozen layers 7-9

Training arguments:
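- --warmup_steps 60 # batch size 8, steps per epoch 5000/8=625, 625*10%=62.5, rounded down to 60 (note: the command below does not pass --warmup_steps)
- --eval_steps 625 # evaluate model after each epoch (note: the command below actually passes --eval_steps 1000 and --save_steps 1000, which is more than the 625 steps of the single training epoch)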

The model scheme and the frozen weights are reported in the training history (run_translation.py lines 432-437).

In [25]:
!python run_translation.py \
  --cache_dir .cache_training \
  --model_name_or_path "google/t5-v1_1-base" \
  --freeze_weights \
  --train_file data/s2s-train-5k.json \
  --validation_file data/s2s-valid-3k.json \
  --per_device_train_batch_size 8 \
  --per_device_eval_batch_size 8 \
  --source_lang "text" \
  --target_lang "label" \
  --source_prefix "ag-news classification" \
  --max_source_length 256 \
  --max_target_length 128 \
  --generation_max_length 128 \
  --do_train \
  --do_eval \
  --predict_with_generate \
  --num_train_epochs 1 \
  --save_strategy steps \
  --save_steps 1000 \
  --save_total_limit 5 \
  --logging_strategy steps \
  --logging_steps 50 \
  --eval_steps 1000 \
  --evaluation_strategy steps \
  --metric_for_best_model 'accuracy' \
  --greater_is_better 'True' \
  --load_best_model_at_end 'True' \
  --report_to 'none' \
  --output_dir out/ag-news/t5_v1_1-base

01/14/2024 19:32:56 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=1000,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
generation_config=None,
generation_max_length=128,
generation_num_beams=None,
gradient_acc

# Prompting FLAN-T5

In [146]:
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer
import torch

def get_pipeline(pipeline_type: str, model_name: str, model_type: str, torch_dtype: "torch.dtype | str" = "auto", device_map="cpu"):
    """Load a pretrained model with its tokenizer and wrap both in a pipeline.

    Args:
        pipeline_type: Task name passed as the first argument to ``pipeline``.
        model_name: Checkpoint name passed to ``from_pretrained`` for both
            the model and the tokenizer.
        model_type: Model family selector. Only ``'s2s'`` is supported; it
            loads the model through ``AutoModelForSeq2SeqLM``.
        torch_dtype: Forwarded to the model's ``from_pretrained``; defaults
            to the string ``"auto"``.
        device_map: Forwarded to the model's ``from_pretrained``.

    Returns:
        The object returned by ``pipeline(pipeline_type, model=..., tokenizer=...)``.

    Raises:
        ValueError: If ``model_type`` is not ``'s2s'``.
    """
    # Fail with a clear message instead of hitting an unbound local name
    # further down when an unknown model type is requested.
    if model_type != 's2s':
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected 's2s'"
        )

    class_type = AutoModelForSeq2SeqLM
    model = class_type.from_pretrained(model_name, low_cpu_mem_usage=True, torch_dtype=torch_dtype, device_map=device_map)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return pipeline(pipeline_type, model=model, tokenizer=tokenizer)

def generate_text_simple(model_pipeline, text: str, max_new_tokens: int = 20, is_prompt: bool = False):
    """Run ``model_pipeline`` on ``text`` without sampling and return the generated text.

    Args:
        model_pipeline: Callable invoked as
            ``model_pipeline(text, do_sample=False, max_new_tokens=...)``; the
            ``"generated_text"`` entry of its first result is used.
        text: Input passed to the pipeline.
        max_new_tokens: Forwarded to the pipeline call.
        is_prompt: When true and the generated text starts with ``text``,
            that leading copy of ``text`` is removed and the remainder is
            stripped of surrounding whitespace.

    Returns:
        The generated text, with the echoed prompt removed when applicable.
    """
    outputs = model_pipeline(text, do_sample=False, max_new_tokens=max_new_tokens)
    completion = outputs[0]["generated_text"]

    # Nothing to trim unless the caller asked for it and the prompt was echoed.
    if not is_prompt or not completion.startswith(text):
        return completion

    return completion[len(text):].strip()

# Build the FLAN-T5 text2text-generation pipeline on CPU.
lm_pipeline = get_pipeline('text2text-generation', 'google/flan-t5-base', 's2s', torch_dtype="auto", device_map="cpu")

labels = ["World", "Sports", "Business", "Sci/Tech"]

# The file holds one JSON record per line; "text" and "label" are read below.
with open("data/s2s-test.json", encoding="utf-8") as f:
    test_dataset = [json.loads(line) for line in f]

correct = 0
total = 0
# Use the real dataset size for the progress display instead of a fixed number.
num_samples = len(test_dataset)

for item in test_dataset:
    text = item["text"]
    true_label = item["label"]

    prompt = f"{text} This text is about {labels[0]}, {labels[1]}, {labels[2]}, or {labels[3]}?"

    predicted_label = generate_text_simple(lm_pipeline, prompt)
    print(f"{prompt}\n{total}/{num_samples}: {predicted_label}")
    # A prediction counts only when it equals the stored label exactly.
    if predicted_label == true_label:
        correct += 1
    total += 1

# Guard against an empty test file, which would otherwise divide by zero.
accuracy = correct / total if total else 0.0
print(f"Total number of samples: {total}")
print(f"Correct predictions: {correct}")
print(f"Prompting accuracy: {accuracy}")

Downloading generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 406kB/s]


Supreme Court to Review Inmate Freedom Law (AP) AP - The Supreme Court agreed Tuesday to consider the constitutionality of a federal law that requires state prisons to accommodate inmate religions, from Christianity to Satanism. This text is about World, Sports, Business, or Sci/Tech?
0/3800: World
7 U.S. Groups Ask U.N. for Vote Observers (AP) AP - Seven American activist groups asked the United Nations on Monday to provide international observers for next month's presidential election. This text is about World, Sports, Business, or Sci/Tech?
1/3800: World
US troops accelerate operations against Sunni insurgents _ US troops are on the offensive in Iraq ahead of the holy month of Ramadan, which is expected to start at the end of the week. The operations appear aimed at preventing a repeat of the  This text is about World, Sports, Business, or Sci/Tech?
2/3800: World
Iraqi forces raid Ramadi mosques US forces stepped up operations yesterday across a wide swath of the Sunni insurgent str

Zero-shot

In [14]:
from transformers import pipeline
import json

# NOTE(review): the recorded output of this cell warns that the
# classification head of google/flan-t5-base is newly initialized and that no
# 'entailment' label id could be determined, and every prediction shown is
# "Sci/Tech". The scores therefore appear to come from untrained weights;
# confirm the checkpoint choice before relying on the reported accuracy.
classifier = pipeline("zero-shot-classification", model='google/flan-t5-base')

labels = ["World", "Sports", "Business", "Sci/Tech"]

# The file holds one JSON record per line; "text" and "label" are read below.
with open("data/s2s-test.json", encoding="utf-8") as f:
    test_dataset = [json.loads(line) for line in f]

correct = 0
total = 0
# Use the real dataset size for the progress display instead of a fixed number.
num_samples = len(test_dataset)

for item in test_dataset:
    text = item["text"]
    true_label = item["label"]

    prediction = classifier(text, labels)

    # The predicted label is the label with the highest score
    predicted_label = prediction["labels"][0]

    print(f"{total}/{num_samples}\nInput: {text}\nPrediction {predicted_label}")
    if predicted_label == true_label:
        correct += 1
    total += 1

# Guard against an empty test file, which would otherwise divide by zero.
accuracy = correct / total if total else 0.0
print(f"Total number of samples: {total}")
print(f"Correct predictions: {correct}")
print(f"Zero-shot learning accuracy: {accuracy}")

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Supreme Court to Review Inmate Freedom Law (AP) AP - The Supreme Court agreed Tuesday to consider the constitutionality of a federal law that requires state prisons to accommodate inmate religions, from Christianity to Satanism.
0/3800: Sci/Tech
7 U.S. Groups Ask U.N. for Vote Observers (AP) AP - Seven American activist groups asked the United Nations on Monday to provide international observers for next month's presidential election.
1/3800: Sci/Tech
US troops accelerate operations against Sunni insurgents _ US troops are on the offensive in Iraq ahead of the holy month of Ramadan, which is expected to start at the end of the week. The operations appear aimed at preventing a repeat of the 
2/3800: Sci/Tech
Iraqi forces raid Ramadi mosques US forces stepped up operations yesterday across a wide swath of the Sunni insurgent strongholds northwest of the capital, pounding targets in three urban centers from the air and supporting Iraqi troops in raids on mosques suspected of harboring 
3/

KeyboardInterrupt: 