In [None]:
import os
# Set TF_ENABLE_ONEDNN_OPTS to "0" for this process.
# NOTE(review): presumably meant to switch off TensorFlow's oneDNN custom ops, and
# presumably has to run before TensorFlow is first imported -- confirm.
os.environ.update({"TF_ENABLE_ONEDNN_OPTS": "0"})

In [None]:
# Shell escape: run `nvidia-smi` so the cell output records the GPU / driver used for this run.
!nvidia-smi

Thu Oct 31 19:25:00 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.116.04   Driver Version: 525.116.04   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA RTX A6000    Off  | 00000000:00:05.0 Off |                  Off |
| 30%   38C    P8    25W / 300W |      1MiB / 49140MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
import pandas as pd
# Load the training CSV into a DataFrame for inspection.
# NOTE(review): `df` is not referenced by any later cell shown here; the training
# cell re-reads the same file itself.
df = pd.read_csv(filepath_or_buffer="train_data.csv")

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
from datasets import Dataset

class PegasusDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized source texts with tokenized target summaries.

    Args:
        encodings: Mapping of feature name (e.g. ``input_ids``, ``attention_mask``)
            to a per-example sequence of values, as returned by calling the
            tokenizer on the source texts.
        labels: Mapping with at least an ``input_ids`` entry holding the tokenized
            targets. When it also carries ``attention_mask``, padded target
            positions are replaced with ``IGNORE_INDEX`` so they do not
            contribute to the training loss.
    """

    # Label value that the cross-entropy loss skips (its default ignore_index).
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return one example as a dict of tensors, including a ``labels`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

        label_ids = torch.tensor(self.labels['input_ids'][idx])
        if 'attention_mask' in self.labels:
            # Targets are padded to a common length; without masking, the model
            # would be trained to predict the padding tokens as well.
            is_real_token = torch.tensor(self.labels['attention_mask'][idx]).bool()
            label_ids = label_ids.masked_fill(~is_real_token, self.IGNORE_INDEX)

        item['labels'] = label_ids
        return item

    def __len__(self):
        """Number of examples, taken from the number of tokenized targets."""
        return len(self.labels['input_ids'])

def prepare_data(model_name,
                 train_texts, train_labels,
                 val_texts=None, val_labels=None,
                 max_input_length=10000, max_target_length=1000):
    """
    Tokenize source texts and target summaries for model fine-tuning.

    Args:
        model_name: Checkpoint name or path passed to ``PegasusTokenizer.from_pretrained``.
        train_texts: Iterable of source documents for training.
        train_labels: Iterable of reference summaries, aligned with ``train_texts``.
        val_texts: Optional iterable of validation source documents.
        val_labels: Optional iterable of validation summaries, aligned with ``val_texts``.
        max_input_length: Token limit applied (with truncation) to source documents.
        max_target_length: Token limit applied (with truncation) to summaries.

    Returns:
        ``(train_dataset, val_dataset, tokenizer)``. ``val_dataset`` is ``None``
        when no validation texts are given.

    Raises:
        ValueError: If a texts/labels pair does not have the same number of items,
            or if only one of ``val_texts`` / ``val_labels`` is provided.
    """
    tokenizer = PegasusTokenizer.from_pretrained(model_name)

    def as_list(values):
        # A bare string would otherwise be split into characters by list().
        return [values] if isinstance(values, str) else list(values)

    def tokenize_data(texts, labels):
        # Materialise as plain lists so pandas Series / dataset columns work too.
        texts, labels = as_list(texts), as_list(labels)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must be aligned, got {len(texts)} texts and {len(labels)} labels"
            )
        # padding=True pads every example to the longest one in this split.
        encodings = tokenizer(texts, max_length=max_input_length, truncation=True, padding=True)
        decodings = tokenizer(labels, max_length=max_target_length, truncation=True, padding=True)
        return PegasusDataset(encodings, decodings)

    train_dataset = tokenize_data(train_texts, train_labels)

    if (val_texts is None) != (val_labels is None):
        raise ValueError("val_texts and val_labels must be provided together")
    val_dataset = None if val_texts is None else tokenize_data(val_texts, val_labels)

    return train_dataset, val_dataset, tokenizer
# NOTE(review): `torch` is already imported at the top of this cell; this re-import is redundant.
import torch
# Runs once when this cell is executed (i.e. at definition time), not on each
# call to prepare_fine_tuning below.
torch.cuda.empty_cache()
def _load_seq2seq_model(model_name):
    """Load ``model_name`` with the architecture declared in the checkpoint's own config."""
    import warnings
    from transformers import AutoModelForSeq2SeqLM

    try:
        # A `pegasus_x` checkpoint loaded through PegasusForConditionalGeneration
        # drops its global-token weights (embed_global, global_self_attn_layer_norm),
        # so let the config pick the matching model class instead.
        return AutoModelForSeq2SeqLM.from_pretrained(model_name)
    except (KeyError, ValueError) as err:
        # Installed transformers does not know this model type: keep the previous
        # behaviour rather than failing, but make the mismatch visible.
        warnings.warn(
            f"Could not resolve an architecture for {model_name!r} ({err!r}); "
            "falling back to PegasusForConditionalGeneration."
        )
        return PegasusForConditionalGeneration.from_pretrained(model_name)


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='./results'):
    """
    Build the model, training configuration and ``Trainer`` for fine-tuning.

    Args:
        model_name: Checkpoint name or path of the base model.
        tokenizer: Tokenizer handed to the ``Trainer``.
        train_dataset: Dataset used for training.
        val_dataset: Optional dataset evaluated after every epoch. When ``None``,
            evaluation is disabled.
        freeze_encoder: If True, encoder parameters are excluded from training.
        output_dir: Directory for checkpoints written during training.

    Returns:
        A configured ``Trainer``; call ``.train()`` on it to start fine-tuning.
    """
    use_cuda = torch.cuda.is_available()
    torch_device = 'cuda' if use_cuda else 'cpu'
    model = _load_seq2seq_model(model_name).to(torch_device)

    if freeze_encoder:
        for param in model.get_encoder().parameters():
            param.requires_grad = False

    training_args = TrainingArguments(
        output_dir=output_dir,            # directory for checkpoints
        num_train_epochs=4,               # total number of training epochs
        per_device_train_batch_size=1,    # batch size per device during training
        per_device_eval_batch_size=1,     # batch size per device during evaluation
        gradient_accumulation_steps=16,   # accumulate 16 batches per optimizer step
        fp16=use_cuda,                    # mixed precision only when a GPU is present
        save_total_limit=1,               # keep at most one checkpoint on disk
        # Per-epoch evaluation needs an eval dataset; turn it off when none is given.
        evaluation_strategy='epoch' if val_dataset is not None else 'no',
        logging_dir='./logs',             # directory for logs
        logging_steps=100,                # log every 100 optimizer steps
    )

    trainer = Trainer(
        model=model,                      # the instantiated model to be trained
        args=training_args,               # training arguments
        train_dataset=train_dataset,      # training dataset
        eval_dataset=val_dataset,         # evaluation dataset (may be None)
        tokenizer=tokenizer
    )

    return trainer



In [None]:

import torch
torch.cuda.empty_cache()

if __name__=='__main__':
    # Column names in the CSV files -- adjust as needed.
    TEXT_COLUMN = 'Judgement'
    SUMMARY_COLUMN = 'Perspective-based Summary'
    # Must match (case-sensitively) the directory archived by the zip cell below.
    SAVE_DIR = "Pegasus_Def"

    # Load the training and validation splits.
    train_data = pd.read_csv("train_data.csv")
    val_data = pd.read_csv("val_data.csv")

    # Extract source texts and reference summaries as plain Python lists.
    train_texts = train_data[TEXT_COLUMN].tolist()
    train_labels = train_data[SUMMARY_COLUMN].tolist()
    val_texts = val_data[TEXT_COLUMN].tolist()
    val_labels = val_data[SUMMARY_COLUMN].tolist()

    # Use the PEGASUS-X base checkpoint as the starting point for fine-tuning.
    model_name = 'google/pegasus-x-base'
    train_dataset, val_dataset, tokenizer = prepare_data(model_name, train_texts, train_labels, val_texts, val_labels)
    trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset)
    torch.cuda.empty_cache()
    trainer.train()

    # Save the fine-tuned model and its tokenizer side by side.
    trainer.save_model(SAVE_DIR)
    tokenizer.save_pretrained(SAVE_DIR)


Downloading spiece.model:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/1.97k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.45k [00:00<?, ?B/s]

You are using a model of type pegasus_x to instantiate a model of type pegasus. This is not supported for all configurations of models and can yield errors.


Downloading pytorch_model.bin:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/pegasus-x-base were not used when initializing PegasusForConditionalGeneration: ['model.encoder.layers.5.global_self_attn_layer_norm.bias', 'model.encoder.layers.4.global_self_attn_layer_norm.bias', 'model.encoder.embed_global.weight', 'model.encoder.layers.8.global_self_attn_layer_norm.bias', 'model.encoder.layers.2.global_self_attn_layer_norm.weight', 'model.encoder.layers.4.global_self_attn_layer_norm.weight', 'model.encoder.layers.9.global_self_attn_layer_norm.bias', 'model.encoder.layers.7.global_self_attn_layer_norm.bias', 'model.encoder.layers.11.global_self_attn_layer_norm.weight', 'model.encoder.layers.3.global_self_attn_layer_norm.bias', 'model.encoder.layers.1.global_self_attn_layer_norm.weight', 'model.encoder.layers.3.global_self_attn_layer_norm.weight', 'model.encoder.layers.8.global_self_attn_layer_norm.weight', 'model.encoder.layers.0.global_self_attn_layer_norm.weight', 'model.encoder.layers.6.global_self_attn_layer_norm.b

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
0,1.4116,1.323182
1,1.1717,1.250078
2,1.1061,1.224673
3,1.0877,1.217358


***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1
***** Running Evaluation *****
  Num examples = 100
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to Pegasus_Def
Configuration saved in Pegasus_Def/config.json
Model weights saved in Pegasus_Def/pytorch_model.bin
tokenizer config file saved in Pegasus_Def/tokenizer_config.json
Special tokens file saved in Pegasus_Def/special_tokens_map.json
tokenizer config file saved in Pega

In [None]:
# Shell escape: bundle the saved model directory into a single archive for download.
# The directory name must match exactly (case-sensitive on Linux) the path passed to
# trainer.save_model / tokenizer.save_pretrained in the training cell.
!zip -r Pegasus_Def.zip Pegasus_Def

updating: Pegasus_Def/ (stored 0%)
updating: Pegasus_Def/pytorch_model.bin (deflated 7%)
updating: Pegasus_Def/training_args.bin (deflated 49%)
updating: Pegasus_Def/config.json (deflated 60%)
updating: Pegasus_Def/tokenizer_config.json (deflated 77%)
updating: Pegasus_Def/special_tokens_map.json (deflated 82%)
updating: Pegasus_Def/spiece.model (deflated 50%)
