In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [11]:
import zipfile
import os
from datasets import Dataset,DatasetDict
from transformers import AutoTokenizer, TrainingArguments , Trainer, DataCollatorForSeq2Seq , AutoModelForSeq2SeqLM


In [3]:
def extraction(path,dest):
  with zipfile.ZipFile(path, 'r') as zip_ref:
      zip_ref.extractall(dest)

  print(os.listdir(dest))

In [4]:
extraction('Full_MILDSum_Dataset-20241105T105654Z-001.zip','dataset')

['Full_MILDSum_Dataset']


In [5]:
extraction('dataset/dataset/MILDSum_train_2185.zip','train')
extraction('dataset/dataset/MILDSum_test_468.zip','test')
extraction('dataset/dataset/MILDSum_val_469.zip','val')

['MILDSum_train_2185']
['MILDSum_test_468']
['MILDSum_val_469']


In [7]:
def dataset_sorter(path):
    judgement = []
    summary = []
    ids = []

    for root, _ , files in os.walk(path):

        if files:
            subdirectory_name = os.path.basename(root)
            #print(subdirectory_name)
            file_path1 = os.path.join(root,files[2])
            file_path2 = os.path.join(root,files[1])
            # print(file_path1)
            # print(file_path2)

            with open(file_path1,"r") as fp:
                content = fp.read()
                judgement.append(content)

            with open(file_path2,"r") as fp:
                content = fp.read()
                summary.append(content)

            ids.append(int(subdirectory_name))


    assert len(judgement) == len(summary), "Mismatch in articles and highlights"

    # Create a dictionary to store the data
    data_dict = {
        'judgement': judgement,
        'summary': summary,
        'id': ids
    }

    # Load data into a Hugging Face Dataset


    # Display dataset info
    return data_dict

In [8]:
d_train = dataset_sorter("train/MILDSum_train_2185")
d_test = dataset_sorter("test/MILDSum_test_468")
d_val = dataset_sorter("val/MILDSum_val_469")

In [9]:
len(d_train['judgement'])

2185

In [10]:
dataset = {}
dataset['train'] = d_train
dataset['test'] = d_test
dataset['validation'] = d_val

In [12]:
model_name = 'google-t5/t5-base'
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [13]:
tokenizer.model_max_length = 1024
model.config.max_length = 1024

In [14]:
def filter_and_reindex(data, model_name,dir,tokenizer, token_limit=1024):
    """
    Filters judgements and summaries under the token limit,
    appends them to new lists, and creates new IDs.

    Args:
        data (dict): The dataset containing judgements and summaries.
        model_name (str): The name of the pre-trained tokenizer.
        token_limit (int): The token limit (default: 4096).

    Returns:
        dict: A dictionary containing the filtered judgements, summaries, and new IDs.
    """
    filtered_judgements = []
    filtered_summaries = []
    new_ids = []
    current_id = 0

    for judgement, summary in zip(data[dir]['judgement'], data[dir]['summary']):
        judgement_tokens = len(tokenizer.encode(judgement, truncation=True, max_length=token_limit))
        summary_tokens = len(tokenizer.encode(summary, truncation=True, max_length=token_limit))

        if judgement_tokens < token_limit and summary_tokens < token_limit:
            filtered_judgements.append(judgement)
            filtered_summaries.append(summary)
            new_ids.append(current_id)
            current_id += 1

    return {
        'judgement': filtered_judgements,
        'summary': filtered_summaries,
        'id': new_ids
    }



In [16]:
filtered_data_train = filter_and_reindex(dataset, model_name,'train',tokenizer)

print(f"Number of filtered judgements: {len(filtered_data_train['judgement'])}")
print(f"Number of filtered summaries: {len(filtered_data_train['summary'])}")
print(f"New IDs: {filtered_data_train['id'][:10]}")  # Print first 10 new IDs

filtered_data_test = filter_and_reindex(dataset, model_name,'test',tokenizer)

print(f"Number of filtered judgements: {len(filtered_data_test['judgement'])}")
print(f"Number of filtered summaries: {len(filtered_data_test['summary'])}")
print(f"New IDs: {filtered_data_test['id'][:10]}")  # Print first 10 new IDs

filtered_data_val = filter_and_reindex(dataset, model_name,'validation',tokenizer)

print(f"Number of filtered judgements: {len(filtered_data_val['judgement'])}")
print(f"Number of filtered summaries: {len(filtered_data_val['summary'])}")
print(f"New IDs: {filtered_data_val['id'][:10]}")  # Print first 10 new IDs

Number of filtered judgements: 220
Number of filtered summaries: 220
New IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Number of filtered judgements: 46
Number of filtered summaries: 46
New IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Number of filtered judgements: 48
Number of filtered summaries: 48
New IDs: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]


In [17]:
# Convert each split into a Dataset
train_dataset = Dataset.from_dict(filtered_data_train)
test_dataset = Dataset.from_dict(filtered_data_test)
validation_dataset = Dataset.from_dict(filtered_data_val)

# Combine into a DatasetDict
dataset = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'validation': validation_dataset
})

In [18]:
def tokenize_function(examples):
    model_inputs = tokenizer(
        examples["judgement"],
        max_length=1024,
        padding="max_length",
        truncation=True
    )
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["summary"],
            max_length=1024,
            padding="max_length",
            truncation=True
        )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)


Map:   0%|          | 0/220 [00:00<?, ? examples/s]



Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/48 [00:00<?, ? examples/s]

In [19]:
training_args = TrainingArguments(
    output_dir="./T5-1024",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    weight_decay=0.01,
    push_to_hub=False,
    gradient_accumulation_steps=2,
    fp16=True,
    save_total_limit=1,  # Keep only the best checkpoint
    load_best_model_at_end=True,
    save_steps=1000,

)



In [20]:
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,

)

  trainer = Trainer(


In [21]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,No log,1.469747
2,No log,1.320234


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=110, training_loss=5.967700750177557, metrics={'train_runtime': 371.1191, 'train_samples_per_second': 1.186, 'train_steps_per_second': 0.296, 'total_flos': 535882943692800.0, 'train_loss': 5.967700750177557, 'epoch': 2.0})

In [22]:
trainer.save_model('t5base_1024_v1')

In [24]:
!tar -czvf model.tar.gz t5base_1024_v1

t5base_1024_v1/
t5base_1024_v1/spiece.model
t5base_1024_v1/training_args.bin
t5base_1024_v1/tokenizer.json
t5base_1024_v1/generation_config.json
t5base_1024_v1/config.json
t5base_1024_v1/tokenizer_config.json
t5base_1024_v1/special_tokens_map.json
t5base_1024_v1/model.safetensors
