In [None]:
#!pip install datasets transformers torch


Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.

In [None]:
from transformers import BartForConditionalGeneration, AutoTokenizer
from huggingface_hub import login



## Load Datasets from CSV Files

In [None]:
from datasets import load_dataset

def _load_csv_splits(prefix):
    """Load the train/validation/test CSV files named ``<prefix>_<split>.csv``."""
    split_files = {
        split: f"{prefix}_{split}.csv"
        for split in ("train", "validation", "test")
    }
    return load_dataset("csv", data_files=split_files)


# One dataset object per corpus: SAMSum, DialogSum and AMI (meeting transcripts).
samsum = _load_csv_splits("samsum")
dialogsum = _load_csv_splits("dialogsum")
ami = _load_csv_splits("ami")

# Show the raw column names of each corpus before preprocessing.
for corpus_name, corpus in (("SAMSum", samsum), ("DialogSum", dialogsum), ("AMI", ami)):
    print(f"{corpus_name} columns:", corpus["train"].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

SAMSum columns: ['id', 'dialogue', 'summary']
DialogSum columns: ['id', 'dialogue', 'summary', 'topic']
AMI columns: ['id', 'dialogue', 'summary']


## Preprocess Data

In [None]:
from transformers import BartTokenizer

# Tokenizer for the "facebook/bart-base" checkpoint; preprocess() below reads
# this module-level name to encode both the inputs and the summaries.
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

def preprocess(batch):
    """Tokenize a batch of source texts and summaries for seq2seq training.

    Args:
        batch: Mapping of column name to a list of values. The source text is
            read from ``"dialogue"`` when that key is present, otherwise from
            ``"transcript"``; the target text is read from ``"summary"``.

    Returns:
        The tokenizer output for the source texts (truncated/padded to 1024
        tokens) with an added ``"labels"`` entry holding the summary token ids
        (truncated/padded to 256 tokens). Padding positions in ``"labels"``
        are set to -100 so they do not contribute to the training loss.
    """
    text_column = "dialogue" if "dialogue" in batch else "transcript"

    # Falsy cells (e.g. None for an empty CSV field) become empty strings.
    inputs = [str(text) if text else "" for text in batch[text_column]]
    summaries = [str(text) if text else "" for text in batch["summary"]]

    # Tokenize in batch mode
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(summaries, max_length=256, truncation=True, padding="max_length")

    # The labels are padded to a fixed length; without masking, the model is
    # trained (and its loss averaged) on predicting pad tokens. -100 is the
    # index the loss ignores.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split of each corpus and drop the raw columns afterwards,
# leaving only the model-ready fields returned by preprocess().
shared_raw_columns = ["dialogue", "summary", "id"]
samsum = samsum.map(preprocess, batched=True, remove_columns=shared_raw_columns)
dialogsum = dialogsum.map(preprocess, batched=True, remove_columns=shared_raw_columns + ["topic"])
ami = ami.map(preprocess, batched=True, remove_columns=shared_raw_columns)

# Confirm which columns remain after preprocessing.
for corpus_name, corpus in (("SAMSum", samsum), ("DialogSum", dialogsum), ("AMI", ami)):
    print(f"{corpus_name} columns:", corpus["train"].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

SAMSum columns: ['input_ids', 'attention_mask', 'labels']
DialogSum columns: ['input_ids', 'attention_mask', 'labels']
AMI columns: ['input_ids', 'attention_mask', 'labels']


## Combine SAMSum + DialogSum (Phase 1 Training)

In [None]:
from datasets import concatenate_datasets

# Phase 1 trains on both dialogue corpora at once: stack the SAMSum rows and
# then the DialogSum rows for the train and validation splits.
combined_train = concatenate_datasets([corpus["train"] for corpus in (samsum, dialogsum)])
combined_val = concatenate_datasets([corpus["validation"] for corpus in (samsum, dialogsum)])

## Train in Phases

### Phase 1: SAMSum + DialogSum


In [None]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer

# Phase 1 starts from the pretrained facebook/bart-base weights.
model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")

# Hyperparameters for the SAMSum + DialogSum run.
training_args_phase1 = TrainingArguments(
    # Checkpoint location and experiment tracking
    output_dir="./results_phase1",
    report_to="wandb",
    run_name="BART-BASE-FINETUNING-PHASE1",
    logging_steps=100,
    # Optimisation settings
    learning_rate=3e-5,
    num_train_epochs=3,
    per_device_train_batch_size=8,
    fp16=True,
    # Evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    args=training_args_phase1,
    model=model,
    eval_dataset=combined_val,
    train_dataset=combined_train,
)
trainer.train()

# Persist the Phase 1 weights so Phase 2 can reload them from disk.
model.save_pretrained("./bart_finetuned_phase1")

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbercadesjohnrichard[0m ([33mbercadesjohnrichard-laguna-state-polytechnic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,0.1933,0.163717
2,0.165,0.158511
3,0.1441,0.157168




In [None]:
from safetensors.torch import load_file
import torch

# Read the tensors stored in the Phase 1 safetensors file and write them out
# again with torch.save as pytorch_model.bin in the same directory.
torch.save(
    load_file("bart_finetuned_phase1/model.safetensors"),
    "bart_finetuned_phase1/pytorch_model.bin",
)

# Re-save the in-memory model, then the tokenizer, into the Phase 1 directory.
# The tokenizer call stays last so its returned file paths are the cell output.
model.save_pretrained("./bart_finetuned_phase1")
tokenizer.save_pretrained("./bart_finetuned_phase1")


('./bart_finetuned_phase1/tokenizer_config.json',
 './bart_finetuned_phase1/special_tokens_map.json',
 './bart_finetuned_phase1/vocab.json',
 './bart_finetuned_phase1/merges.txt',
 './bart_finetuned_phase1/added_tokens.json')

In [None]:
# List the contents of the Phase 1 output directory with human-readable sizes.
!ls -lh bart_finetuned_phase1


total 1.1G
-rw-r--r-- 1 root root 1.7K Feb 21 07:50 config.json
-rw-r--r-- 1 root root  292 Feb 21 07:50 generation_config.json
-rw-r--r-- 1 root root 446K Feb 21 07:50 merges.txt
-rw-r--r-- 1 root root 533M Feb 21 07:50 model.safetensors
-rw-r--r-- 1 root root 533M Feb 21 07:50 pytorch_model.bin
-rw-r--r-- 1 root root  957 Feb 21 07:50 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Feb 21 07:50 tokenizer_config.json
-rw-r--r-- 1 root root 976K Feb 21 07:50 vocab.json


### Phase 2: AMI (Meetings)


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, TrainingArguments, Trainer

# Resume from the Phase 1 artifacts on disk: the fine-tuned weights and the
# tokenizer files saved in the same directory.
model = BartForConditionalGeneration.from_pretrained("./bart_finetuned_phase1")
tokenizer = BartTokenizer.from_pretrained("./bart_finetuned_phase1")

# Hyperparameters for the AMI meeting run.
training_args_phase2 = TrainingArguments(
    # Checkpoint location and experiment tracking
    output_dir="./results_phase2",
    report_to="wandb",
    run_name="BART-BASE-FINETUNING-PHASE2",
    logging_steps=25,  # log more often than in Phase 1
    # Optimisation settings
    learning_rate=3e-6,  # lower learning rate than Phase 1
    num_train_epochs=10,  # more epochs than Phase 1
    per_device_train_batch_size=4,  # smaller per-device batch to limit memory use
    gradient_accumulation_steps=2,  # 4 x 2 = effective batch of 8 per device
    fp16=True,
    # Evaluate and checkpoint every epoch; keep at most two checkpoints and
    # reload the best one when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args_phase2,
    model=model,
    eval_dataset=ami["validation"],
    train_dataset=ami["train"],
)
trainer.train()



Epoch,Training Loss,Validation Loss
1,2.7823,2.301549
2,2.6376,2.286858
3,2.6655,2.277011
4,2.6249,2.267263
5,2.7067,2.260145
6,2.5385,2.250039
7,2.6149,2.245305
8,2.5474,2.243803
9,2.5851,2.242277


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./bart_finetuned_phase2/tokenizer_config.json',
 './bart_finetuned_phase2/special_tokens_map.json',
 './bart_finetuned_phase2/vocab.json',
 './bart_finetuned_phase2/merges.txt',
 './bart_finetuned_phase2/added_tokens.json')

In [None]:
# Save the final Phase 2 model first. The conversion below reads
# model.safetensors from this directory, and no earlier cell shown in this
# notebook writes it, so loading before saving fails on a fresh run.
model.save_pretrained("./bart_finetuned_phase2")

# Read the tensors back from the safetensors file and write them out again
# with torch.save as pytorch_model.bin in the same directory.
state_dict = load_file("bart_finetuned_phase2/model.safetensors")
torch.save(state_dict, "bart_finetuned_phase2/pytorch_model.bin")

# Save the tokenizer next to the weights. Kept as the last expression so the
# written file paths are displayed as the cell output.
tokenizer.save_pretrained("./bart_finetuned_phase2")


('./bart_finetuned_phase2/tokenizer_config.json',
 './bart_finetuned_phase2/special_tokens_map.json',
 './bart_finetuned_phase2/vocab.json',
 './bart_finetuned_phase2/merges.txt',
 './bart_finetuned_phase2/added_tokens.json')

## Save/Load Model

In [None]:
# Push model & tokenizer
model.push_to_hub("csb05/Discussion-Summary-Finetuned-B.A.R.T.-Base")
tokenizer.push_to_hub("csb05/Discussion-Summary-Finetuned-B.A.R.T.-Base")

print("Model & tokenizer pushed to Hugging Face Hub!")

save_path = "/content/drive/MyDrive/bart-discussion-summarizer"

# Save model & tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")

# Zip the model directory in Google Drive
!zip -r /content/bart-discussion-summarizer.zip /content/drive/MyDrive/bart-discussion-summarizer

# Download the zipped model to your local computer
from google.colab import files
files.download("/content/bart-discussion-summarizer.zip")


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

Model & tokenizer pushed to Hugging Face Hub!
Model saved to /content/drive/MyDrive/bart-discussion-summarizer
  adding: content/drive/MyDrive/bart-discussion-summarizer/ (stored 0%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/config.json (deflated 63%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/generation_config.json (deflated 47%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/tokenizer_config.json (deflated 75%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/special_tokens_map.json (deflated 85%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/vocab.json (deflated 59%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/merges.txt (deflated 53%)
  adding: content/drive/MyDrive/bart-discussion-summarizer/tokenizer.json (deflated 82%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>