In [None]:
# Install the libraries this notebook imports. `%pip` installs into the running
# kernel's environment, whereas `!pip` runs in a subshell that may use a different one.
%pip install -q datasets transformers torch




In [None]:
from transformers import BartForConditionalGeneration, AutoTokenizer
from huggingface_hub import login



## Load Datasets from CSV Files

In [None]:
from datasets import load_dataset


def load_csv_splits(prefix):
    """Load the train/validation/test CSV files of one corpus.

    Args:
        prefix: File-name stem of the corpus. The files read are
            ``{prefix}_train.csv``, ``{prefix}_validation.csv`` and
            ``{prefix}_test.csv``, resolved relative to the working directory.

    Returns:
        The object returned by ``load_dataset("csv", ...)``, indexable by the
        split names ``"train"``, ``"validation"`` and ``"test"``.
    """
    split_files = {
        split: f"{prefix}_{split}.csv"
        for split in ("train", "validation", "test")
    }
    return load_dataset("csv", data_files=split_files)


# One call per corpus replaces four copy-pasted load_dataset blocks.
xsum = load_csv_splits("xsum")
samsum = load_csv_splits("samsum")
dialogsum = load_csv_splits("dialogsum")
ami = load_csv_splits("ami")  # meeting transcripts

# Print the column names so schema differences between corpora are visible
# before preprocessing.
print("XSum columns:", xsum["train"].column_names)
print("SAMSum columns:", samsum["train"].column_names)
print("DialogSum columns:", dialogsum["train"].column_names)
print("AMI columns:", ami["train"].column_names)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

XSum columns: ['id', 'dialogue', 'summary']
SAMSum columns: ['id', 'dialogue', 'summary']
DialogSum columns: ['id', 'dialogue', 'summary', 'topic']
AMI columns: ['id', 'dialogue', 'summary']


## Preprocess Data

In [None]:
from transformers import BartTokenizer

tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Label value that the loss skips; used to blank out padded target positions.
IGNORE_INDEX = -100


def preprocess(batch):
    """Tokenize a batch of source texts and summaries for seq2seq training.

    Args:
        batch: Mapping of column name to a list of values, as passed by
            ``Dataset.map(..., batched=True)``. The source text is read from
            ``"dialogue"`` when that column is present, otherwise from
            ``"transcript"``; targets are read from ``"summary"``.

    Returns:
        The tokenizer output for the source texts (padded/truncated to 1024
        tokens) with an added ``"labels"`` entry holding the summary token ids
        (padded/truncated to 256 tokens), where every padding position is
        replaced by ``IGNORE_INDEX``.
    """
    text_column = "dialogue" if "dialogue" in batch else "transcript"

    # Missing/empty cells become "" and everything else is coerced to str, so
    # the tokenizer always receives strings.
    inputs = [str(text) if text else "" for text in batch[text_column]]
    summaries = [str(text) if text else "" for text in batch["summary"]]

    # Tokenize in batch mode
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True, padding="max_length")
    labels = tokenizer(summaries, max_length=256, truncation=True, padding="max_length")

    # Targets are padded to a fixed length; without masking, the loss would
    # also be computed on the padding positions.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else IGNORE_INDEX for token_id in sequence]
        for sequence in labels["input_ids"]
    ]
    return model_inputs

# Tokenize every split of every corpus and drop the raw columns once they
# have been encoded.
raw_columns = ["dialogue", "summary", "id"]
xsum = xsum.map(preprocess, batched=True, remove_columns=raw_columns)
samsum = samsum.map(preprocess, batched=True, remove_columns=raw_columns)
# DialogSum carries one extra column that also has to go.
dialogsum = dialogsum.map(preprocess, batched=True, remove_columns=raw_columns + ["topic"])
ami = ami.map(preprocess, batched=True, remove_columns=raw_columns)

# Report the columns that remain in each training split after mapping.
for heading, corpus in (
    ("XSum columns:", xsum),
    ("SAMSum columns:", samsum),
    ("DialogSum columns:", dialogsum),
    ("AMI columns:", ami),
):
    print(heading, corpus["train"].column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Map:   0%|          | 0/204045 [00:00<?, ? examples/s]

Map:   0%|          | 0/11332 [00:00<?, ? examples/s]

Map:   0%|          | 0/11334 [00:00<?, ? examples/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/42 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

XSum columns: ['input_ids', 'attention_mask', 'labels']
SAMSum columns: ['input_ids', 'attention_mask', 'labels']
DialogSum columns: ['input_ids', 'attention_mask', 'labels']
AMI columns: ['input_ids', 'attention_mask', 'labels']


## Combine SAMSum + DialogSum (Phase 1 Training)

In [None]:
from datasets import concatenate_datasets

# Phase 1 trains on SAMSum and DialogSum together: concatenate their training
# splits and, separately, their validation splits.
phase1_corpora = [samsum, dialogsum]
combined_train = concatenate_datasets([corpus["train"] for corpus in phase1_corpora])
combined_val = concatenate_datasets([corpus["validation"] for corpus in phase1_corpora])

## Train in Phases

In [None]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer

# Load the pretrained BART-base weights; Phase 0 fine-tunes this model.
base_checkpoint = "facebook/bart-base"
model = BartForConditionalGeneration.from_pretrained(base_checkpoint)

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

### Phase 0: Warm-up with XSum


In [None]:
from transformers import TrainingArguments, Trainer

# Phase 0: XSum (news summaries)
# Reporting, evaluation and checkpointing follow the same settings as Phases 1
# and 2, so the validation split passed to the Trainer is actually evaluated
# each epoch. save_total_limit bounds how many checkpoints are kept on disk
# during this long run.
training_args = TrainingArguments(
    output_dir="./results_phase0",
    run_name="BART-BASE-FINETUNINGV2-PHASE0",
    report_to="wandb",
    num_train_epochs=2,
    per_device_train_batch_size=8,
    fp16=True,
    learning_rate=2e-5,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=xsum["train"],
    eval_dataset=xsum["validation"],
)
trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mbercadesjohnrichard[0m ([33mbercadesjohnrichard-laguna-state-polytechnic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Step,Training Loss
500,1.5251
1000,0.2645
1500,0.255
2000,0.2527
2500,0.2501
3000,0.2517
3500,0.2492
4000,0.2469
4500,0.2463
5000,0.2413




TrainOutput(global_step=51012, training_loss=0.23407084863531555, metrics={'train_runtime': 5485.3921, 'train_samples_per_second': 74.396, 'train_steps_per_second': 9.3, 'total_flos': 2.488273512431616e+17, 'train_loss': 0.23407084863531555, 'epoch': 2.0})

#### Save the Phase 0 Model and Tokenizer

In [None]:

# Write the Phase 0 weights and tokenizer files into one directory; Phase 1
# reloads both from this path.
phase0_dir = "./bart_finetuned_phase0"
model.save_pretrained(phase0_dir)
tokenizer.save_pretrained(phase0_dir)

('./bart_finetuned_phase0/tokenizer_config.json',
 './bart_finetuned_phase0/special_tokens_map.json',
 './bart_finetuned_phase0/vocab.json',
 './bart_finetuned_phase0/merges.txt',
 './bart_finetuned_phase0/added_tokens.json')

In [None]:
# List the files written to the Phase 0 directory, with human-readable sizes.
!ls -lh bart_finetuned_phase0


total 534M
-rw-r--r-- 1 root root 1.7K Feb 22 17:24 config.json
-rw-r--r-- 1 root root  292 Feb 22 17:24 generation_config.json
-rw-r--r-- 1 root root 446K Feb 22 17:24 merges.txt
-rw-r--r-- 1 root root 533M Feb 22 17:24 model.safetensors
-rw-r--r-- 1 root root  957 Feb 22 17:24 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Feb 22 17:24 tokenizer_config.json
-rw-r--r-- 1 root root 976K Feb 22 17:24 vocab.json


In [None]:
# Publish the Phase 0 (XSum) model and tokenizer to the same Hub repository.
# NOTE(review): pushing needs an authenticated Hub session; no login call is
# visible in this notebook — confirm how authentication is provided.
xsum_repo = "csb05/Finetuned-B.A.R.T.-Base-xsum"
model.push_to_hub(xsum_repo)
tokenizer.push_to_hub(xsum_repo)

### Phase 1: SAMSum + DialogSum


In [None]:
from transformers import BartForConditionalGeneration, TrainingArguments, Trainer


# Resume from the Phase 0 export on disk.
phase0_checkpoint = "./bart_finetuned_phase0"
model = BartForConditionalGeneration.from_pretrained(phase0_checkpoint)
tokenizer = BartTokenizer.from_pretrained(phase0_checkpoint)

# Phase 1: dialogue summarization on the combined SAMSum + DialogSum splits.
phase1_settings = dict(
    output_dir="./results_phase1",
    run_name="BART-BASE-FINETUNINGV2-PHASE1",
    report_to="wandb",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    fp16=True,
    learning_rate=3e-5,
    eval_strategy="epoch",  # evaluate on combined_val after every epoch
    save_strategy="epoch",  # checkpoint on the same schedule
    logging_steps=100,
)
training_args_phase1 = TrainingArguments(**phase1_settings)

trainer = Trainer(
    model=model,
    args=training_args_phase1,
    train_dataset=combined_train,
    eval_dataset=combined_val,
)
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.1919,0.164684
2,0.1639,0.159535
3,0.1426,0.158119


TrainOutput(global_step=10197, training_loss=0.17227206109048712, metrics={'train_runtime': 1057.1444, 'train_samples_per_second': 77.166, 'train_steps_per_second': 9.646, 'total_flos': 4.973986131738624e+16, 'train_loss': 0.17227206109048712, 'epoch': 3.0})

#### Save the Phase 1 Model and Tokenizer

In [None]:

# Write the Phase 1 weights and tokenizer files into one directory; Phase 2
# reloads both from this path.
phase1_dir = "./bart_finetuned_phase1"
model.save_pretrained(phase1_dir)
tokenizer.save_pretrained(phase1_dir)

('./bart_finetuned_phase1/tokenizer_config.json',
 './bart_finetuned_phase1/special_tokens_map.json',
 './bart_finetuned_phase1/vocab.json',
 './bart_finetuned_phase1/merges.txt',
 './bart_finetuned_phase1/added_tokens.json')

In [None]:
# List the files written to the Phase 1 directory, with human-readable sizes.
!ls -lh bart_finetuned_phase1


total 534M
-rw-r--r-- 1 root root 1.7K Feb 22 17:42 config.json
-rw-r--r-- 1 root root  292 Feb 22 17:42 generation_config.json
-rw-r--r-- 1 root root 446K Feb 22 17:42 merges.txt
-rw-r--r-- 1 root root 533M Feb 22 17:42 model.safetensors
-rw-r--r-- 1 root root  957 Feb 22 17:42 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Feb 22 17:42 tokenizer_config.json
-rw-r--r-- 1 root root 976K Feb 22 17:42 vocab.json


### Phase 2: AMI (Meetings)


In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer, TrainingArguments, Trainer


# Resume from the Phase 1 export on disk.
phase1_checkpoint = "./bart_finetuned_phase1"
model = BartForConditionalGeneration.from_pretrained(phase1_checkpoint)
tokenizer = BartTokenizer.from_pretrained(phase1_checkpoint)

# Phase 2: fine-tune on the AMI meeting data.
phase2_settings = dict(
    output_dir="./results_phase2",
    run_name="BART-BASE-FINETUNINGV2-PHASE2",
    report_to="wandb",
    num_train_epochs=10,  # more epochs than Phases 0 and 1
    per_device_train_batch_size=4,  # halved per-step batch to prevent OOM
    gradient_accumulation_steps=2,  # 4 x 2 = effective batch size of 8
    fp16=True,
    learning_rate=3e-6,  # ten times smaller than Phase 1, for stability
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_steps=25,  # log more often than in Phase 1
    save_total_limit=2,
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
)
training_args_phase2 = TrainingArguments(**phase2_settings)

trainer = Trainer(
    model=model,
    args=training_args_phase2,
    train_dataset=ami["train"],
    eval_dataset=ami["validation"],
)
trainer.train()

# Save the model and tokenizer after Phase 2.
phase2_dir = "./bart_finetuned_phase2"
model.save_pretrained(phase2_dir)
tokenizer.save_pretrained(phase2_dir)


Epoch,Training Loss,Validation Loss
1,3.9143,2.896415
2,3.3859,2.738488
3,3.2866,2.650205
4,3.1765,2.586647
5,3.2042,2.55183
6,2.9916,2.52397
7,3.0597,2.504673
8,2.9543,2.495162
9,2.9685,2.488956


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


('./bart_finetuned_phase2/tokenizer_config.json',
 './bart_finetuned_phase2/special_tokens_map.json',
 './bart_finetuned_phase2/vocab.json',
 './bart_finetuned_phase2/merges.txt',
 './bart_finetuned_phase2/added_tokens.json')

#### Save the Phase 2 Model and Tokenizer

In [None]:

# Write the Phase 2 weights and tokenizer files into one directory.
phase2_dir = "./bart_finetuned_phase2"
model.save_pretrained(phase2_dir)
tokenizer.save_pretrained(phase2_dir)

('./bart_finetuned_phase2/tokenizer_config.json',
 './bart_finetuned_phase2/special_tokens_map.json',
 './bart_finetuned_phase2/vocab.json',
 './bart_finetuned_phase2/merges.txt',
 './bart_finetuned_phase2/added_tokens.json')

In [None]:
# List the files written to the Phase 2 directory, with human-readable sizes.
!ls -lh bart_finetuned_phase2


total 534M
-rw-r--r-- 1 root root 1.7K Feb 22 17:43 config.json
-rw-r--r-- 1 root root  292 Feb 22 17:43 generation_config.json
-rw-r--r-- 1 root root 446K Feb 22 17:43 merges.txt
-rw-r--r-- 1 root root 533M Feb 22 17:43 model.safetensors
-rw-r--r-- 1 root root  957 Feb 22 17:43 special_tokens_map.json
-rw-r--r-- 1 root root 1.3K Feb 22 17:43 tokenizer_config.json
-rw-r--r-- 1 root root 976K Feb 22 17:43 vocab.json


## Save/Load Model

In [None]:
# Push model & tokenizer
model.push_to_hub("csb05/Discussion-Summary-Finetuned-B.A.R.T.-Base-V2")
tokenizer.push_to_hub("csb05/Discussion-Summary-Finetuned-B.A.R.T.-Base-V2")

print("Model & tokenizer pushed to Hugging Face Hub!")

save_path = "/content/drive/MyDrive/bart-discussion-summarizer-V2"

# Save model & tokenizer
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

print(f"Model saved to {save_path}")

# Zip the model directory in Google Drive
!zip -r /content/bart-discussion-summarizer.zip /content/drive/MyDrive/bart-discussion-summarizer-V2

# Download the zipped model to your local computer
from google.colab import files
files.download("/content/bart-discussion-summarizer-V@.zip")


README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


Model & tokenizer pushed to Hugging Face Hub!
Model saved to /content/drive/MyDrive/bart-discussion-summarizer-V2
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/ (stored 0%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/config.json (deflated 63%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/generation_config.json (deflated 47%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/model.safetensors (deflated 7%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/tokenizer_config.json (deflated 75%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/special_tokens_map.json (deflated 85%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/vocab.json (deflated 68%)
  adding: content/drive/MyDrive/bart-discussion-summarizer-V2/merges.txt (deflated 53%)


FileNotFoundError: Cannot find file: /content/bart-discussion-summarizer-V@.zip