# Install and Import Packages and Libraries

In [None]:
!pip install transformers datasets evaluate transformers[torch]
!pip install py7zr

In [12]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset
from transformers import Trainer, TrainingArguments
from huggingface_hub import notebook_login

# Model and Data Processing

In [None]:
# The tokenizer and the seq2seq weights are loaded from the same checkpoint,
# so the identifier is named once and reused for both calls.
_base_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(_base_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(_base_checkpoint)

In [None]:
# Load the "samsum" dataset; later cells read its "train", "validation" and
# "test" splits and the "id" / "dialogue" / "summary" columns.
ds = load_dataset(
    "samsum",
    trust_remote_code=True,  # allow code shipped with the dataset repo to run
)

In [5]:
# Pull the dialogue and its reference summary from the first test row, then
# encode the raw dialogue as a batch of one for the baseline generation below.
sample, label = (ds["test"][0][column] for column in ("dialogue", "summary"))
token_ids = tokenizer(sample, return_tensors="pt")

In [6]:
# Generate a summary with the un-tuned model; output length is constrained to
# between 30 and 250 tokens.
summary_ids = model.generate(
    token_ids['input_ids'],
    min_length=30,
    max_length=250,
)

In [None]:
# Turn the generated ids back into text; only one sequence was generated, so
# the first decoded entry is the summary shown as the cell output.
tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]

In [None]:
def prompt_summary(sample, label, model):
    """Generate a summary of a single dialogue with ``model``.

    The dialogue is wrapped in the same ``summarize: ... Summary:`` prompt
    that ``tokenize`` builds for fine-tuning, so inference inputs match the
    format the tuned model was trained on. Previously a prompt was built but
    the raw dialogue was tokenized instead.

    Args:
        sample: Dialogue text to summarise.
        label: Reference summary. Not used for generation; kept so existing
            callers that pass it continue to work.
        model: Seq2seq model exposing ``generate`` and ``device``.

    Returns:
        The decoded summary string with special tokens stripped.
    """
    prompt = f"summarize: \n\n {sample}\n\nSummary: "
    # Truncate to the 512-token input length used during fine-tuning so an
    # unusually long dialogue cannot exceed what the model was trained on.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=512,
    )
    # The model may live on an accelerator (e.g. after Trainer.train()), so the
    # inputs must be moved to wherever its weights are.
    encoded = encoded.to(model.device)
    summary_ids = model.generate(
        input_ids=encoded["input_ids"],
        attention_mask=encoded["attention_mask"],
        min_length=30,
        max_length=250,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


# Show the dialogue, its reference summary and the un-tuned model's summary,
# one per line.
print(
    f"Sample: {sample}",
    f"Label: {label}",
    f"Model Output: {prompt_summary(sample, label, model)}",
    sep="\n",
)

In [9]:
def tokenize(inputs):
    """Tokenize a batch of dialogue/summary pairs for seq2seq fine-tuning.

    Each dialogue is wrapped in a ``summarize: ... Summary:`` prompt and
    encoded to exactly 512 tokens (padded or truncated). Summaries are encoded
    the same way and become the labels.

    Args:
        inputs: Batch mapping with a ``"dialogue"`` and a ``"summary"`` entry,
            each a list of strings (as passed by ``Dataset.map`` with
            ``batched=True``).

    Returns:
        The same ``inputs`` mapping with ``"input_ids"``, ``"attention_mask"``
        and ``"labels"`` added.
    """
    sp = "summarize: \n\n "
    ep = "\n\nSummary: "
    prompts = [sp + dialogue + ep for dialogue in inputs["dialogue"]]

    encode_kwargs = dict(
        padding='max_length',
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    encoded_prompts = tokenizer(prompts, **encode_kwargs)
    encoded_targets = tokenizer(inputs["summary"], **encode_kwargs)

    inputs['input_ids'] = encoded_prompts.input_ids
    # Keep the mask so the encoder does not attend to padding positions.
    inputs['attention_mask'] = encoded_prompts.attention_mask
    # Padding positions in the labels are set to -100, the index the loss
    # ignores; otherwise most of the training signal would come from padding.
    # The attention mask (not the pad id) identifies padding, so a genuine EOS
    # token is never masked even if pad and EOS share an id.
    inputs['labels'] = encoded_targets.input_ids.masked_fill(
        encoded_targets.attention_mask == 0, -100
    )
    return inputs


In [None]:
# NOTE: the tokenizer's pad token is deliberately left untouched. BART's
# tokenizer already defines its own <pad> token, and overriding it with EOS
# would make the padding id disagree with the model's configured pad id.

# Keep every 50th row of each split, and do it *before* tokenizing so only the
# retained rows are encoded (same rows as filtering afterwards, ~1/50 the work).
tokenized_ds = ds.filter(
    lambda _row, idx: idx % 50 == 0, with_indices=True
).map(tokenize, batched=True)

# Drop the raw text columns; only the tensors produced by `tokenize` remain.
tokenized_ds = tokenized_ds.remove_columns(["id", "dialogue", "summary"])

# Sanity check: size of each split and the columns that will reach the Trainer.
for split_name in ("train", "validation", "test"):
    print(tokenized_ds[split_name].shape)
print(tokenized_ds['train'][0].keys())

In [None]:
# Authenticate this notebook session with the Hugging Face Hub; the later
# `trainer.push_to_hub(...)` call uploads to the logged-in account.
notebook_login()

# Training and Evaluation

In [None]:
import dataclasses

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old name was later dropped. The install cell does not pin a
# version, so use whichever field this installation actually defines.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

train_args = TrainingArguments(
    output_dir="./dialogue_bart",
    learning_rate=1e-5,
    num_train_epochs=1,
    weight_decay=0.01,
    per_device_train_batch_size=2,
    **{_eval_strategy_key: "epoch"},  # evaluate on the validation split each epoch
)

# Keyword arguments make the wiring explicit and robust to signature changes.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
)

In [None]:
# Run fine-tuning with the arguments configured on `trainer` (one epoch, with
# evaluation on the validation split at the end of the epoch).
trainer.train()

In [None]:
# Local alternative: trainer.save_model("/content/dialogue_bart")
#
# Upload the fine-tuned model to the Hub. The first parameter of
# `Trainer.push_to_hub` is the *commit message*, not the repository name; the
# repository is derived from the TrainingArguments (by default the name of
# `output_dir`, i.e. "dialogue_bart"). Pass the message by keyword so a repo
# name is not silently recorded as the commit message.
trainer.push_to_hub(commit_message="Fine-tune BART on SAMSum dialogues")

In [None]:
# Reload the fine-tuned weights from the Hub repository.
# Local alternative: AutoModelForSeq2SeqLM.from_pretrained("/content/dialogue_bart")
tuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    "ibraheemaloran/dialogue_bart"
)

In [None]:
# Spot-check the fine-tuned model on the first five test rows, printing each
# dialogue, its reference summary and the generated summary.
for sample in ds["test"].select(range(5)):
    print(f"Sample: {sample['dialogue']}")
    print(f"Label: {sample['summary']}")
    generated = prompt_summary(sample['dialogue'], sample['summary'], tuned_model)
    print(f"Model Output: {generated}")