<a href="https://colab.research.google.com/github/HaywhyCoder/text-summarization-model/blob/main/news_headline_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **News Headline Model**

#### Import Libraries

In [None]:
# Install the third-party packages this notebook relies on: `datasets`,
# `sacrebleu` and `evaluate` are imported in the next cell; `bert-score` is
# presumably the backend required by the `bertscore` metric loaded later.
! pip install datasets sacrebleu evaluate bert-score

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

from datasets import Dataset, DatasetDict
from evaluate import load
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling, pipeline
from sacrebleu import corpus_bleu
import torch

In [None]:
# Load the BERTScore metric once; it is reused in the final evaluation cell.
metric = load(path='bertscore')

#### Load the Dataset

In [None]:
# Read the news-summary CSV (latin-1 encoded) into a DataFrame and preview
# its first rows.
data = pd.read_csv(
    filepath_or_buffer="/kaggle/input/news-summary/news_summary.csv",
    encoding='latin-1',
)
data.head()

In [None]:
# Keep only the article body and its headline. An explicit copy makes the
# result an independent frame, so assigning to its columns later does not
# trigger pandas' SettingWithCopyWarning.
data = data[['text', 'headlines']].copy()

In [None]:
# Preview the first five rows of the reduced frame.
data.head(n=5)

In [None]:
# Turn every article into a summarisation prompt by appending the "TL;DR:" cue.
# The suffix is only added when missing, so re-running this cell does not
# stack several cues onto the same text.
prompt_suffix = "\nTL;DR:"
data = data.assign(
    text=data['text'].map(
        lambda x: x if x.endswith(prompt_suffix) else x + prompt_suffix
    )
)

# Show one prompt (row label 5) as a sanity check; `.loc` avoids chained indexing.
data.loc[5, 'text']

#### Prepare Dataset

In [None]:
# Work on a reproducible 300-row sample to keep fine-tuning cheap.
sample_data = data.sample(n=300, random_state=16, ignore_index=True)

# Two successive 80/20 splits: first hold out the test set, then carve the
# validation set out of the remaining training rows (about 64/16/20 % overall).
train, test = train_test_split(sample_data, test_size=.2, random_state=42)
# Named `val` rather than `eval` so the builtin `eval` is not shadowed.
train, val = train_test_split(train, test_size=.2, random_state=42)

# Wrap the pandas frames as Hugging Face datasets; the pandas index is dropped
# so it does not become an extra column.
datasets = DatasetDict({
    'train': Dataset.from_pandas(train, preserve_index=False),
    'eval': Dataset.from_pandas(val, preserve_index=False),
    'test': Dataset.from_pandas(test, preserve_index=False),
})
datasets

In [None]:
# Tokenizer and language-model head are loaded from the same pretrained
# checkpoint so that their vocabularies match.
checkpoint = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(checkpoint)
model = GPT2LMHeadModel.from_pretrained(checkpoint)

In [None]:
# Reuse the end-of-sequence token as the padding token, on both the model
# config and the tokenizer, so padded batches can be built later on.
model.config.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def preprocess_function(examples, include_target=False,
                        max_input_length=128, max_target_length=32):
    """Tokenize a batch of prompt/headline pairs.

    Args:
        examples: Batch mapping with a ``'text'`` list (prompts ending in the
            "TL;DR:" cue) and a ``'headlines'`` list (reference headlines).
        include_target: When ``False`` (default, original behaviour) the
            prompt alone is encoded in ``input_ids`` and the headline is
            stored separately in ``labels``; this layout suits generation and
            evaluation. When ``True`` each sequence is
            ``prompt + " " + headline + EOS``, which a decoder-only model
            needs for fine-tuning: a causal-LM collator
            (``DataCollatorForLanguageModeling(mlm=False)``) rebuilds
            ``labels`` from ``input_ids``, so a headline that only lives in
            ``labels`` never contributes to the loss.
        max_input_length: Maximum number of prompt tokens.
        max_target_length: Maximum number of headline tokens (including the
            EOS token when ``include_target`` is true).

    Returns:
        A mapping with ``input_ids``, ``attention_mask`` and ``labels``.
    """
    prompts = list(examples['text'])
    headlines = list(examples['headlines'])

    if not include_target:
        model_inputs = tokenizer(prompts, max_length=max_input_length, truncation=True,
                                 padding='max_length', return_tensors='pt')
        labels = tokenizer(text_target=headlines, max_length=max_target_length, truncation=True,
                           padding='max_length', return_tensors='pt')
        model_inputs['labels'] = labels['input_ids']
        return model_inputs

    total_length = max_input_length + max_target_length
    prompt_ids = tokenizer(prompts, max_length=max_input_length, truncation=True)['input_ids']
    # The leading space makes the headline start on a word-boundary token, as
    # it will when the model continues the prompt at generation time. One slot
    # is reserved for the EOS token that marks the end of the headline.
    headline_ids = tokenizer([' ' + headline for headline in headlines],
                             max_length=max_target_length - 1, truncation=True)['input_ids']

    batch = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for p_ids, h_ids in zip(prompt_ids, headline_ids):
        sequence = p_ids + h_ids + [tokenizer.eos_token_id]
        n_pad = total_length - len(sequence)
        batch['input_ids'].append(sequence + [tokenizer.pad_token_id] * n_pad)
        batch['attention_mask'].append([1] * len(sequence) + [0] * n_pad)
        # -100 is the index ignored by the loss; only padding is masked here.
        batch['labels'].append(sequence + [-100] * n_pad)
    return batch


# Train/eval splits get full prompt+headline sequences so the model actually
# learns to write headlines; the test split keeps prompt-only inputs (and the
# headline in `labels`) for generation and scoring.
raw_columns = datasets['train'].column_names
tokenized_datasets = DatasetDict({
    split_name: split.map(
        preprocess_function,
        batched=True,
        remove_columns=raw_columns,
        fn_kwargs={'include_target': split_name != 'test'},
    )
    for split_name, split in datasets.items()
})

In [None]:
# Batch collator for causal (non-masked) language modelling: mlm is disabled.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [None]:
# Fine-tuning configuration, grouped by concern.
training_args = TrainingArguments(
    # Output locations and checkpoint retention
    output_dir='./results',
    logging_dir='./logs',
    save_total_limit=1,
    # Schedule: ten epochs, evaluated after each one, logged every 10 steps
    num_train_epochs=10,
    eval_strategy='epoch',
    logging_steps=10,
    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Do not report to any experiment tracker
    report_to='none',
)

In [None]:
# Assemble the trainer from the model, its configuration, the tokenized
# train/eval splits and the batch collator, then start fine-tuning.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=tokenized_datasets['eval'],
    train_dataset=tokenized_datasets['train'],
)

trainer.train()

In [None]:
# Generate a headline for one held-out example and compare it to the reference.
sample = datasets['test'][10]

# Detect the device (GPU or CPU) and move the model there
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Tokenize the prompt and move its tensors to the same device. A single
# sequence needs no padding.
inputs = tokenizer(
    sample['text'],
    return_tensors="pt",
    max_length=128,
    truncation=True,
).to(device)

input_ids = inputs['input_ids']
att_mask = inputs['attention_mask']
prompt_length = input_ids.shape[1]

# Reference headline, truncated like the training targets. It is only decoded
# for display, so it is kept as plain Python lists (no device transfer).
labels = tokenizer(sample['headlines'], max_length=32, truncation=True)

model.eval()
with torch.no_grad():
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=att_mask,
        max_new_tokens=15,
        # `min_length` would count the prompt tokens too and therefore never
        # bind; `min_new_tokens` constrains the generated part only.
        min_new_tokens=5,
        length_penalty=-3.0,  # Negative values favour shorter beams
        num_beams=4,  # Use beam search for better results
        early_stopping=True,  # Stop once enough finished beams are found
        pad_token_id=tokenizer.pad_token_id,
    )

# The output of a decoder-only model starts with the prompt; drop it before decoding.
summary = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
target = tokenizer.decode(labels['input_ids'], skip_special_tokens=True)

print("Summary: ",summary, '\n',"Headline: ", target)

In [None]:
# Generate headlines for the whole test split and collect them next to the
# reference headlines.
model.eval()

test_prompts = list(datasets['test']['text'])
# Use the raw reference headlines rather than round-tripping truncated token ids.
targets = list(datasets['test']['headlines'])

# A decoder-only model continues from the last position of each row, so batched
# prompts must be padded on the LEFT; with right padding generation would start
# after a run of pad tokens. The tokenizer setting is restored afterwards.
previous_padding_side = tokenizer.padding_side
tokenizer.padding_side = 'left'
try:
    inputs = tokenizer(
        test_prompts,
        return_tensors='pt',
        max_length=128,
        truncation=True,
        padding=True,
    ).to(device)
finally:
    tokenizer.padding_side = previous_padding_side

prompt_length = inputs['input_ids'].shape[1]

with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=15,
        # `min_length` would include the prompt length and never bind;
        # `min_new_tokens` constrains the generated part only.
        min_new_tokens=5,
        length_penalty=3.0,  # Positive values favour longer beams (negative: shorter)
        num_beams=4,  # Use beam search for better results
        no_repeat_ngram_size=2,  # bigrams can only occur once in sequence
        early_stopping=True,  # Stop once enough finished beams are found
        pad_token_id=tokenizer.pad_token_id,
    )

# Every row shares the same (left-padded) prompt width; strip it before decoding.
summaries = [
    tokenizer.decode(output[prompt_length:], skip_special_tokens=True)
    for output in outputs
]

df = pd.DataFrame({"Summary": summaries, "Headlines": targets})
df.head()

In [None]:
# Print the first five generated summaries next to their reference headlines.
for row_label in range(5):
    print("summary: ", df.loc[row_label, 'Summary'])
    print("headline: ", df.loc[row_label, 'Headlines'], "\n")

In [None]:
from statistics import mean

# Calculate BLEU score. sacreBLEU takes the references as a list of reference
# *streams* (one list of sentences per reference set), so the single set of
# headlines must be wrapped in an outer list; a flat list of strings would be
# interpreted as many streams of characters.
bleu = corpus_bleu(summaries, [targets]).score

# Calculate BERTScore (per-example precision/recall/F1, averaged below)
bert_score = metric.compute(predictions=summaries, references=targets, model_type='distilbert-base-uncased')  # use distilbert for semantic analysis
print(f"Precision: {mean(bert_score['precision']):.4f} Recall: {mean(bert_score['recall']):.4f} F1: {mean(bert_score['f1']):.4f} bleu: {bleu:.4f}")

When evaluated on the test set, the model achieved an average BERTScore of 0.71, indicating that the generated summaries are semantically similar to the target headlines.