In [1]:
from transformers import BertTokenizer, BertModel, AutoTokenizer, AutoModel
import torch
import nltk
from nltk.tokenize import sent_tokenize

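The following code uses one of two BERT models (the general-purpose BERT-large or Bio_ClinicalBERT) together with cosine similarity to create an extractive summary of the original text.  
The summary is built with a few basic steps: each sentence is embedded, compared with the mean embedding of the whole document, and the most relevant sentences are kept.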

Choose Model

In [None]:
# Option 1: general-purpose BERT (large, uncased).
# Running the Clinical BERT cell afterwards rebinds `tokenizer` and `model`.
checkpoint = 'bert-large-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint)
model = BertModel.from_pretrained(checkpoint)

In [2]:
# Option 2: Bio_ClinicalBERT, loaded through the Auto* classes.
# This rebinds `tokenizer` and `model`, replacing any model loaded earlier.
checkpoint = 'emilyalsentzer/Bio_ClinicalBERT'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

Read Data 

In [3]:
# Read the data
# `file_path` is a placeholder: set it to the input text file before running
# this cell (an empty path makes open() raise FileNotFoundError).
file_path = ""

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the corpus is UTF-8/ASCII text — confirm for your data.
with open(file_path, 'r', encoding='utf-8') as file:
    # One list entry per line of the file; trailing newlines are kept.
    lines = file.readlines()

Process Data

In [13]:
# Choose the document to summarize: the first line of the input file
text = lines[0]

# Split text into sentences
# NOTE(review): sent_tokenize needs NLTK's Punkt tokenizer data to be installed
# locally; nothing in this notebook downloads it — confirm the environment.
sentences = sent_tokenize(text)

# Encode sentences
# Truncate to the encoder's position limit so an unusually long sentence cannot
# crash the forward pass. The limit is read from the model config because a
# tokenizer's own default maximum length is not guaranteed to be set.
max_tokens = model.config.max_position_embeddings
encoded_sentences = [
    tokenizer.encode(
        sentence,
        return_tensors='pt',
        add_special_tokens=True,
        truncation=True,
        max_length=max_tokens,
    )
    for sentence in sentences
]

In [14]:
# Convert to embeddings
# One forward pass per encoded sentence, with gradient tracking disabled.
# From the first model output we keep the vector at position 0 of every
# sequence, i.e. the [CLS] token embedding used to represent the sentence.
with torch.no_grad():
    sentence_embeddings = [
        model(token_ids)[0][:, 0, :] for token_ids in encoded_sentences
    ]


In [15]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Rank every sentence by cosine similarity to the document centroid and keep
# roughly the top half, emitted in original document order.
if not sentence_embeddings:
    # torch.cat raises on an empty list, so a text with no sentences yields an
    # empty summary instead of a traceback.
    N = 0
    similarities = []
    top_sentences = []
    summary = ""
else:
    # Stack the per-sentence vectors once into a single 2-D matrix
    # (assumes each entry is a (1, hidden) row, as produced by the embedding cell).
    sentence_matrix = torch.cat(sentence_embeddings, dim=0)

    # Calculate mean embedding for the document
    doc_embedding = torch.mean(sentence_matrix, dim=0)

    # Calculate cosine similarity of each sentence with the document.
    # A single batched call replaces one sklearn call per sentence; tensors are
    # converted to NumPy explicitly so this also works if they are not on CPU.
    similarities = list(
        cosine_similarity(
            sentence_matrix.detach().cpu().numpy(),
            doc_embedding.detach().cpu().numpy().reshape(1, -1),
        )[:, 0]
    )

    # Select top N sentences
    # round() can return 0 (e.g. for a single sentence), and a slice of [-0:]
    # would then select *every* index; keep at least one sentence explicitly.
    N = max(1, round(len(sentences) / 2))  # number of sentences to include in the summary
    top_sentences = np.argsort(similarities)[-N:]
    # sorted() restores document order among the selected sentences
    summary = " ".join(sentences[i] for i in sorted(top_sentences))


In [46]:
# Display the original text that was selected for summarization
# (the cell's last expression is rendered as its output).
text

'Retained endobronchial foreign body removal facilitated by steroid therapy of an obstructing, inflammatory polyp. Oral and topical steroids were used to induce regression in an inflammatory, obstructing endobronchial polyp caused by a retained foreign body. The FB (a peanut half), which had been present for over six months, was then able to be easily and bloodlessly retrieved with fiberoptic bronchoscopy. \n'

In [16]:
# Display the extractive summary built from the highest-ranked sentences.
# NOTE(review): the saved output of this cell does not appear to summarize the
# text shown by the preceding cell, and the execution counts are out of order —
# restart the kernel and run all cells so both outputs refer to the same document.
summary

'Preoperatively, all of the affected eyes had worse visual acuity (P less than .02) and more astigmatism (P less than .01) than the contralateral eyes. Of the eight patients for whom both preoperative and postoperative visual acuity measurements had been obtained, in six it had changed minimally (less than or equal to 1 line), and in two it had improved (less than or equal to 2 lines). Surgical complications included persistent epithelial defects (40%) and peripheral corneal vascularization and opacity (70%). These complications do not outweigh the cosmetic and visual benefits of dermoid excision in selected patients.'