In [1]:
import torch
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [2]:
from transformers import AlbertTokenizer, AlbertModel
import torch

tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')


Downloading (…)ve/main/spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.31M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/47.4M [00:00<?, ?B/s]

In [3]:
text = """
Machine learning is a branch of artificial intelligence that allows computers to learn and improve from experience without being explicitly programmed. It is the process of using algorithms and statistical models to analyze and draw insights from large amounts of data, and then use those insights to make predictions or decisions. Machine learning has become increasingly popular in recent years, as the amount of available data has grown and computing power has increased. There are three main types of machine learning: supervised learning, unsupervised learning, and reinforcement learning. In supervised learning, the algorithm is given a labeled dataset and learns to make predictions based on that data. In unsupervised learning, the algorithm is given an unlabeled dataset and must find patterns and relationships within the data on its own. In reinforcement learning, the algorithm learns by trial and error, receiving feedback in the form of rewards or punishments for certain actions. Machine learning is used in a wide range of applications, including image recognition, natural language processing, autonomous vehicles, fraud detection, and recommendation systems. As the technology continues to improve, it is likely that machine learning will become even more prevalent in our daily lives.
"""

sentences = sent_tokenize(text)

In [4]:
tokenized_sentences = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]

In [5]:
tokenized_sentences

[[2,
  1940,
  2477,
  25,
  21,
  1686,
  16,
  6809,
  2872,
  30,
  2965,
  7774,
  20,
  2484,
  17,
  3545,
  37,
  1496,
  366,
  142,
  13108,
  2866,
  43,
  9,
  3],
 [2,
  32,
  25,
  14,
  953,
  16,
  568,
  15935,
  17,
  6762,
  2761,
  20,
  16051,
  17,
  2003,
  9239,
  18,
  37,
  370,
  8545,
  16,
  1054,
  15,
  17,
  94,
  275,
  273,
  9239,
  18,
  20,
  233,
  13823,
  18,
  54,
  6003,
  9,
  3],
 [2,
  1940,
  2477,
  63,
  533,
  5054,
  844,
  19,
  1764,
  122,
  15,
  28,
  14,
  2006,
  16,
  904,
  1054,
  63,
  3651,
  17,
  10626,
  414,
  63,
  1644,
  9,
  3],
 [2,
  80,
  50,
  132,
  407,
  2551,
  16,
  1940,
  2477,
  45,
  15581,
  2477,
  15,
  367,
  8542,
  3762,
  69,
  2477,
  15,
  17,
  26374,
  2477,
  9,
  3],
 [2,
  19,
  15581,
  2477,
  15,
  14,
  9083,
  25,
  504,
  21,
  14348,
  1054,
  3554,
  17,
  11346,
  20,
  233,
  13823,
  18,
  432,
  27,
  30,
  1054,
  9,
  3],
 [2,
  19,
  367,
  8542,
  3762,
  69,
  2477,
  15,
  

### Encoding the input

#### To feed the tokenized sentences into the BERT model, we need to encode them using the BERT tokenizer. We also need to add special tokens such as [CLS] (for the start of the sentence) and [SEP] (for the end of the sentence) to each input.

In [6]:
max_len = 0
for i in tokenized_sentences:
    if len(i) > max_len:
        max_len = len(i)

padded_sentences = []
for i in tokenized_sentences:
    while len(i) < max_len:
        i.append(0)
    padded_sentences.append(i)

input_ids = torch.tensor(padded_sentences)

### Generating the sentence embeddings
#### Once we have encoded the input, we can feed it into the BERT model to generate sentence embeddings. We will use the last hidden state of the BERT model as the sentence embedding.

In [7]:
with torch.no_grad():
    last_hidden_states = model(input_ids)[0]

sentence_embeddings = []
for i in range(len(sentences)):
    sentence_embeddings.append(torch.mean(last_hidden_states[i], dim=0).numpy())

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


### Summarizing the text
##### Finally, we can use sentence embeddings to summarize the text. One way to do this is to compute the similarity between each sentence and the other sentences and select the sentences with the highest similarity scores. We will use the cosine similarity measure for this.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the similarity matrix
similarity_matrix = cosine_similarity(sentence_embeddings)

# Generate the summary
num_sentences = 2
summary_sentences = []
for i in range(num_sentences):
    sentence_scores = list(enumerate(similarity_matrix[i]))
    
sentence_scores = sorted(sentence_scores, key=lambda x: x[1], reverse=True)
summary_sentences.append(sentences[sentence_scores[1][0]])

summary = ' '.join(summary_sentences)
# print(summary)  

In [9]:
summary

'In unsupervised learning, the algorithm is given an unlabeled dataset and must find patterns and relationships within the data on its own.'