In [16]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [17]:
paragraph = "OpenAI’s groundbreaking embedding and transcription models revolutionize NLP and speech recognition, enhancing accuracy and efficiency. This blog explores OpenAI Embeddings’ potential for advanced NLP tasks, while the next focuses on Whisper transcription models. We delve into word embeddings’ basics, advantages, and OpenAI’s superior performance. Discover applications like text similarity, semantic search, and clustering, as we unveil OpenAI Embeddings’ transformative power in NLP."
tokens = tokenizer.tokenize(paragraph)
input_ids = tokenizer.encode(paragraph, return_tensors='pt', max_length=512, truncation=True)
input_ids

tensor([[  101,  2330,  4886,  1521,  1055, 23222,  7861,  8270,  4667,  1998,
         14193,  4275,  4329,  4697, 17953,  2361,  1998,  4613,  5038,  1010,
         20226, 10640,  1998,  8122,  1012,  2023,  9927, 15102,  2330,  4886,
          7861,  8270,  4667,  2015,  1521,  4022,  2005,  3935, 17953,  2361,
          8518,  1010,  2096,  1996,  2279,  7679,  2006,  7204, 14193,  4275,
          1012,  2057,  3972,  3726,  2046,  2773,  7861,  8270,  4667,  2015,
          1521, 24078,  1010, 12637,  1010,  1998,  2330,  4886,  1521,  1055,
          6020,  2836,  1012,  7523,  5097,  2066,  3793, 14402,  1010, 21641,
          3945,  1010,  1998,  9324,  2075,  1010,  2004,  2057,  4895,  3726,
          4014,  2330,  4886,  7861,  8270,  4667,  2015,  1521, 10938,  8082,
          2373,  1999, 17953,  2361,  1012,   102]])

In [18]:
with torch.no_grad():
    outputs = model(input_ids)
    embeddings = outputs.last_hidden_state

sentence_embeddings = torch.mean(embeddings, dim=1)
# print(sentence_embeddings)
similarity_matrix = cosine_similarity(sentence_embeddings, sentence_embeddings)
similarity_matrix

array([[0.99999994]], dtype=float32)

In [19]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([paragraph])
sentence_lengths = X.toarray()[0]

importance_scores = similarity_matrix.sum(axis=1) / sentence_lengths

sorted_indices = importance_scores.argsort()[::-1]
sorted_indices

array([47, 25, 21, 20, 19, 18, 17, 16, 15, 14, 12, 11, 10,  9,  8,  7,  6,
        5,  4,  2,  1, 46, 23, 35, 34, 45, 43, 42, 40, 39, 38, 37, 36,  0,
       33, 32, 31, 30, 29, 28, 27, 41, 44, 22, 13, 24, 26,  3],
      dtype=int64)

In [20]:

top_key_points = [tokens[i] for i in sorted_indices[:3]]  
print("Top 3 key points:", top_key_points)

Top 3 key points: ['transcription', 'blog', 'and']
