In [None]:
!pip install transformers

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')



In [3]:

# Define a custom embedding layer
class CustomEmbedding(nn.Module):
    def __init__(self, embedding_size):
        super(CustomEmbedding, self).__init__()
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(tokenizer.vocab_size, embedding_size)
        
    def forward(self, input_ids, token_weights):
        # Get the token embeddings from the default BERT model
        embeddings = self.embedding(input_ids)
        
        # Modify the embeddings based on the token weights
        modified_embeddings = embeddings * token_weights.unsqueeze(-1)
        
        return modified_embeddings

# Example sentence
sentence = "This is an example sentence."

# Tokenize the sentence
input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)

# Define the token weights
token_weights = torch.tensor([1, 1, 2, 3, 1, 1, 1, 1]).float()

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings = custom_embedding(input_ids, token_weights)

# Pass the modified token embeddings to the BERT model for further processing
output = bert_model(inputs_embeds=modified_embeddings)

# Get the final output from the BERT model
pooled_output = output[1]

### Performance evaluation for two sentence without adjusting weights

In [5]:
sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

In [39]:
# Tokenize the sentence
input_ids_1 = torch.tensor(tokenizer.encode(sentence1, add_special_tokens=True)).unsqueeze(0)

# Define the token weights
token_weights_1 = torch.tensor([1]*input_ids_1.shape[1]).float()

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding_1 = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings_1 = custom_embedding_1(input_ids_1, token_weights_1)

# Pass the modified token embeddings to the BERT model for further processing
output_1 = bert_model(inputs_embeds=modified_embeddings_1)

# Get the final output from the BERT model
pooled_output_1 = output_1[1]

In [40]:
# Tokenize the sentence
input_ids_2 = torch.tensor(tokenizer.encode(sentence2, add_special_tokens=True)).unsqueeze(0)

# Define the token weights
token_weights_2 = torch.tensor([1]*input_ids_2.shape[1]).float()

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding_2 = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings_2 = custom_embedding_2(input_ids_2, token_weights_2)

# Pass the modified token embeddings to the BERT model for further processing
output_2 = bert_model(inputs_embeds=modified_embeddings_2)

# Get the final output from the BERT model
pooled_output_2 = output_2[1]

In [41]:
import torch.nn.functional as F

# Calculate the cosine similarity between the two pooled outputs
cosine_sim = F.cosine_similarity(pooled_output_1, pooled_output_2)

# Print the cosine similarity
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.8298619985580444


### Check performance with SBERT for comparison

In [None]:
!pip install sentence_transformers

In [21]:
from sentence_transformers import SentenceTransformer, util

sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

# Load pre-trained SBERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Get sentence embeddings
embeddings_1 = model.encode(sentence1, convert_to_tensor=True)
embeddings_2 = model.encode(sentence2, convert_to_tensor=True)

# Calculate cosine similarity between the sentence embeddings
cosine_sim = util.pytorch_cos_sim(embeddings_1, embeddings_2)

# Print the cosine similarity
print("Cosine similarity between the two sentences:", cosine_sim.item())

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Cosine similarity between the two sentences: 0.7642334699630737


In [28]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained SBERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

# Get sentence embeddings
embeddings_1 = model.encode(sentence1, convert_to_tensor=True)
embeddings_2 = model.encode(sentence2, convert_to_tensor=True)


token_weights = torch.tensor([1]*embeddings_1.shape[0]).float()
modified_embeddings_1 = embeddings_1 * token_weights.unsqueeze(-1)

modified_embeddings_1 = embeddings_2 * token_weights.unsqueeze(-1)

# Calculate cosine similarity between the sentence embeddings
cosine_sim = util.pytorch_cos_sim(embeddings_1, embeddings_2)

# Print the cosine similarity
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.7642334699630737


### Performance evaluation for two sentence after adjusting weights

In [42]:
# Tokenize the sentence
input_ids_1 = torch.tensor(tokenizer.encode(sentence1, add_special_tokens=True)).unsqueeze(0)

### We will be using 4th sentence and 6th item respectively for weightage
sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"


# Define the token weights
token_weights_1 = torch.tensor([1,1,1,5,1,1,1]).float()

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding_1 = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings_1 = custom_embedding_1(input_ids_1, token_weights_1)

# Pass the modified token embeddings to the BERT model for further processing
output_1 = bert_model(inputs_embeds=modified_embeddings_1)

# Get the final output from the BERT model
pooled_output_1 = output_1[1]

In [43]:
# Tokenize the sentence
input_ids_2 = torch.tensor(tokenizer.encode(sentence2, add_special_tokens=True)).unsqueeze(0)

# Define the token weights
token_weights_2 = torch.tensor([1,1,1,1,1,5,1]).float()

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding_2 = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings_2 = custom_embedding_2(input_ids_2, token_weights_2)

# Pass the modified token embeddings to the BERT model for further processing
output_2 = bert_model(inputs_embeds=modified_embeddings_2)

# Get the final output from the BERT model
pooled_output_2 = output_2[1]

In [44]:
import torch.nn.functional as F

# Calculate the cosine similarity between the two pooled outputs
cosine_sim = F.cosine_similarity(pooled_output_1, pooled_output_2)

# Print the cosine similarity
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.6122373938560486
