In [None]:
!pip install transformers

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer

# The tokenizer and the encoder are loaded from one checkpoint name so the two
# cannot drift apart.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(BERT_CHECKPOINT)



In [None]:

# Define a custom embedding layer
class CustomEmbedding(nn.Module):
    """Token-embedding lookup whose vectors are rescaled by per-token weights.

    The result is meant to be handed to ``bert_model`` through ``inputs_embeds``.
    For that to be meaningful the lookup table has to hold BERT's own word
    embeddings, so the table is initialised from
    ``bert_model.embeddings.word_embeddings.weight`` whenever ``embedding_size``
    matches its width. A plain ``nn.Embedding`` would be randomly initialised,
    and every instance would map the same token to a different vector.

    Args:
        embedding_size: Width of each embedding vector.
        pretrained_weight: Optional 2-D tensor ``(vocab_size, embedding_size)``
            used to initialise the table. When omitted, the word embeddings of
            the notebook-level ``bert_model`` are copied; if their width differs
            from ``embedding_size``, a randomly initialised table with
            ``tokenizer.vocab_size`` rows is created instead.

    Raises:
        ValueError: If ``pretrained_weight`` is given but is not 2-D with
            ``embedding_size`` columns.
    """

    def __init__(self, embedding_size, pretrained_weight=None):
        super().__init__()
        self.embedding_size = embedding_size

        if pretrained_weight is None:
            bert_weight = bert_model.embeddings.word_embeddings.weight
            if bert_weight.shape[1] == embedding_size:
                pretrained_weight = bert_weight
        elif pretrained_weight.dim() != 2 or pretrained_weight.shape[1] != embedding_size:
            raise ValueError(
                "pretrained_weight must have shape (vocab_size, %d), got %s"
                % (embedding_size, tuple(pretrained_weight.shape))
            )

        if pretrained_weight is None:
            # No compatible pre-trained table available: fall back to a random one.
            self.embedding = nn.Embedding(tokenizer.vocab_size, embedding_size)
        else:
            # Work on a detached copy so this layer never shares storage with the
            # source weights; freeze=False keeps the table trainable, like a
            # regular nn.Embedding.
            self.embedding = nn.Embedding.from_pretrained(
                pretrained_weight.detach().clone(), freeze=False
            )

    def forward(self, input_ids, token_weights):
        """Look up ``input_ids`` and scale each token vector by its weight.

        Args:
            input_ids: Integer tensor of token ids.
            token_weights: Tensor of multipliers; a trailing axis of size 1 is
                added so that it broadcasts against the embedding axis.

        Returns:
            The embeddings of ``input_ids`` multiplied by ``token_weights``.
        """
        embeddings = self.embedding(input_ids)

        # (…, seq) -> (…, seq, 1): one multiplier per token, shared by all features.
        modified_embeddings = embeddings * token_weights.unsqueeze(-1)

        return modified_embeddings

# Worked example: weight individual tokens of one sentence, then encode it.
sentence = "This is an example sentence."

# Encode to ids (special tokens included); the outer list adds the batch axis.
input_ids = torch.tensor([tokenizer.encode(sentence, add_special_tokens=True)])

# One multiplier per token position.
token_weights = torch.tensor([1, 1, 2, 3, 1, 1, 1, 1], dtype=torch.float)

# Embedding width 768, then look up and rescale the token embeddings.
custom_embedding = CustomEmbedding(768)
modified_embeddings = custom_embedding(input_ids, token_weights)

# The weighted embeddings are passed to BERT in place of token ids.
output = bert_model(inputs_embeds=modified_embeddings)

# Element 1 of the model output is kept as the pooled sentence vector.
pooled_output = output[1]

### Performance evaluation for two sentences without adjusting weights

In [None]:
# The sentence pair compared throughout the notebook.
sentence1, sentence2 = (
    "This is stomach related issue",
    "I have pain in stomach",
)

In [None]:
# Baseline for sentence 1: every token keeps weight 1, so the embeddings are
# passed to BERT unscaled.
input_ids_1 = torch.tensor([tokenizer.encode(sentence1, add_special_tokens=True)])

# Uniform weights, one per token position.
token_weights_1 = torch.ones(input_ids_1.shape[1], dtype=torch.float)

# Embedding width 768, then look up and (trivially) rescale the embeddings.
custom_embedding_1 = CustomEmbedding(768)
modified_embeddings_1 = custom_embedding_1(input_ids_1, token_weights_1)

# Encode with BERT from the embeddings rather than from token ids.
output_1 = bert_model(inputs_embeds=modified_embeddings_1)

# Element 1 of the model output is kept as the pooled sentence vector.
pooled_output_1 = output_1[1]

In [None]:
# Baseline for sentence 2: tokenize (special tokens included) and add a batch axis
input_ids_2 = torch.tensor(tokenizer.encode(sentence2, add_special_tokens=True)).unsqueeze(0)

# Uniform weights, one per token position
token_weights_2 = torch.tensor([1]*input_ids_2.shape[1]).float()

# Reuse the embedding layer that encoded sentence 1. A separately constructed
# CustomEmbedding owns its own lookup table and is not guaranteed to map a token
# to the same vector, which would make the similarity below meaningless.
custom_embedding_2 = custom_embedding_1

# Get the modified token embeddings from the shared embedding layer
modified_embeddings_2 = custom_embedding_2(input_ids_2, token_weights_2)

# Pass the modified token embeddings to the BERT model for further processing
output_2 = bert_model(inputs_embeds=modified_embeddings_2)

# Element 1 of the model output is kept as the pooled sentence vector
pooled_output_2 = output_2[1]

In [None]:
import torch.nn.functional as F

# Compare the two pooled vectors along the feature axis (dim=1).
cosine_sim = F.cosine_similarity(pooled_output_1, pooled_output_2, dim=1)

# Report the score as a plain Python number.
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.8298619985580444


### Check performance with SBERT for comparison

In [None]:
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer, util

# Reference score: encode the same sentence pair with a pre-trained SBERT model.
model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

# One embedding tensor per sentence.
embeddings_1 = model.encode(sentence1, convert_to_tensor=True)
embeddings_2 = model.encode(sentence2, convert_to_tensor=True)

# Cosine similarity of the two sentence embeddings.
cosine_sim = util.pytorch_cos_sim(embeddings_1, embeddings_2)

print("Cosine similarity between the two sentences:", cosine_sim.item())

Downloading (…)821d1/.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading (…)d1/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)01e821d1/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)821d1/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading (…)8d01e821d1/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)1e821d1/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

Cosine similarity between the two sentences: 0.7642334699630737


In [None]:
from sentence_transformers import SentenceTransformer, util

# Load pre-trained SBERT model
model = SentenceTransformer('bert-base-nli-mean-tokens')

sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

# Get sentence embeddings (one 1-D vector per sentence)
embeddings_1 = model.encode(sentence1, convert_to_tensor=True)
embeddings_2 = model.encode(sentence2, convert_to_tensor=True)

# The sentence embeddings have no token axis, so the multipliers apply per
# embedding dimension. ones_like gives a weight vector of matching shape, dtype
# and device; multiplying element-wise keeps the result 1-D (an extra
# unsqueeze(-1) would broadcast it into a square matrix).
token_weights = torch.ones_like(embeddings_1)
modified_embeddings_1 = embeddings_1 * token_weights
modified_embeddings_2 = embeddings_2 * token_weights

# Calculate cosine similarity between the *weighted* sentence embeddings
cosine_sim = util.pytorch_cos_sim(modified_embeddings_1, modified_embeddings_2)

# Print the cosine similarity
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.7642334699630737


### Performance evaluation for two sentences after adjusting weights

In [None]:
# Both sentences contain the word "stomach"; it gets extra weight below: the 4th
# token position of sentence 1 and the 6th of sentence 2 (positions include the
# leading special token).
sentence1 = "This is stomach related issue"
sentence2 = "I have pain in stomach"

# Tokenize only after the sentence is defined, so this cell does not depend on
# whatever value `sentence1` had in an earlier cell
input_ids_1 = torch.tensor(tokenizer.encode(sentence1, add_special_tokens=True)).unsqueeze(0)

# Define the token weights: 5 on the 4th position, 1 elsewhere
token_weights_1 = torch.tensor([1,1,1,5,1,1,1]).float()
assert token_weights_1.shape[0] == input_ids_1.shape[1], \
    "token_weights_1 needs exactly one weight per token of sentence1"

# Define the custom embedding layer with an embedding size of 768 (default for BERT)
custom_embedding_1 = CustomEmbedding(768)

# Get the modified token embeddings from the custom embedding layer
modified_embeddings_1 = custom_embedding_1(input_ids_1, token_weights_1)

# Pass the modified token embeddings to the BERT model for further processing
output_1 = bert_model(inputs_embeds=modified_embeddings_1)

# Element 1 of the model output is kept as the pooled sentence vector
pooled_output_1 = output_1[1]

In [None]:
# Tokenize the second sentence (special tokens included) and add a batch axis
input_ids_2 = torch.tensor(tokenizer.encode(sentence2, add_special_tokens=True)).unsqueeze(0)

# Define the token weights: 5 on the 6th position, 1 elsewhere
token_weights_2 = torch.tensor([1,1,1,1,1,5,1]).float()
assert token_weights_2.shape[0] == input_ids_2.shape[1], \
    "token_weights_2 needs exactly one weight per token of sentence2"

# Reuse the embedding layer that encoded sentence 1. A separately constructed
# CustomEmbedding owns its own lookup table and is not guaranteed to map a token
# to the same vector, which would make the similarity below meaningless.
custom_embedding_2 = custom_embedding_1

# Get the modified token embeddings from the shared embedding layer
modified_embeddings_2 = custom_embedding_2(input_ids_2, token_weights_2)

# Pass the modified token embeddings to the BERT model for further processing
output_2 = bert_model(inputs_embeds=modified_embeddings_2)

# Element 1 of the model output is kept as the pooled sentence vector
pooled_output_2 = output_2[1]

In [None]:
import torch.nn.functional as F

# Compare the two weighted pooled vectors along the feature axis (dim=1).
cosine_sim = F.cosine_similarity(pooled_output_1, pooled_output_2, dim=1)

# Report the score as a plain Python number.
print("Cosine similarity between the two sentences:", cosine_sim.item())

Cosine similarity between the two sentences: 0.6122373938560486


### Stanza with BERT

In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.2-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.2


In [None]:
!pip install stanza

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting stanza
  Downloading stanza-1.5.0-py3-none-any.whl (802 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m802.5/802.5 KB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 KB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=9452bd336a849c07ba9e8e013f1160717b8b5bfea71ae12919625d9e28a9656d
  Stored in directory: /root/.cache/pip/wheels/9a/b8/0f/f580817231cbf59f6ade9fd132ff60ada1de9f7dc85521f857
Successfully built emoji
Installing collected packages: emoji, stanza
Successfully installed emoji-2.2.0 stan

In [None]:
import stanza

# Fetch Stanza's English models.
stanza.download(lang='en')




Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Downloading default packages for language: en (English) ...


Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.5.0/models/default.zip:   0%|          | 0…

INFO:stanza:Finished downloading models and saved to /root/stanza_resources.


In [None]:
# English pipeline including the dependency parser. Note that `nlp` is rebound
# to a pipeline without `depparse` further down, before any text is processed.
nlp = stanza.Pipeline(
    lang='en',
    processors=",".join(["tokenize", "pos", "lemma", "depparse"]),
)

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.5.0.json:   0%|   …

INFO:stanza:Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| pos       | combined |
| lemma     | combined |
| depparse  | combined |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: pos
INFO:stanza:Loading: lemma
INFO:stanza:Loading: depparse
INFO:stanza:Done loading processors!


In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Load the BERT tokenizer and encoder from one checkpoint name.
# Note: this rebinds `model`, which held the SentenceTransformer in the SBERT cells.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


#### Concat Pooled layer with Additional Embeddings

In [None]:
# Encode one example sentence with BERT and keep its pooled vector.
input_text = "The quick brown fox jumped over the lazy dog."

# Tokenize to PyTorch tensors and unpack them straight into the model.
tokens = tokenizer(input_text, return_tensors='pt')
outputs = model(**tokens)

# pooler_output is the model's pooled sentence representation (a separate
# output field, not the per-token last hidden state).
pooled_output = outputs.pooler_output

# Show the shape of the pooled vector.
print(pooled_output.size())

torch.Size([1, 768])


In [None]:
# Display the tokenizer output that was fed to the model above.
tokens

{'input_ids': tensor([[  101,  1996,  4248,  2829,  4419,  5598,  2058,  1996, 13971,  3899,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Rebuild the Stanza pipeline without the dependency parser and annotate the
# example sentence. No embeddings are computed here; `doc` holds Stanza's
# annotation of `input_text`.
nlp = stanza.Pipeline(
    lang='en',
    processors=",".join(("tokenize", "pos", "lemma")),
    tokenize_no_ssplit=True,  # ask the tokenizer not to split into sentences
)

doc = nlp(input_text)



In [None]:
# Build one fixed-length vector per Stanza word from BERT's input word-embedding
# table. The Stanza annotation in `doc` is iterated directly; the previously
# referenced `dep_trees` is never defined anywhere in this notebook.
word_emb = []
for sent in doc.sentences:
    emb = []
    for word in sent.words:
        # The uncased vocabulary is lower-case and word-piece based, so a raw
        # surface form such as "The" is not in it. tokenizer.tokenize applies the
        # tokenizer's own normalisation and word-piece splitting before the lookup.
        piece_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(word.text))
        if not piece_ids:
            # Nothing survived tokenization: fall back to the unknown token.
            piece_ids = [tokenizer.unk_token_id]
        piece_vectors = model.embeddings.word_embeddings(torch.tensor([piece_ids]))
        # Average the word pieces: (1, n_pieces, hidden) -> (1, hidden) per word.
        emb.append(piece_vectors.mean(dim=1))
    # (n_words, 1, hidden) -> (n_words, hidden)
    word_emb.append(torch.mean(torch.stack(emb), dim=1))
# (n_sentences, n_words, hidden); stacking needs equal word counts per sentence.
word_emb = torch.stack(word_emb)

In [None]:
# Display the shape of the stacked per-word embeddings.
word_emb.shape

torch.Size([1, 5, 768])

In [None]:
# Average over axis 1 of `word_emb`, then display the resulting shape.
mean_x = word_emb.mean(dim=1)
mean_x.shape


torch.Size([1, 768])

In [None]:
# Combine a BERT hidden state with the averaged word embedding `mean_x`.
# Element 0 of the model output is indexed at the final sequence position.
sequence_states = model(**tokens)[0]
final_position_state = sequence_states[:, -1]

# dim=0 joins the two tensors as rows rather than extending the feature axis.
# NOTE(review): if a single concatenated feature vector per sentence was
# intended, dim=1 would be needed instead - confirm.
pooled_output = torch.cat((final_position_state, mean_x), dim=0)

# Show the shape of the combined tensor.
print(pooled_output.size())

torch.Size([2, 768])
