In [1]:
# Dependency installation code
# !pip install PyPDF2 torch transformers

# Import dependencies
import PyPDF2
import torch
from transformers import AutoTokenizer, T5EncoderModel

# Checkpoint used for both the tokenizer and the encoder.
# "google-t5/t5-large" is a >2.8 GB download with a noticeably longer load time;
# set MODEL_NAME to "google-t5/t5-small" for quick experiments.
MODEL_NAME = "google-t5/t5-large"

# Pass `model_max_length` explicitly. The tokenizer otherwise emits a warning
# about the legacy 512-token default, and that warning itself recommends
# instantiating the tokenizer with `model_max_length`. The previously used
# `suppress_warnings=True` keyword did not silence it.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, model_max_length=512)

# Encoder-only T5: exposes `last_hidden_state` without running the decoder.
model = T5EncoderModel.from_pretrained(MODEL_NAME)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [13]:
# Texts to compare; edit these two strings to score a different pair.
sentence_text = 'Our products are produced through sustainable practices'
query_text = 'This statement is related to Sustainable practices'

# Tokenize the input sentence and the query into PyTorch id tensors.
input_ids = tokenizer(sentence_text, return_tensors='pt').input_ids
query_ids = tokenizer(query_text, return_tensors='pt').input_ids

# Generate encodings. This is inference only, so disable autograd: without
# torch.no_grad() every forward pass records a computation graph for the whole
# model (wasted memory) and the resulting tensors carry a grad_fn.
with torch.no_grad():
    outputs = model(input_ids=input_ids)
    query = model(input_ids=query_ids)

# Retrieve the encoder's final-layer hidden states for the sentence and the query.
last_hidden_states = outputs.last_hidden_state
last_hidden_states_query = query.last_hidden_state

## Max Pooling Strategy

In [14]:
# Max pooling: collapse the token dimension (dim=1) by keeping, for every hidden
# feature, the largest value found across all tokens. `unsqueeze(0)` then
# prepends an extra size-1 axis to the pooled sentence tensor.
pooled_last_hidden_states = last_hidden_states.max(dim=1).values.unsqueeze(0)

# The query is pooled the same way and then flattened to a single row with
# `view(1, -1)` so that its transpose can be used as a column vector below.
pooled_last_hidden_states_query = (
    last_hidden_states_query.max(dim=1).values.unsqueeze(0).view(1, -1)
)

# Dot product of the two pooled representations (raw, unnormalized score).
similarity = pooled_last_hidden_states @ pooled_last_hidden_states_query.t()

# Report the score. `.item()` only works on a one-element tensor, so this
# assumes a single sentence and a single query were encoded.
print("Similarity Score:", similarity.item())


Similarity Score: 69.13910675048828


Max Pooling Results (raw, unnormalized dot-product scores):

| Sentence                     | Query                        | Score (small model) | Score (large model) |
|------------------------------|------------------------------|---------------------|---------------------|
| 'This product is sustainable'| 'Where is the India? '       | 14.40               | 34.91               |
| 'India is in Asia'           | 'Where is the India? '       | 15.46               | 43.41               |      
| 'Our product is sustainable' | 'Sustainable '               | 09.38               | 27.79               |


## Mean Pooling Strategy

In [15]:
# Mean pooling: collapse the token dimension (dim=1) by averaging each hidden
# feature over all tokens. `unsqueeze(0)` then prepends an extra size-1 axis to
# the pooled sentence tensor.
pooled_last_hidden_states = last_hidden_states.mean(dim=1).unsqueeze(0)

# The query is pooled the same way and then flattened to a single row with
# `view(1, -1)` so that its transpose can be used as a column vector below.
pooled_last_hidden_states_query = (
    last_hidden_states_query.mean(dim=1).unsqueeze(0).view(1, -1)
)

# Dot product of the two pooled representations (raw, unnormalized score).
similarity = pooled_last_hidden_states @ pooled_last_hidden_states_query.t()

# Report the score. `.item()` only works on a one-element tensor, so this
# assumes a single sentence and a single query were encoded.
print("Similarity Score:", similarity.item())

Similarity Score: 5.169628620147705


Mean Pooling Results (raw, unnormalized dot-product scores):

| Sentence                     | Query                        | Score (small model)| Score (large model) |
|------------------------------|------------------------------|--------------------|---------------------|
| 'This product is sustainable'| 'Where is the India? '       | 1.74               |2.88                 |
| 'India is in Asia'           | 'Where is the India? '       | 2.56               |4.55                 |
| 'Our product is sustainable' | 'Sustainable '               | 2.65               |4.70                 |