In [1]:
from diffusers import DiffusionPipeline
from transformers import CLIPTokenizer
import torch
# Pick the compute device once, and pair it with a matching dtype:
# bfloat16 when CUDA is available, float32 otherwise.
if torch.cuda.is_available():
    torch_device, torch_dtype = "cuda", torch.bfloat16
else:
    torch_device, torch_dtype = "cpu", torch.float32
import numpy as np


In [2]:
# Load the CLIP tokenizer stored in the "tokenizer" subfolder of the
# Stable Diffusion v1.4 repository.
# No torch_dtype is passed here: it is a model-weight option and is not a
# documented argument of the tokenizer loader.
tokenizer = CLIPTokenizer.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="tokenizer")

In [100]:
# Source prompt (x) and edited prompt (y) whose tokens will be aligned.
x, y = "hippopot", "incomprehensible"

# Word-level view of both prompts (split on single spaces).
words_x, words_y = x.split(' '), y.split(' ')

# Square alignment matrix sized to the tokenizer's maximum sequence length;
# it starts out all zeros and is filled in by a later cell.
max_len = tokenizer.model_max_length
mapper = np.zeros((max_len,) * 2)

In [101]:
def get_word_inds(text: str, word_place: "int | str | list[int]", tokenizer):
    """
    Return the token positions that make up the selected word(s) of ``text``.

    The text is split on single spaces into words. It is also encoded with
    ``tokenizer``; the first and last encoded ids are dropped and every
    remaining id is decoded back to a string. Walking over these decoded
    pieces, the function accumulates their lengths to decide when one word
    ends and the next begins, and records the position of every piece that
    belongs to a selected word.

    Args:
        text: The prompt to analyse.
        word_place: Which word(s) to look up. A string selects every word equal
            to it, an integer (Python or numpy) selects the word at that index,
            and any other sized container is used as a collection of word
            indices as-is.
        tokenizer: Object providing ``encode(text)`` (a sequence of ids) and
            ``decode([id])`` (a string).

    Returns:
        A 1-D integer ``np.ndarray`` of token positions. Positions are offset
        by one so that they index into the full encoded sequence, including
        its dropped first id. The array is empty, but still integer-typed,
        when nothing matches.
    """
    split_text = text.split(" ")
    if isinstance(word_place, str):
        word_place = [i for i, word in enumerate(split_text) if word_place == word]
    elif isinstance(word_place, (int, np.integer)):
        # Accept numpy integers too (e.g. indices taken from an array).
        word_place = [int(word_place)]
    out = []
    if len(word_place) > 0:
        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
        cur_len, ptr = 0, 0

        for i, piece in enumerate(words_encode):
            if ptr >= len(split_text):
                # More pieces than the words can account for: stop instead of
                # indexing past the end of split_text.
                break
            cur_len += len(piece)
            if ptr in word_place:
                out.append(i + 1)
            if cur_len >= len(split_text[ptr]):
                # The current word is fully covered; move on to the next one.
                ptr += 1
                cur_len = 0
    # Force an integer dtype so an empty result is still usable as an index.
    return np.array(out, dtype=int)

In [106]:
# Fill `mapper` with the token alignment between prompts x and y when a single
# word has been replaced. If several words differ, only the last differing
# word is treated as the replacement.
if len(words_y) > len(words_x):
    raise ValueError(
        f"y has {len(words_y)} words but x has only {len(words_x)}; "
        "cannot compare the prompts word by word."
    )

# Reset before scanning so that a value left over from an earlier run of this
# cell is never reused when the two prompts turn out to be identical.
diff_indices = None
for i in range(len(words_y)):
    if words_y[i] != words_x[i]:
        diff_indices = i

if diff_indices is None:
    # Nothing was replaced: the loop below then only writes the diagonal.
    source_ind = target_ind = np.array([], dtype=int)
else:
    source_ind = get_word_inds(x, diff_indices, tokenizer)
    target_ind = get_word_inds(y, diff_indices, tokenizer)

# i walks the rows (source positions), j walks the columns (target positions).
i = j = 0

while i < len(mapper) and j < len(mapper):
    if len(source_ind) > 0 and i == source_ind[0]:
        if len(source_ind) == len(target_ind):
            # Same number of positions on both sides: pair them one-to-one.
            for s_idx, t_idx in zip(source_ind, target_ind):
                mapper[s_idx, t_idx] = 1.0
        elif len(source_ind) > len(target_ind):
            # More source positions than target ones: every pair gets
            # 1 / len(source_ind).
            ratio = 1.0 / len(source_ind)
            for t_idx in target_ind:
                for s_idx in source_ind:
                    mapper[s_idx, t_idx] = ratio
        else:
            # More target positions than source ones: every pair gets
            # 1 / len(target_ind).
            ratio = 1.0 / len(target_ind)
            for s_idx in source_ind:
                for t_idx in target_ind:
                    mapper[s_idx, t_idx] = ratio
        # Skip past the replaced word on both axes.
        i += len(source_ind)
        j += len(target_ind)
    else:
        mapper[i, j] = 1.0
        i += 1
        j += 1

In [107]:
# Show the top-left 6x6 corner of the mapper.
mapper[:6, :6]

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        ],
       [0.        , 0.33333333, 0.33333333, 0.33333333, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [82]:
# differing_indices = [i for i, (wx, wy) in enumerate(zip(words_x, words_y)) if wx != wy]
# move_r, move_c = 0, 0
# idx = 0
# mapper[0][0] = 1
# while (idx + move_r + 1) < len(mapper) and (idx + move_c + 1) < len(mapper):
#     if idx in differing_indices:
#             tokens_x = tokenizer(words_x[idx], add_special_tokens=False, return_tensors="np")['input_ids'][0]
#             tokens_y = tokenizer(words_y[idx], add_special_tokens=False, return_tensors="np")['input_ids'][0]
#             len_x, len_y = len(tokens_x), len(tokens_y)
#             move_c += (len_y-1)
#             move_r += (len_x-1)
#             if (len_y * len_x) == len_y or (len_y * len_x) == len_x:
#                 val = 1 / (len_y * len_x)
#             else:
#                 val = 1 / len_y 
#             for i in range(len_x):
#                 for j in range(len_y):
#                         mapper[idx + i + 1][idx + j + 1] = val
                    
#     else:
#         mapper[idx + move_r + 1][idx + move_c + 1] = 1
#     idx += 1



In [83]:
# mapper1 = mapper.copy()

In [84]:
import pdb

def get_word_inds(text: str, word_place: "int | str | list[int]", tokenizer):
    """
    Return the token positions that make up the selected word(s) of ``text``.

    The text is split on single spaces into words. It is also encoded with
    ``tokenizer``; the first and last encoded ids are dropped and every
    remaining id is decoded back to a string. Walking over these decoded
    pieces, the function accumulates their lengths to decide when one word
    ends and the next begins, and records the position of every piece that
    belongs to a selected word.

    Args:
        text: The prompt to analyse.
        word_place: Which word(s) to look up. A string selects every word equal
            to it, an integer (Python or numpy) selects the word at that index,
            and any other sized container is used as a collection of word
            indices as-is.
        tokenizer: Object providing ``encode(text)`` (a sequence of ids) and
            ``decode([id])`` (a string).

    Returns:
        A 1-D integer ``np.ndarray`` of token positions. Positions are offset
        by one so that they index into the full encoded sequence, including
        its dropped first id. The array is empty, but still integer-typed,
        when nothing matches.
    """
    split_text = text.split(" ")
    if isinstance(word_place, str):
        word_place = [i for i, word in enumerate(split_text) if word_place == word]
    elif isinstance(word_place, (int, np.integer)):
        # Accept numpy integers too (e.g. indices taken from an array).
        word_place = [int(word_place)]
    out = []
    if len(word_place) > 0:
        words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
        cur_len, ptr = 0, 0

        for i, piece in enumerate(words_encode):
            if ptr >= len(split_text):
                # More pieces than the words can account for: stop instead of
                # indexing past the end of split_text.
                break
            cur_len += len(piece)
            if ptr in word_place:
                out.append(i + 1)
            if cur_len >= len(split_text[ptr]):
                # The current word is fully covered; move on to the next one.
                ptr += 1
                cur_len = 0
    # Force an integer dtype so an empty result is still usable as an index.
    return np.array(out, dtype=int)

# Rebuild the mapper from scratch, this time handling every word that differs
# between x and y rather than a single one.
mapper = np.zeros((max_len, max_len))

# Word positions where the prompts disagree, and the token positions each of
# those words occupies in x (source) and in y (target).
diff_indices = [idx for idx, word_y in enumerate(words_y) if word_y != words_x[idx]]
source_inds = [get_word_inds(x, idx, tokenizer) for idx in diff_indices]
target_inds = [get_word_inds(y, idx, tokenizer) for idx in diff_indices]

# i walks the rows (source positions), j the columns (target positions);
# `current` points at the next replaced word still waiting to be placed.
i = j = 0
current = 0
while i < max_len and j < max_len:
    at_replaced_word = current < len(source_inds) and source_inds[current][0] == i
    if not at_replaced_word:
        # Position outside any replaced word: map it one-to-one.
        mapper[i, j] = 1.0
        i += 1
        j += 1
        continue

    src_tokens, tgt_tokens = source_inds[current], target_inds[current]
    n_src, n_tgt = len(src_tokens), len(tgt_tokens)
    if n_src == n_tgt:
        # Same number of positions on both sides: pair them one-to-one.
        mapper[src_tokens, tgt_tokens] = 1.0
    else:
        # Unequal counts: every (source, target) pair gets one over the
        # larger of the two counts.
        ratio = 1.0 / max(n_src, n_tgt)
        mapper[np.ix_(src_tokens, tgt_tokens)] = ratio

    # Move past the replaced word on both axes and on to the next replacement.
    current += 1
    i += n_src
    j += n_tgt


In [85]:
# Compare the two mappers element by element. Reducing each side with .all()
# first would collapse every matrix to a single bool, so any two matrices that
# both contain a zero would compare equal.
# NOTE(review): in this view mapper1 is only assigned in a commented-out line;
# confirm it is defined before this cell runs.
np.testing.assert_array_equal(mapper1, mapper)

In [69]:
# Run the tokenizer on y to inspect the encoding it produces.
tokenizer(y)

{'input_ids': [49406, 585, 739, 512, 12050, 14507, 49407], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}

In [70]:
# Echo the current value of y.
y

'it was incomprehensible'

In [35]:
# Show rows 0-5 and columns 0-5 of the mapper.
mapper[0:6, :6]

array([[1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.33333333, 0.33333333,
        0.33333333],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        ]])