In [1]:
import vec2text
import torch
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel


def get_gtr_embeddings(text_list,
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer,
                       max_length: int = 128) -> torch.Tensor:
    """Embed texts by mean-pooling the encoder's last hidden state.

    Args:
        text_list: Text(s) to embed; handed to ``tokenizer`` unchanged.
        encoder: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        tokenizer: Tokenizer used to build the encoder inputs.
        max_length: Token length every input is truncated/padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The result of ``vec2text.models.model_utils.mean_pool`` applied to the
        encoder's last hidden state and the attention mask (presumably one
        vector per input text -- confirm against vec2text).
    """
    # Move the batch to wherever the encoder lives instead of assuming "cuda";
    # a hard-coded device breaks for CPU encoders and for encoders on a
    # non-default GPU (inputs and weights would end up on different devices).
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=max_length,
                       truncation=True,
                       padding="max_length",).to(encoder.device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state
        # The attention mask is passed along with the hidden states, presumably
        # so the padding added above is excluded from the average.
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])

    return embeddings


# Both the encoder and its tokenizer come from the same GTR-T5 checkpoint.
gtr_checkpoint = "sentence-transformers/gtr-t5-base"

# Only the encoder sub-module of the checkpoint is kept, and it is moved to the GPU.
encoder = AutoModel.from_pretrained(gtr_checkpoint).encoder.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(gtr_checkpoint)

# Corrector used by vec2text.invert_embeddings below.
corrector = vec2text.load_corrector("jxm/gtr__msmarco__128")

# Two sample sentences to embed and then reconstruct.
sample_texts = [
    "Jack Morris is a PhD student at Cornell Tech in New York City",
    "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity",
]
embeddings = get_gtr_embeddings(sample_texts, encoder, tokenizer)

# Left as the final expression so the notebook displays the returned value.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=20)

Some weights of T5Model were not initialized from the model checkpoint at sentence-transformers/gtr-t5-base and are newly initialized: ['decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.0.SelfAttention.q.weight', 'decoder.block.6.layer.1.EncDecAttention.k.weight', 'decoder.block.0.layer.2.layer_norm.weight', 'decoder.block.5.layer.1.EncDecAttention.k.weight', 'decoder.block.2.layer.0.layer_norm.weight', 'decoder.block.5.layer.2.DenseReluDense.wi.weight', 'decoder.block.8.layer.2.DenseReluDense.wo.weight', 'decoder.block.0.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.2.DenseReluDense.wi.weight', 'decoder.block.7.layer.2.DenseReluDense.wo.weight', 'decoder.block.6.layer.2.layer_norm.weight', 'decoder.block.1.layer.0.SelfAttention.v.weight', 'decoder.block.0.layer.0.layer_norm.weight', 'decoder.block.7.layer.0.SelfAttention.o.weight', 'decoder.block.1.layer.1.EncDecAttention.k.weight', 'decoder.block.7.layer.1.EncDecAttention.o.weight', 'decoder.block

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

['Jack Morris Morris is a PhD student at  Cornell Tech in New York City ',
 'It was the best of times, it was the worst of times, it was the age of wisdom, it was the epoch of foolishness']

In [4]:
# Long biography passage to embed and then reconstruct.
source_texts = [
    'He graduated with a Bachelor of Chemical Engineering and a PhD in Biology from NUS . He did his postdoctoral studies in Howard Hughes Medical Institute , University of Pennsylvania Medical School and returned to NUS as a Lee Kuan Yew Postdoctoral Fellow and later joined his alma mater department as an Asst Professor in 2010 . He has contributed to diverse topics spanning from chemical synthesis to sensor development ; nanosafety to nanomedicine topics .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=20)

['studied as a PhD student at the University of Washington and in 2010 joined the Lee Hughes Asylum in Chemical Engineering along with Dr. A.R']

In [7]:
# Same biography passage as the num_steps=20 run, inverted here with num_steps=4.
source_texts = [
    'He graduated with a Bachelor of Chemical Engineering and a PhD in Biology from NUS . He did his postdoctoral studies in Howard Hughes Medical Institute , University of Pennsylvania Medical School and returned to NUS as a Lee Kuan Yew Postdoctoral Fellow and later joined his alma mater department as an Asst Professor in 2010 . He has contributed to diverse topics spanning from chemical synthesis to sensor development ; nanosafety to nanomedicine topics .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=4)

['entered the University of Leeds as a PhD student and later studied Chemical Biology at MIT Alexander Kuan Hughes and completed his M.Sc. in']

In [5]:
# Shortened version of the biography passage (first two sentences only).
source_texts = [
    'He graduated with a Bachelor of Chemical Engineering and a PhD in Biology from NUS . He did his postdoctoral studies in Howard Hughes Medical Institute.',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=20)

['He graduated with a Bachelor of Science in Biology and Chemical Engineering from Howard Hughes University. He did post-doctoral studies in the NIPS']

In [None]:
# NOTE(review): this unexecuted cell holds a bare tuple literal. It looks like a
# pasted (index, original text, reconstructed text) record -- confirm its origin.
# The second element is the same passage embedded in the cells that follow.
(33439,
 'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
 'Since the late 2000s she has been working in the area of file sharing and document management. She is proficient in PDF and PDF/NET formats. She')

In [9]:
# .NET developer passage to embed and then reconstruct (no sequence_beam_width given).
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=20)

['he has been working in.NET technology. His areas of expertise are PDF and multi-format document viewer and web development, most recently in C']

In [11]:
# .NET developer passage, inverted with num_steps=20 and sequence_beam_width=1.
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(), corrector=corrector, num_steps=20, sequence_beam_width=1
)

['he has been working in.NET technology. His areas of expertise are PDF and multi-format document viewer and web development, most recently in C']

In [10]:
# .NET developer passage, inverted with num_steps=20 and sequence_beam_width=4.
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(), corrector=corrector, num_steps=20, sequence_beam_width=4
)

['in recent years he has been working in PDF and document viewer technologies. His area of expertise is working with ASP.NET, C#.NET']

In [14]:
# .NET developer passage, inverted with num_steps=10 and sequence_beam_width=6.
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(), corrector=corrector, num_steps=10, sequence_beam_width=6
)

['In recent years he has been working in.NET technology. His areas of expertise are PDF viewer and multi-format document editor for the C#']

In [15]:
# .NET developer passage, inverted with num_steps=20 and sequence_beam_width=6.
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(), corrector=corrector, num_steps=20, sequence_beam_width=6
)

['in recent years he has been working in.NET technology. His area of expertise is in PDF and multi-format document viewer, C# editor']

In [17]:
# Count the whitespace-separated words in the reconstruction obtained with
# num_steps=20 and sequence_beam_width=6.
inverted_text = 'in recent years he has been working in.NET technology. His area of expertise is in PDF and multi-format document viewer, C# editor'
len(inverted_text.split())

22

In [20]:
# .NET developer passage, inverted with num_steps=50 and sequence_beam_width=8.
source_texts = [
    'Over the years , he has worked in C#.NET and .NET Web technologies . His area of expertise is in PDF and image processing . He is currently working on the multi - format document viewer and editor technologies for .NET .',
]
embeddings = get_gtr_embeddings(source_texts, encoder, tokenizer)

# Final expression of the cell: its return value is what the notebook shows.
vec2text.invert_embeddings(
    embeddings=embeddings.cuda(), corrector=corrector, num_steps=50, sequence_beam_width=8
)

['in recent years he has been working in PDF and document viewer technologies. His area of expertise is working with ASP.NET, C# NET']

# jxm/gtr__msmarco__128

In [1]:
import vec2text
import torch
from transformers import AutoModel, AutoTokenizer, PreTrainedTokenizer, PreTrainedModel


def get_gtr_embeddings(text_list,
                       encoder: PreTrainedModel,
                       tokenizer: PreTrainedTokenizer,
                       max_length: int = 128) -> torch.Tensor:
    """Embed texts by mean-pooling the encoder's last hidden state.

    Args:
        text_list: Text(s) to embed; handed to ``tokenizer`` unchanged.
        encoder: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        tokenizer: Tokenizer used to build the encoder inputs.
        max_length: Token length every input is truncated/padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The result of ``vec2text.models.model_utils.mean_pool`` applied to the
        encoder's last hidden state and the attention mask (presumably one
        vector per input text -- confirm against vec2text).
    """
    # Move the batch to wherever the encoder lives instead of assuming "cuda";
    # a hard-coded device breaks for CPU encoders and for encoders on a
    # non-default GPU (inputs and weights would end up on different devices).
    inputs = tokenizer(text_list,
                       return_tensors="pt",
                       max_length=max_length,
                       truncation=True,
                       padding="max_length",).to(encoder.device)

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        model_output = encoder(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        hidden_state = model_output.last_hidden_state
        # The attention mask is passed along with the hidden states, presumably
        # so the padding added above is excluded from the average.
        embeddings = vec2text.models.model_utils.mean_pool(hidden_state, inputs['attention_mask'])

    return embeddings


# Both the encoder and its tokenizer come from the same GTR-T5 checkpoint.
gtr_checkpoint = "sentence-transformers/gtr-t5-base"

# Only the encoder sub-module of the checkpoint is kept, and it is moved to the GPU.
encoder = AutoModel.from_pretrained(gtr_checkpoint).encoder.to("cuda")
tokenizer = AutoTokenizer.from_pretrained(gtr_checkpoint)

# Corrector used by vec2text.invert_embeddings below.
corrector = vec2text.load_corrector("jxm/gtr__msmarco__128")

# Two sample sentences to embed and then reconstruct.
sample_texts = [
    "Jack Morris is a PhD student at Cornell Tech in New York City",
    "It was the best of times, it was the worst of times, it was the age of wisdom, it was the age of foolishness, it was the epoch of belief, it was the epoch of incredulity",
]
embeddings = get_gtr_embeddings(sample_texts, encoder, tokenizer)

# Left as the final expression so the notebook displays the returned value.
vec2text.invert_embeddings(embeddings=embeddings.cuda(), corrector=corrector, num_steps=20)

Some weights of T5Model were not initialized from the model checkpoint at sentence-transformers/gtr-t5-base and are newly initialized: ['decoder.block.9.layer.0.layer_norm.weight', 'decoder.block.9.layer.0.SelfAttention.o.weight', 'decoder.block.11.layer.1.EncDecAttention.k.weight', 'decoder.block.6.layer.0.SelfAttention.k.weight', 'decoder.block.9.layer.1.EncDecAttention.q.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.5.layer.1.layer_norm.weight', 'decoder.block.7.layer.2.layer_norm.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.9.layer.0.SelfAttention.v.weight', 'decoder.final_layer_norm.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.o.weight', 'decoder.block.8.layer.2.layer_norm.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.6.layer.0.layer_norm.weight', 'decoder.block.4.layer.0.SelfAttention.

config.json:   0%|          | 0.00/4.50k [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/36.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

pytorch_model-00001-of-00008.bin:   0%|          | 0.00/193M [00:00<?, ?B/s]

pytorch_model-00002-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00003-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00004-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

pytorch_model-00005-of-00008.bin:   0%|          | 0.00/144M [00:00<?, ?B/s]

pytorch_model-00006-of-00008.bin:   0%|          | 0.00/193M [00:00<?, ?B/s]

pytorch_model-00007-of-00008.bin:   0%|          | 0.00/198M [00:00<?, ?B/s]

ChunkedEncodingError: ('Connection broken: IncompleteRead(71664902 bytes read, 126578644 more expected)', IncompleteRead(71664902 bytes read, 126578644 more expected))