In [1]:
import os

DATA_DIR = "data" # This may need to be changed on different machines

# Make sure the working directory is one from which DATA_DIR is reachable.
# The notebook lives in src/nb, while DATA_DIR is expected to be in (or to
# start at) the repository root, two levels up.
if not os.path.exists(DATA_DIR):
    repo_root = os.path.join("..", "..")
    # Check the candidate location *before* changing directory: on failure the
    # working directory is left untouched, so re-running this cell does not
    # climb two more levels each time.
    if not os.path.exists(os.path.join(repo_root, DATA_DIR)):
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        raise FileNotFoundError(f"ERROR: DATA_DIR={DATA_DIR} not found")
    os.chdir(repo_root)

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Local checkpoint directory, resolved relative to the current working directory.
model_name = "models/gpt2_large"

# Load the tokenizer and the causal language model from the same checkpoint.
# Loading emits a warning that the checkpoint's
# `transformer.extra_embedding_project.*` weights are not used by GPT2LMHeadModel.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

Some weights of the model checkpoint at models/gpt2_large were not used when initializing GPT2LMHeadModel: ['transformer.extra_embedding_project.bias', 'transformer.extra_embedding_project.weight']
- This IS expected if you are initializing GPT2LMHeadModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2LMHeadModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Move the model onto the selected device; the input tensors built in later
# cells are sent to the same device before being passed to the model.
model = model.to(device)

In [11]:
# Demo prompt: the sentence to paraphrase followed by the "<bos>" marker.
text = "hello, how are you?<bos>"

# Encode the prompt as PyTorch tensors, then keep only the token ids and move
# them to the device the model is on.
encoded = tokenizer(text, return_tensors="pt")
input_ids = encoded["input_ids"].to(device)
print(input_ids)

tensor([[31373,    11,   703,   389,   345,    30, 50264]], device='cuda:0')


In [12]:

# Extend the prompt by up to 20 new tokens, then turn the first (only)
# sequence in the batch back into text for inspection.
generated = model.generate(input_ids, max_new_tokens=20)
decoded = tokenizer.decode(generated[0])
print(decoded)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


hello, how are you?<bos>how are you doing?<eos>how are you doing?<eos>how are you doing?<eos>how are


In [23]:
def get_paraphrase(text, max_rounds=8):
    """Generate a paraphrase of ``text`` with the module-level ``model``.

    The prompt is ``text`` followed by the ``"<bos>"`` marker; the paraphrase
    is whatever the model generates after that marker, up to (not including)
    the first ``"<eos>"`` marker.

    Only the *newly generated* tokens are decoded and searched for
    ``"<eos>"``, so marker strings that happen to appear inside ``text``
    itself cannot short-circuit generation or shift the extracted span.

    Args:
        text: The sentence to paraphrase.
        max_rounds: Upper bound on the number of ``model.generate`` calls.
            Each round extends the sequence by the same token budget; the
            cap guarantees termination if the model never emits ``"<eos>"``.

    Returns:
        The generated text up to the first ``"<eos>"``. If no ``"<eos>"`` was
        produced within ``max_rounds`` rounds, everything generated so far is
        returned.

    Raises:
        ValueError: If ``max_rounds`` is less than 1.
    """
    if max_rounds < 1:
        raise ValueError("max_rounds must be at least 1")

    prompt = text + "<bos>"
    # Rough per-round budget: ~4 characters per token, with 50% headroom.
    n_new_tokens = int(len(prompt) // 4 * 1.5)

    sequence = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    prompt_len = sequence.shape[1]

    continuation = ""
    for _ in range(max_rounds):
        # Continue from the token ids produced so far rather than decoding and
        # re-tokenizing, so the prompt/continuation boundary stays exact.
        sequence = model.generate(sequence, max_new_tokens=n_new_tokens)
        continuation = tokenizer.decode(sequence[0, prompt_len:])
        if "<eos>" in continuation:
            break

    # partition() yields the whole string when "<eos>" is absent.
    return continuation.partition("<eos>")[0]
        


In [24]:
# Quick check on a short, clean greeting; the returned string is shown as the cell output.
get_paraphrase("hello, how are you?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'how are you doing?'

In [25]:
# Try a longer, informal input with slang and irregular spelling; the returned string is shown as the cell output.
get_paraphrase("What is up my homeis?? how the heckity-hek are we today?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"what's up with my house today? How do we do it?"