In [1]:
import torch
from unsloth import FastLanguageModel


dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)


load_in_4bit = False  # Use 4bit quantization to reduce memory usage. Can be False.


# 4bit pre quantized models we support for 4x faster downloading + no OOMs.


fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",  # New Google 6 trillion tokens model 2.5x faster!
    "unsloth/gemma-2b-bnb-4bit",
]  # More models at https://huggingface.co/unsloth


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3",  # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length=max_seq_length,
    dtype=torch.bfloat16 if dtype is None else dtype,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


Unsloth: You passed in `unsloth/mistral-7b-v0.3` and `load_in_4bit = True`.
We shall load `unsloth/mistral-7b-v0.3-bnb-4bit` for 4x faster loading.


==((====))==  Unsloth: Fast Mistral patching release 2024.5
   \\   /|    GPU: NVIDIA A40. Max memory: 44.349 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu118. CUDA = 8.6. CUDA Toolkit = 11.8.
\        /    Bfloat16 = TRUE. Xformers = 0.0.26.post1+cu118. FA = True.
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


In [2]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth 2024.5 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
from datasets import load_dataset
from trl import DataCollatorForCompletionOnlyLM

dataset = load_dataset("csv", data_files="../data/random_sft.csv")


def formatting_prompts_func(example):
    messages = [
        {"role": "system", "content": "You are a helpful embedding decoder"},
        {"role": "user", "content": example["inputs"]},
        {"role": "assistant", "content": example["label"]},
    ]
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example


response_template = "[/INST]"
collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

In [4]:
dataset = dataset.map(formatting_prompts_func)

In [6]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=300,  # Set num_train_epochs = 1 for full training runs
        learning_rate=6e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_hf",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [7]:
trainer.train()

[2024-05-24 20:10:07,791] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 7,764 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 300
 "-____-"     Number of trainable parameters = 41,943,040
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmaoli[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.6277
2,1.6283
3,1.6213
4,1.635
5,1.6197
6,1.6631
7,1.617
8,1.6162
9,1.6103
10,1.6538


KeyboardInterrupt: 

In [16]:
# alpaca_prompt = Copied from above
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference


def formatting_prompts_func(example):
    messages = [
        {"role": "system", "content": "You are a helpful embedding decoder"},
        {"role": "user", "content": example["inputs"]},
        {"role": "assistant", "content": ""},
    ]
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)
    return example


inputs = tokenizer.apply_chat_template(
    formatting_prompts_func(dataset["train"][0])["text"], return_tensors="pt"
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
tokenizer.batch_decode(outputs)

TemplateError: Conversation roles must alternate user/assistant/user/assistant/...

# Fine tune own model


In [1]:
import torch
from torch import nn
from transformers import AutoModelForCausalLM

class CustomModelForCausalLM(nn.Module):
    def __init__(self, model_name):
        super(CustomModelForCausalLM, self).__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.embedding_layer = self.model.get_input_embeddings()
        # Freeze the embedding layer
        for param in self.embedding_layer.parameters():
            param.requires_grad = False
    
    def forward(self, input_embeddings, attention_mask=None, labels=None):
        # Forward pass through the remaining part of the model
        outputs = self.model(inputs_embeds=input_embeddings, attention_mask=attention_mask, labels=labels)
        return outputs


In [1]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Load tokenizer and dataset
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset("your_dataset_name")

def preprocess_function(examples):
    inputs = tokenizer(examples["text"], max_length=4096, truncation=True, padding="max_length", return_tensors="pt")
    with torch.no_grad():
        embeddings = model.embedding_layer(inputs["input_ids"]).detach().cpu()
    return {"embeddings": embeddings, "attention_mask": inputs["attention_mask"], "labels": inputs["input_ids"]}

tokenized_dataset = dataset.map(preprocess_function, batched=True)

# Convert the dataset to torch tensors
tokenized_dataset.set_format(type='torch', columns=['embeddings', 'attention_mask', 'labels'])


KeyboardInterrupt: 

In [2]:
import os
import torch
from torch import nn
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer
import pandas as pd
import wandb
# Initialize Weights and Biases for logging
wandb.init(project="sft_embedding_decoding")

# Load the pre-trained model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name,device_map='auto')

# Convert the DataFrame to a Hugging Face Dataset
df = pd.read_csv("../data/mistral_sft_training.csv")

[2024-06-20 13:34:45,786] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmaoli[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
import torch

def preprocess_function(row):
    # Tokenize the instruction (assuming you have the tokenizer and model defined)
    instruction = '<s>[INST] It is possible to generate sentence based on starting sentence embedding, so please based on the following sentence embedding reconstruct the meaning of it: [/INST]'
    instruction_tokens = tokenizer(instruction, return_tensors="pt", truncation=True, padding=False)

    with torch.no_grad():
        instruction_embeddings = model.get_input_embeddings()(instruction_tokens['input_ids']).squeeze(0).cpu()

    # Get the combined embedding from the row
    combined_embedding = torch.tensor(row['combined'])

    # Print shapes for debugging


    # Check dimensions before concatenation
    if combined_embedding.dim() == 1:
        combined_embedding = combined_embedding.unsqueeze(0)
    
    # Print shapes after ensuring correct dimensions

    # Ensure the dimensions match for concatenation along dim=1
    combined_input = torch.cat((instruction_embeddings, combined_embedding), dim=0)
    assert combined_input.squeeze(0).shape == torch.Size([33, 4096])


    return combined_input.squeeze(0).tolist(), row['label']
#df['combined'] = df['combined'].apply(eval)
df['combined_input'], df['target'] = zip(*df.apply(preprocess_function, axis=1))


In [16]:
torch.tensor(df['combined_input'][0])

tensor([[-4.3335e-03,  5.4121e-05, -5.5847e-03,  ..., -3.2902e-05,
         -9.3842e-04,  1.2517e-05],
        [-4.3335e-03,  5.4121e-05, -5.5847e-03,  ..., -3.2902e-05,
         -9.3842e-04,  1.2517e-05],
        [ 9.0122e-05, -3.3379e-05,  3.8862e-05,  ...,  4.1485e-05,
         -6.0081e-05, -3.1233e-05],
        ...,
        [-1.1597e-03,  6.9809e-04, -4.1962e-04,  ...,  7.4387e-04,
          2.5940e-04,  7.2098e-04],
        [-3.8385e-05, -8.2016e-05, -8.4400e-05,  ...,  1.9455e-04,
          5.7936e-05,  7.6771e-05],
        [ 3.2698e+00,  4.9438e-01, -3.3527e+00,  ...,  4.0738e+00,
         -4.5719e-01,  4.1453e+00]])

In [24]:
df = df.loc[~df['target'].isna(),:]

In [25]:
from datasets import Dataset
dataset = Dataset.from_pandas(df[['combined_input', 'target']])
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the target labels
def tokenize_targets(examples):
    labels = tokenizer(examples['target'], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    examples["labels"] = labels["input_ids"]
    examples['combined_input'] = torch.tensor(examples['combined_input'])
    return examples

tokenized_dataset = dataset.map(tokenize_targets, batched=True)

Map:   0%|          | 0/4949 [00:00<?, ? examples/s]

In [34]:

from peft import LoraConfig
# Step 2: Define the training arguments
training_args = TrainingArguments(
    output_dir="/nfs/turbo/isr-fconrad1/model/sft-mistral-decoder",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    logging_steps=30,
    num_train_epochs=10,
    max_steps=5000,
    report_to="wandb",
)

# Step 3: Define the LoraConfig
peft_config = LoraConfig(
    r=64,
    lora_alpha=16,
    bias="none",
    task_type="CAUSAL_LM",
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],  # Specify target modules for LoRA
)

# Step 4: Define the Custom Model
class CustomModelForCausalLM(AutoModelForCausalLM):
    def __init__(self, model_name):
        super().__init__(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.embedding_layer = self.model.get_input_embeddings()
        # Freeze the embedding layer
        for param in self.embedding_layer.parameters():
            param.requires_grad = False
    
    def forward(self, input_embeddings, attention_mask=None, labels=None):
        outputs = self.model(inputs_embeds=input_embeddings, attention_mask=attention_mask, labels=labels)
        return outputs

model = CustomModelForCausalLM.from_pretrained(model_name)

# Step 5: Define the Trainer
sft_config = SFTConfig(
    max_seq_length=4096,
    dataset_text_field="combined_input",
    output_dir="/nfs/turbo/isr-fconrad1/model/mistral_decoder",
)

class CustomSFTTrainer(SFTTrainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        input_embeddings = inputs.get("combined_input")
        outputs = model(input_embeddings=input_embeddings, attention_mask=inputs.get("attention_mask"), labels=labels)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

trainer = CustomSFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    peft_config=peft_config,
    sft_config=sft_config
)
# Start training
trainer.train()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

TypeError: SFTConfig.__init__() missing 1 required positional argument: 'output_dir'

In [41]:
tokenized_dataset

Dataset({
    features: ['combined_input', 'target', '__index_level_0__', 'labels'],
    num_rows: 4949
})

In [42]:
from transformers import Trainer
# Step 4: Define a Data Collator
class DataCollatorForCustomTraining:
    def __call__(self, features):
        print(features)
        input_embeddings = torch.stack([f['combined_input'] for f in features])
        labels = torch.stack([f['labels'].squeeze(0) for f in features])
        attention_mask = torch.ones_like(input_embeddings[..., 0])  # Create attention mask
        return {'input_embeddings': input_embeddings, 'labels': labels, 'attention_mask': attention_mask}

data_collator = DataCollatorForCustomTraining()

# Step 5: Define the Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        input_embeddings = inputs.get("input_embeddings")
        attention_mask = inputs.get("attention_mask")
        outputs = model(input_embeddings=input_embeddings, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)



Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [1]:
import os
import torch
from torch import nn
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM, Trainer
from datasets import Dataset
import pandas as pd
import wandb

# Initialize Weights and Biases for logging

wandb.init(project="sft_embedding_decoding")

# Load the pre-trained model and tokenizer
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_model = AutoModelForCausalLM.from_pretrained(model_name)

# Sample DataFrame (replace this with your actual DataFrame)
df = pd.read_csv("../data/mistral_sft_training.csv")
df = df.loc[~df['label'].isna(),:]
df['combined'] = df['combined'].apply(eval)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmaoli[0m. Use [1m`wandb login --relogin`[0m to force relogin


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Step 1: Preprocess the DataFrame
def preprocess_function(row):
    instruction = '<s>[INST] It is possible to generate sentence based on starting sentence embedding, so please based on the following sentence embedding reconstruct the meaning of it: [/INST]'
    instruction_tokens = tokenizer(instruction, return_tensors="pt", truncation=True, padding="max_length", max_length=128)
    with torch.no_grad():
        instruction_embeddings = base_model.get_input_embeddings()(instruction_tokens['input_ids']).squeeze(0)
    combined_embedding = torch.tensor(row['combined']).unsqueeze(0)
    combined_input = torch.cat((instruction_embeddings, combined_embedding), dim=0)
    return {'combined_input': combined_input.squeeze(0).tolist(), 'label': row['label']}
tokenizer.pad_token = tokenizer.eos_token
processed_data = df.apply(preprocess_function, axis=1)




In [None]:
# Convert the DataFrame to a Hugging Face Dataset
processed_df = pd.DataFrame(processed_data)
dataset = Dataset.from_pandas(processed_df)
tokenizer.pad_token = tokenizer.eos_token
# Tokenize the target labels
def tokenize_targets(examples):
    labels = tokenizer(examples['label'], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
    examples["labels"] = labels["input_ids"]
    return examples

tokenized_dataset = dataset.map(tokenize_targets, batched=True)

# Ensure the features include both combined_input and labels
def format_features(features):
    combined_inputs = torch.stack([torch.tensor(f['combined_input']) for f in features])
    labels = torch.stack([f['labels'].squeeze(0) for f in features])
    attention_mask = torch.ones(combined_inputs.shape[:-1])  # Create attention mask
    return {'input_embeddings': combined_inputs, 'labels': labels, 'attention_mask': attention_mask}

# Step 2: Define the training arguments
training_args = TrainingArguments(
    output_dir="/nfs/turbo/isr-fconrad1/model/mistral_decoder",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    learning_rate=3e-5,
    logging_steps=30,
    num_train_epochs=10,
    max_steps=5000,
    report_to="wandb",
)

# Step 3: Define the Custom Model
class CustomModelForCausalLM(AutoModelForCausalLM):
    def __init__(self, model_name):
        super().__init__()
        self.model = AutoModelForCausalLM.from_pretrained(model_name)
        self.embedding_layer = self.model.get_input_embeddings()
        # Freeze the embedding layer
        for param in self.embedding_layer.parameters():
            param.requires_grad = False

    def forward(self, input_embeddings, attention_mask=None, labels=None):
        outputs = self.model(inputs_embeds=input_embeddings, attention_mask=attention_mask, labels=labels)
        return outputs

custom_model = CustomModelForCausalLM.from_pretrained(model_name)

# Step 4: Define the Trainer
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        input_embeddings = inputs.get("input_embeddings")
        attention_mask = inputs.get("attention_mask")
        outputs = model(input_embeddings=input_embeddings, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

trainer = CustomTrainer(
    model=custom_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=format_features,
)

# Start training
trainer.train()

In [79]:
from tqdm.notebook import tqdm
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tgt_vocab_size = tokenizer.vocab_size + 2  # Size of the target vocabulary
d_model = 768  # Dimension of the model (matches the embedding size)
nhead = 8  # Number of heads in multiheadattention
num_decoder_layers = 24  # Number of decoder layers
dim_feedforward = 2048  # Dimension of the feedforward network

decoder = TransformerDecoderModel(tgt_vocab_size, d_model, nhead, num_decoder_layers, dim_feedforward).to(device)

# Use ignore_index to ignore the padding token in the loss calculation
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.eos_token_id)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=0.001)
# Create a padding mask function
def create_padding_mask(tgt, pad_token=12801):
    pad_mask = (tgt != pad_token).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, tgt_len)
    return pad_mask.float().masked_fill(pad_mask == 0, float('-inf')).masked_fill(pad_mask == 1, float(0.0))

def create_combined_mask(tgt, pad_token=12801):
    seq_len = tgt.size(1)
    subsequent_mask = decoder.generate_square_subsequent_mask(seq_len).to(tgt.device)
    padding_mask = create_padding_mask(tgt, pad_token).squeeze(0)  # (batch_size, 1, tgt_len)
    combined_mask = subsequent_mask.unsqueeze(0) + padding_mask  # (batch_size, tgt_len, tgt_len)
    return combined_mask

# Define the training loop with batches and padding
def train_step(batch_memory, batch_targets, decoder, decoder_optimizer, criterion):
    decoder_optimizer.zero_grad()
    
    batch_size = batch_memory.size(0)
    max_target_length = batch_targets.size(1)
    tgt_input = torch.zeros(batch_size, 1, dtype=torch.long, device=device)+128000
    
    loss = 0

    for t in range(1, max_target_length):
        print(t)
        tgt_mask = create_combined_mask(tgt_input).to(device)
        output = decoder(tgt_input, batch_memory, tgt_mask=None)

        output = output.reshape(32, -1)  # Get the last token in the sequence

        loss += criterion(output, batch_targets[:, t])

        topv, topi = output.topk(1)

        tgt_input = torch.cat((tgt_input, topi), dim=1)
        print(tgt_input.shape, batch_memory.shape)

    loss.backward()
    decoder_optimizer.step()
    
    return loss.item() / max_target_length

# Training loop with batches and padding
n_epochs = 10
print_every = 1

for epoch in range(n_epochs):
    for i, (batch_memory, batch_targets) in tqdm(enumerate(dataloader)):
        batch_memory = batch_memory.reshape(32, 1, -1).to(device)
        
        batch_targets = batch_targets.to(device)
        
        loss = train_step(batch_memory, batch_targets, decoder, decoder_optimizer, criterion)

    if epoch % print_every == 0:
        print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss}')

0it [00:00, ?it/s]

1
torch.Size([32, 2]) torch.Size([32, 1, 768])
2
torch.Size([32, 3]) torch.Size([32, 1, 768])
3


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [81]:
batch_targets[:, 1]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [58]:
tgt_input = torch.zeros(batch_size, 1, dtype=torch.long, device=device)+128000

In [61]:
tokenizer.pad_token_id

128001

In [53]:
decoder.embedding(batch_targets[:, :2]).shape

torch.Size([32, 2, 768])

In [15]:
tgt_mask = decoder.generate_square_subsequent_mask(tgt_input.size(0)).to(device)

In [60]:
tokenizer.pad_token_id

In [30]:
pad_mask = (tgt != pad_token).to(torch.int).unsqueeze(1).unsqueeze(2)  # (batch_size, 1, 1, tgt_len)

torch.Size([32, 57])

In [None]:
# Inference function
def generate_sequence(decoder, memory, start_token, end_token, max_length, device):
    decoder.eval()
    with torch.no_grad():
        tgt_input = torch.tensor([[start_token]], device=device)  # Start-of-sequence token
        generated_sequence = [start_token]
        
        for _ in range(max_length):
            tgt_mask = decoder.generate_square_subsequent_mask(tgt_input.size(1)).to(device)
            pad_mask = create_padding_mask(batch_targets).to(device)
            output = decoder(tgt_input, batch_memory, tgt_mask=tgt_mask, pad_mask=pad_mask)

            output = decoder(tgt_input, memory, tgt_mask=tgt_mask)
            output = output[-1, :, :]  # Get the last token in the sequence
            topv, topi = output.topk(1)
            next_token = topi.item()
            
            if next_token == end_token:
                break
            
            generated_sequence.append(next_token)
            tgt_input = torch.cat((tgt_input, topi.view(1, 1)), dim=0)
        
    return generated_sequence

# Parameters for inference
start_token = tokenizer.bos_token_id  # Start-of-sequence token (modify as per your dataset)
end_token = tokenizer.eos_token_id     # End-of-sequence token (modify as per your dataset)
max_length = 50  # Maximum length of the generated sequence

# Example memory (encoded representation)
index = torch.randint(0, len(embeddings), (1,))
memory = embeddings[index].unsqueeze(0).to(device)

# Generate a sequence
generated_sequence = generate_sequence(decoder, memory, start_token, end_token, max_length, device)
print("Generated sequence:", tokenizer.decode(generated_sequence))

In [76]:
decoder_layer = nn.TransformerDecoderLayer(d_model=768, nhead=8, batch_first=True)
transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=6)
memory = torch.rand(32, 1, 768)
tgt = torch.rand(32, 32, 768)
out = transformer_decoder(tgt, memory)

In [24]:
memory.shape

torch.Size([120, 32, 512])

In [23]:
decoder.pos_encoder(tgt.size(0)).to('cuda') + tgt

RuntimeError: The size of tensor a (39) must match the size of tensor b (768) at non-singleton dimension 1

In [26]:
decoder.pos_encoder(tgt.size(0)).to('cuda').shape

torch.Size([39, 39])

In [6]:
!export CUDA_LAUNCH_BLOCKING=1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
torch.distributed.

In [58]:
print("PyTorch Version: ", torch.__version__)

PyTorch Version:  2.3.0+cu118


In [56]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define a simple model
class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.linear = nn.Linear(10, 10)
        
    def forward(self, x):
        return self.linear(x)

# Instantiate the model
model = SimpleModel()

# Try to initialize the optimizer
try:
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    print("Optimizer initialized successfully.")
except Exception as e:
    print(f"Error initializing optimizer: {e}")

Error initializing optimizer: partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)


In [77]:
!pip3 install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


IOStream.flush timed out
IOStream.flush timed out
IOStream.flush timed out
Defaulting to user installation because normal site-packages is not writeable
[0mLooking in indexes: https://download.pytorch.org/whl/cu118
[0m

In [75]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

# Assuming embeddings and tokenized_sentences are already provided


# Tokenize sentences
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")
tokenized_sentences = [
    tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences
]
tokenizer.pad_token_id = tokenizer.eos_token_id


class EmbeddingSentenceDataset(Dataset):
    def __init__(self, embeddings, tokenized_sentences):
        self.embeddings = embeddings
        self.tokenized_sentences = tokenized_sentences

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        sentence = self.tokenized_sentences[idx]
        return embedding, torch.tensor(sentence)


dataset = EmbeddingSentenceDataset(embeddings, tokenized_sentences)
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=lambda x: (
        torch.stack([i[0] for i in x]),
        torch.nn.utils.rnn.pad_sequence(
            [i[1] for i in x], batch_first=True, padding_value=tokenizer.pad_token_id
        ),
    ),
)


class TransformerDecoderModel(nn.Module):
    def __init__(
        self, embedding_dim, vocab_size, max_seq_length, num_layers=6, nhead=8
    ):
        super(TransformerDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_seq_length, embedding_dim)
        )
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedding_dim, nhead=nhead
        )
        self.transformer_decoder = nn.TransformerDecoder(
            self.decoder_layer, num_layers=num_layers
        )
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, embedding, target_seq):
        target_embedding = (
            self.embedding(target_seq)
            + self.positional_encoding[:, : target_seq.size(1), :]
        )
        target_embedding = target_embedding.permute(1, 0, 2)
        embedding = embedding.unsqueeze(0).repeat(target_embedding.size(0), 1, 1)
        output = self.transformer_decoder(target_embedding, embedding)
        output = output.permute(1, 0, 2)
        return self.fc_out(output)


vocab_size = tokenizer.vocab_size
embedding_dim = embeddings.shape[1]
max_seq_length = max(len(sentence) for sentence in tokenized_sentences)

model = TransformerDecoderModel(embedding_dim, vocab_size, max_seq_length)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_embeddings, batch_sentences in dataloader:
        print(batch_embeddings, batch_sentences)
        optimizer.zero_grad()

        input_sentences = batch_sentences[:, :-1]
        target_sentences = batch_sentences[:, 1:]

        outputs = model(batch_embeddings, input_sentences)

        outputs = outputs.view(-1, vocab_size)
        target_sentences = target_sentences.view(-1)

        loss = criterion(outputs, target_sentences)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")

tokenizer_config.json:   0%|          | 0.00/137k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

AttributeError: partially initialized module 'torch._dynamo' has no attribute 'external_utils' (most likely due to a circular import)

In [113]:
dataset[0]

(tensor([1.8667e-01, 3.5469e-01, 7.4079e-01, 3.8495e-01, 4.5757e-01, 1.8808e-01,
         7.9332e-02, 2.6692e-01, 2.6038e-01, 9.9068e-01, 2.2083e-01, 7.8854e-01,
         4.0219e-02, 6.8807e-01, 8.9534e-01, 2.3610e-01, 5.5654e-01, 1.9255e-01,
         3.5528e-01, 5.2827e-01, 2.1058e-01, 3.9195e-02, 3.0539e-01, 5.7300e-01,
         3.2841e-01, 5.7424e-01, 8.2525e-01, 4.7575e-01, 5.4107e-02, 4.7962e-01,
         9.2722e-01, 9.1097e-01, 5.1250e-01, 2.7457e-02, 9.4006e-01, 7.4600e-01,
         3.0880e-01, 1.4981e-01, 5.6556e-01, 7.6072e-01, 7.0257e-01, 5.0587e-01,
         2.5881e-01, 8.3955e-01, 7.4624e-01, 1.8518e-01, 3.0902e-01, 2.3619e-01,
         6.8042e-01, 7.1251e-01, 4.6427e-01, 4.3929e-01, 3.8868e-01, 4.9183e-01,
         1.9045e-03, 6.0023e-01, 6.1350e-01, 6.0221e-01, 2.0156e-01, 2.9949e-01,
         4.9452e-01, 9.4434e-01, 3.0134e-02, 8.4221e-01, 5.3513e-01, 6.2812e-01,
         7.8632e-01, 4.7405e-01, 1.8452e-01, 3.7968e-01, 8.3256e-01, 8.3645e-01,
         6.2918e-01, 9.5676e

In [53]:
import torch.nn.functional as F

tokenizer.cls_token_id = tokenizer.bos_token_id


def generate_sentence(model, embedding, tokenizer, max_length=50):
    # Initialize the input sequence with the start token
    input_ids = torch.tensor([[tokenizer.cls_token_id]])  # CLS token as the start token
    embedding = embedding.unsqueeze(0)  # Ensure embedding is in the correct shape

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # Get the logits from the model
            outputs = model(embedding, input_ids)
            logits = outputs[:, -1, :]  # Get the logits for the last token only

            # Apply softmax to get probabilities and sample the next token
            probs = F.softmax(logits, dim=-1)
            next_token = torch.argmax(probs, dim=-1).unsqueeze(0)

            # Append the predicted token to the input sequence
            input_ids = torch.cat([input_ids, next_token], dim=1)

            # Break if the model predicts the end-of-sequence token
            if next_token.item() == tokenizer.sep_token_id:
                break

    return input_ids.squeeze().tolist()

In [54]:
# Dummy example embedding (use your actual embedding)
embedding = torch.rand((768,))

# Generate a sentence from the embedding
generated_ids = generate_sentence(model, embedding, tokenizer)
generated_sentence = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(generated_sentence)

RuntimeError: The size of tensor a (3) must match the size of tensor b (2) at non-singleton dimension 1

In [111]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer


# Define the model architecture
class TransformerDecoderModel(nn.Module):
    def __init__(
        self, embedding_dim, vocab_size, max_seq_length, num_layers=6, nhead=8
    ):
        super(TransformerDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_seq_length, embedding_dim)
        )
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedding_dim, nhead=nhead
        )
        self.transformer_decoder = nn.TransformerDecoder(
            self.decoder_layer, num_layers=num_layers
        )
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, embedding, target_seq):
        target_embedding = (
            self.embedding(target_seq)
            + self.positional_encoding[:, : target_seq.size(1), :]
        )
        target_embedding = target_embedding.permute(
            1, 0, 2
        )  # (target_seq_len, batch_size, embedding_dim)
        embedding = embedding.unsqueeze(0).repeat(target_embedding.size(0), 1, 1)
        output = self.transformer_decoder(target_embedding, embedding)
        output = output.permute(1, 0, 2)  # (batch_size, target_seq_len, embedding_dim)
        return self.fc_out(output)


# Define the dataset
class EmbeddingSentenceDataset(Dataset):
    def __init__(self, embeddings, tokenized_sentences):
        self.embeddings = embeddings
        self.tokenized_sentences = tokenized_sentences

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        sentence = self.tokenized_sentences[idx]
        return embedding, torch.tensor(sentence)


# Prepare the data (dummy example data)
embeddings = torch.rand((100, 768))  # Example embeddings
sentences = ["This is a sample sentence.", "Here is another one."]
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")


# Check if the tokenizer has a padding token, if not, set one

tokenizer.pad_token_id = tokenizer.eos_token_id
# Check if the tokenizer has bos and eos tokens, if not, set them
tokenizer.cls_token_id = tokenizer.bos_token_id
tokenized_sentences = [
    tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences
]
dataset = EmbeddingSentenceDataset(embeddings, tokenized_sentences)
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=lambda x: (
        torch.stack([i[0] for i in x]),
        torch.nn.utils.rnn.pad_sequence(
            [i[1] for i in x], batch_first=True, padding_value=tokenizer.pad_token_id
        ),
    ),
)

# Initialize the model
vocab_size = tokenizer.vocab_size
embedding_dim = embeddings.shape[1]
max_seq_length = max(len(sentence) for sentence in tokenized_sentences)
model = TransformerDecoderModel(embedding_dim, vocab_size, max_seq_length)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_embeddings, batch_sentences in dataloader:
        optimizer.zero_grad()

        input_sentences = batch_sentences[:, :-1]
        target_sentences = batch_sentences[:, 1:]

        outputs = model(batch_embeddings, input_sentences)

        outputs = outputs.reshape(-1, vocab_size)
        target_sentences = target_sentences.reshape(-1)

        loss = criterion(outputs, target_sentences)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")


# Define the generation function
def generate_sentence(model, embedding, tokenizer, max_length=50):
    input_ids = torch.tensor([[tokenizer.bos_token_id]])  # BOS token as the start token
    embedding = embedding.unsqueeze(
        0
    )  # Ensure embedding is in the correct shape (1, embedding_dim)

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # Ensure input_ids is in the correct shape (batch_size, sequence_length)
            seq_len = input_ids.size(1)
            positional_encoding = model.positional_encoding[:, :seq_len, :]
            input_embedding = model.embedding(input_ids) + positional_encoding
            input_embedding = input_embedding.permute(1, 0, 2)
            embedding = embedding.unsqueeze(0).repeat(input_embedding.size(0), 1, 1)
            outputs = model.transformer_decoder(input_embedding, embedding)
            outputs = outputs.permute(1, 0, 2)
            logits = outputs[:, -1, :]  # Get the logits for the last token only
            probs = F.softmax(logits, dim=-1)
            next_token = torch.argmax(probs, dim=-1).unsqueeze(0)

            # Ensure next_token is in the right shape for concatenation
            input_ids = torch.cat([input_ids, next_token], dim=1)
            if next_token.item() == tokenizer.eos_token_id:
                break

    return input_ids.squeeze().tolist()


# Example usage
embedding = torch.rand((768,))  # Dummy example embedding (use your actual embedding)
generated_ids = generate_sentence(model, embedding, tokenizer)
generated_sentence = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(generated_sentence)

Epoch [1/10], Loss: 10.2921
Epoch [2/10], Loss: 5.4214
Epoch [3/10], Loss: 4.1661
Epoch [4/10], Loss: 3.3287
Epoch [5/10], Loss: 2.4861
Epoch [6/10], Loss: 2.0809
Epoch [7/10], Loss: 1.8369
Epoch [8/10], Loss: 1.8456
Epoch [9/10], Loss: 1.7750
Epoch [10/10], Loss: 1.7494


RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor

In [68]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer


# Define the model architecture
class TransformerDecoderModel(nn.Module):
    def __init__(
        self, embedding_dim, vocab_size, max_seq_length, num_layers=6, nhead=8
    ):
        super(TransformerDecoderModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.positional_encoding = nn.Parameter(
            torch.zeros(1, max_seq_length, embedding_dim)
        )
        self.decoder_layer = nn.TransformerDecoderLayer(
            d_model=embedding_dim, nhead=nhead
        )
        self.transformer_decoder = nn.TransformerDecoder(
            self.decoder_layer, num_layers=num_layers
        )
        self.fc_out = nn.Linear(embedding_dim, vocab_size)

    def forward(self, embedding, target_seq):
        seq_len = target_seq.size(1)
        positional_encoding = self.positional_encoding[:, :seq_len, :]
        target_embedding = self.embedding(target_seq) + positional_encoding
        target_embedding = target_embedding.permute(
            1, 0, 2
        )  # (target_seq_len, batch_size, embedding_dim)
        embedding = embedding.unsqueeze(0).repeat(target_embedding.size(0), 1, 1)
        output = self.transformer_decoder(target_embedding, embedding)
        output = output.permute(1, 0, 2)  # (batch_size, target_seq_len, embedding_dim)
        return self.fc_out(output)


# Define the dataset
class EmbeddingSentenceDataset(Dataset):
    def __init__(self, embeddings, tokenized_sentences):
        self.embeddings = embeddings
        self.tokenized_sentences = tokenized_sentences

    def __len__(self):
        return len(self.tokenized_sentences)

    def __getitem__(self, idx):
        embedding = self.embeddings[idx]
        sentence = self.tokenized_sentences[idx]
        return embedding, torch.tensor(sentence)


# Prepare the data (dummy example data)
embeddings = torch.rand((100, 768))  # Example embeddings
sentences = ["This is a sample sentence.", "Here is another one."]
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3")


tokenizer.pad_token_id = tokenizer.eos_token_id
# Check if the tokenizer has bos and eos tokens, if not, set them
tokenizer.cls_token_id = tokenizer.bos_token_id

tokenized_sentences = [
    tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences
]
dataset = EmbeddingSentenceDataset(embeddings, tokenized_sentences)
dataloader = DataLoader(
    dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=lambda x: (
        torch.stack([i[0] for i in x]),
        torch.nn.utils.rnn.pad_sequence(
            [i[1] for i in x], batch_first=True, padding_value=tokenizer.pad_token_id
        ),
    ),
)

# Initialize the model
vocab_size = tokenizer.vocab_size
embedding_dim = embeddings.shape[1]
max_seq_length = max(len(sentence) for sentence in tokenized_sentences)
model = TransformerDecoderModel(embedding_dim, vocab_size, max_seq_length)

# Define loss and optimizer
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_embeddings, batch_sentences in dataloader:
        optimizer.zero_grad()

        input_sentences = batch_sentences[:, :-1]
        target_sentences = batch_sentences[:, 1:]

        outputs = model(batch_embeddings, input_sentences)

        outputs = outputs.reshape(-1, vocab_size)
        target_sentences = target_sentences.reshape(-1)

        loss = criterion(outputs, target_sentences)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss/len(dataloader):.4f}")


# Define the generation function
def generate_sentence(model, embedding, tokenizer, max_length=50):
    input_ids = torch.tensor([[tokenizer.bos_token_id]])  # BOS token as the start token
    embedding = embedding.unsqueeze(
        0
    )  # Ensure embedding is in the correct shape (1, embedding_dim)

    model.eval()
    with torch.no_grad():
        for _ in range(max_length):
            # Ensure input_ids is in the correct shape (batch_size, sequence_length)
            seq_len = input_ids.size(1)
            positional_encoding = model.positional_encoding[:, :seq_len, :]
            input_embedding = model.embedding(input_ids) + positional_encoding
            input_embedding = input_embedding.permute(1, 0, 2)
            embedding_repeated = embedding.repeat(seq_len, 1, 1)
            outputs = model.transformer_decoder(input_embedding, embedding_repeated)
            outputs = outputs.permute(1, 0, 2)
            logits = outputs[:, -1, :]  # Get the logits for the last token only
            probs = F.softmax(logits, dim=-1)
            next_token = torch.argmax(probs, dim=-1).unsqueeze(0)

            # Ensure next_token is in the right shape for concatenation
            input_ids = torch.cat([input_ids, next_token], dim=1)
            if next_token.item() == tokenizer.eos_token_id:
                break

    return input_ids.squeeze().tolist()


# Example usage
embedding = torch.rand((768,))  # Dummy example embedding (use your actual embedding)
generated_ids = generate_sentence(model, embedding, tokenizer)
generated_sentence = tokenizer.decode(generated_ids, skip_special_tokens=True)
print(generated_sentence)

Epoch [1/10], Loss: 10.6627
Epoch [2/10], Loss: 5.5577
Epoch [3/10], Loss: 4.3501
Epoch [4/10], Loss: 3.4984
Epoch [5/10], Loss: 2.6430
Epoch [6/10], Loss: 2.0773
Epoch [7/10], Loss: 1.7970
Epoch [8/10], Loss: 1.8159
Epoch [9/10], Loss: 1.8528
Epoch [10/10], Loss: 1.7129


RuntimeError: The size of tensor a (8) must match the size of tensor b (7) at non-singleton dimension 1

In [112]:
from tqdm.notebook import tqdm

max_length = 50
input_ids = torch.tensor([[tokenizer.bos_token_id]])  # BOS token as the start token
embedding = torch.rand((768,))  # Dummy example embedding (use your actual embedding)
embedding = embedding.unsqueeze(
    0
)  # Ensure embedding is in the correct shape (1, embedding_dim)
model.eval()
with torch.no_grad():
    for _ in tqdm(range(max_length)):
        # Ensure input_ids is in the correct shape (batch_size, sequence_length)
        seq_len = input_ids.size(1)
        positional_encoding = model.positional_encoding[:, :seq_len, :]
        input_embedding = model.embedding(input_ids) + positional_encoding
        input_embedding = input_embedding.permute(1, 0, 2)
        embedding_repeated = embedding.repeat(seq_len, 1, 1)
        outputs = model.transformer_decoder(input_embedding, embedding_repeated)
        outputs = outputs.permute(1, 0, 2)
        logits = outputs[:, -1, :]  # Get the logits for the last token only
        probs = F.softmax(logits, dim=-1)
        next_token = torch.argmax(probs, dim=-1).unsqueeze(0)
        # Ensure next_token is in the right shape for concatenation
        input_ids = torch.cat([input_ids, next_token], dim=1)
        if next_token.item() == tokenizer.eos_token_id:
            break

  0%|          | 0/50 [00:00<?, ?it/s]

RuntimeError: The size of tensor a (8) must match the size of tensor b (7) at non-singleton dimension 1

In [84]:
positional_encoding.shape

torch.Size([1, 7, 768])

In [73]:
input_ids

tensor([[  1, 681, 681, 681, 681, 681, 681, 681]])

In [75]:
tokenizer.decode(input_ids.squeeze().tolist())

'<s>[control_679][control_679][control_679][control_679][control_679][control_679][control_679]'