In [None]:
! pip install datasets sentencepiece transformers torch tqdm pandas

In [None]:
! pip install datasets

In [None]:
import math
import os
import time
from typing import List, Optional
import pandas as pd
import sentencepiece as spm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from transformers import LlamaTokenizerFast
from torch.utils.data import DataLoader, Dataset, random_split
from tqdm import tqdm
import torch

In [None]:
from transformers import LlamaTokenizerFast
from datasets import load_dataset

# Function to get training data
def get_training_corpus(data_file="/content/cleaned_data.txt", batch_size=1000):
    """Yield the tokenizer-training text in batches.

    Args:
        data_file: Path handed to the ``text`` dataset loader as the ``train``
            split. Defaults to the path this notebook originally hard-coded.
        batch_size: Maximum number of rows per yielded batch (default 1000,
            the original chunk size). Must be positive.

    Yields:
        The ``"text"`` column of consecutive ``batch_size``-row slices of the
        ``train`` split, in order.

    Raises:
        ValueError: On first iteration, if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    # Loading happens lazily, on first iteration, because this is a generator.
    train_split = load_dataset("text", data_files={"train": data_file})["train"]
    for start in range(0, len(train_split), batch_size):
        yield train_split[start : start + batch_size]["text"]

# Load the reference LLaMA fast tokenizer that serves as the template
base_tokenizer = LlamaTokenizerFast.from_pretrained("hf-internal-testing/llama-tokenizer")

# Retrain it on the local corpus with a 1000-entry vocabulary and persist the
# result to the "new_tokenizer" directory
new_tokenizer = base_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=1000,
)
new_tokenizer.save_pretrained("new_tokenizer")

# Sanity check: encode a sample sentence and decode it back
test_text = "I was there that night "
encoded = new_tokenizer.encode(test_text)
decoded = new_tokenizer.decode(encoded)
print(f"Encoded: {encoded}")
print(f"Decoded: {decoded}")

In [None]:
class ModelArgs:
    """Plain container for model hyper-parameters.

    Every constructor argument is stored unchanged on the instance under the
    same name. Within this notebook ``dim``, ``n_layers``, ``n_heads``,
    ``n_kv_heads``, ``vocab_size``, ``ffn_dim_multiplier``, ``norm_eps`` and
    ``pad_token_id`` are read when the model is built, and ``device`` is read
    by the training loop; the remaining fields are only stored.

    Note that the ``device`` default is evaluated once, when the class is
    defined, not on every instantiation.
    """

    def __init__(
        self,
        dim=4096,
        n_layers=32,
        n_heads=32,
        n_kv_heads=None,
        vocab_size=-1,
        multiple_of=256,
        ffn_dim_multiplier=None,
        norm_eps=1e-5,
        mode='train',
        batch_size=32,
        max_seq_length=32,
        device='cuda' if torch.cuda.is_available() else 'cpu',
        pad_token_id=None,
    ):
        # Architecture
        self.dim = dim
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.vocab_size = vocab_size
        self.multiple_of = multiple_of
        self.ffn_dim_multiplier = ffn_dim_multiplier
        self.norm_eps = norm_eps

        # Runtime / data settings
        self.mode = mode
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.device = device
        self.pad_token_id = pad_token_id

class TrainArgs(ModelArgs):
    """Training hyper-parameters layered on top of ``ModelArgs``.

    Args:
        n_epochs: Number of passes over the training loader.
        log_interval: The training loss is printed every ``log_interval`` steps.
        eval_iters: Stored as ``self.eval_iters``. It was previously accepted
            but silently dropped; the training loop in this notebook does not
            read it.
        lr: Learning rate handed to the optimizer.
        warmup_steps: Length of the linear learning-rate warm-up, in steps.
        **kwargs: Forwarded unchanged to ``ModelArgs.__init__``.
    """

    def __init__(self, n_epochs=10,
                 log_interval=12,
                 eval_iters=200,
                 lr=3e-4,
                 warmup_steps=4000,
                 **kwargs):
        super().__init__(**kwargs)
        self.n_epochs = n_epochs
        self.log_interval = log_interval
        self.eval_iters = eval_iters
        self.lr = lr
        self.warmup_steps = warmup_steps

class RMSNorm(nn.Module):
    """Root-mean-square normalisation with a learnable per-feature gain.

    Computes ``scale * x / sqrt(mean(x**2, dim=-1) + eps)``.

    Args:
        dim: Size of the last (feature) dimension of the input.
        eps: Constant added to the mean square before the square root so an
            all-zero feature vector does not divide by zero.
    """

    def __init__(self, dim, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x):
        """Normalise ``x`` over its last dimension; the shape is preserved."""
        # Use the *mean* of squares. Dividing by the plain L2 norm (sum of
        # squares) would shrink the output by an extra factor of sqrt(dim).
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        return self.scale * (x * torch.rsqrt(mean_square + self.eps))

class SwiGLU(nn.Module):
    """Gated linear unit whose gate uses the SiLU activation.

    Two independent linear maps of the same input are combined as
    ``silu(linear1(x)) * linear2(x)``.

    Args:
        dim_in: Size of the input's last dimension.
        dim_out: Size of the output's last dimension.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()
        # linear1 feeds the activation (gate); linear2 is the value path.
        self.linear1 = nn.Linear(dim_in, dim_out)
        self.linear2 = nn.Linear(dim_in, dim_out)

    def forward(self, x):
        """Return the element-wise product of the activated gate and the value."""
        gate = F.silu(self.linear1(x))
        value = self.linear2(x)
        return gate * value

class RotaryEmbedding(nn.Module):
    """Produces a table of per-position rotation angles.

    ``inv_freq[i] = 1 / 10000 ** (2i / dim)`` is registered as a buffer. The
    forward pass multiplies each position index by every inverse frequency and
    repeats the result twice along the last dimension.

    Args:
        dim: Feature width; one inverse frequency is kept per even index
            below ``dim``.
    """

    def __init__(self, dim):
        super().__init__()
        exponents = torch.arange(0, dim, 2).float() / dim
        self.register_buffer('inv_freq', 1.0 / (10000 ** exponents))

    def forward(self, seq_len, device):
        """Return the angle table for positions ``0 .. seq_len - 1``.

        The result has one row per position; its columns are the
        position-times-frequency products followed by a copy of themselves.
        """
        positions = torch.arange(seq_len, device=device).type_as(self.inv_freq)
        # Outer product written as a broadcasted multiply.
        angles = positions[:, None] * self.inv_freq[None, :]
        return torch.cat((angles, angles), dim=-1)

class GroupedQueryAttention(nn.Module):
    """Causal self-attention with optionally shared (grouped) key/value heads.

    Args:
        dim: Model width. Must be divisible by ``n_heads``.
        n_heads: Number of query heads.
        n_kv_heads: Number of key/value heads. ``None`` means one per query
            head. Must divide ``n_heads``; each key/value head is shared by
            ``n_heads // n_kv_heads`` consecutive query heads.

    Raises:
        ValueError: If the head counts do not divide evenly.

    When ``n_kv_heads == n_heads`` the projection shapes are the same as in
    the earlier version of this class.
    """

    def __init__(self, dim, n_heads, n_kv_heads):
        super().__init__()
        if n_kv_heads is None:
            n_kv_heads = n_heads
        if dim % n_heads != 0:
            raise ValueError(f"dim ({dim}) must be divisible by n_heads ({n_heads})")
        if n_heads % n_kv_heads != 0:
            raise ValueError(
                f"n_heads ({n_heads}) must be divisible by n_kv_heads ({n_kv_heads})"
            )
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = dim // n_heads
        self.query = nn.Linear(dim, dim)
        # Keys/values use the same per-head width as queries, just fewer heads.
        self.key = nn.Linear(dim, n_kv_heads * self.head_dim)
        self.value = nn.Linear(dim, n_kv_heads * self.head_dim)
        self.out = nn.Linear(dim, dim)

    @staticmethod
    def _rotate_half(t):
        """Map the two halves ``(a, b)`` of the last dimension to ``(-b, a)``."""
        half = t.shape[-1] // 2
        return torch.cat((-t[..., half:], t[..., :half]), dim=-1)

    @classmethod
    def _apply_rotary(cls, t, angles):
        """Rotate ``t`` (batch, heads, seq, head_dim) by ``angles`` (seq, head_dim)."""
        cos = angles.cos().to(t.dtype)
        sin = angles.sin().to(t.dtype)
        return t * cos + cls._rotate_half(t) * sin

    def forward(self, x, rotary_angles=None):
        """Attend over the sequence dimension of ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, dim)``.
            rotary_angles: Optional ``(seq_len, head_dim)`` angle table in the
                "two repeated halves" layout; when given, queries and keys are
                rotated by it before the dot product. ``None`` skips the rotation.

        Returns:
            Tensor of shape ``(batch, seq_len, dim)``.

        Raises:
            ValueError: If ``rotary_angles`` has the wrong shape.
        """
        batch_size, seq_length, dim = x.size()

        # Heads go in front of the sequence axis so that the matmuls below
        # compare *positions* within each head.
        q = self.query(x).view(batch_size, seq_length, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.key(x).view(batch_size, seq_length, self.n_kv_heads, self.head_dim).transpose(1, 2)
        v = self.value(x).view(batch_size, seq_length, self.n_kv_heads, self.head_dim).transpose(1, 2)

        if rotary_angles is not None:
            if tuple(rotary_angles.shape) != (seq_length, self.head_dim):
                raise ValueError(
                    f"rotary_angles must have shape ({seq_length}, {self.head_dim}), "
                    f"got {tuple(rotary_angles.shape)}"
                )
            q = self._apply_rotary(q, rotary_angles)
            k = self._apply_rotary(k, rotary_angles)

        # Expand shared key/value heads so every query head has a partner.
        n_rep = self.n_heads // self.n_kv_heads
        if n_rep > 1:
            k = k.repeat_interleave(n_rep, dim=1)
            v = v.repeat_interleave(n_rep, dim=1)

        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Causal mask: a position may not look at later positions. The diagonal
        # is always kept, so no row is fully masked.
        future = torch.triu(
            torch.ones(seq_length, seq_length, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        scores = scores.masked_fill(future, float('-inf'))

        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, v)  # (batch, heads, seq, head_dim)
        context = context.transpose(1, 2).contiguous().view(batch_size, seq_length, dim)
        return self.out(context)

class TransformerBlock(nn.Module):
    """Pre-norm residual block: attention followed by a gated feed-forward.

    Reads ``dim``, ``n_heads``, ``n_kv_heads``, ``norm_eps`` and
    ``ffn_dim_multiplier`` from ``args``.

    NOTE: ``rotary_emb`` is now sized to the per-head width, so its
    ``inv_freq`` buffer has a different shape than in checkpoints written by
    the earlier version of this block.
    """

    def __init__(self, args):
        super().__init__()
        self.attn = GroupedQueryAttention(args.dim, args.n_heads, args.n_kv_heads)
        self.norm1 = RMSNorm(args.dim, args.norm_eps)
        self.norm2 = RMSNorm(args.dim, args.norm_eps)

        # ``ffn_dim_multiplier`` defaults to None in ModelArgs; fall back to a
        # 4x expansion instead of failing on ``None * dim``. ``int`` keeps
        # fractional multipliers usable as a layer width.
        multiplier = 4 if args.ffn_dim_multiplier is None else args.ffn_dim_multiplier
        hidden_dim = int(multiplier * args.dim)
        self.mlp = nn.Sequential(
            nn.Linear(args.dim, hidden_dim),
            SwiGLU(hidden_dim, args.dim)
        )
        self.rotary_emb = RotaryEmbedding(self.attn.head_dim)

    def forward(self, x):
        """Apply the block to ``x`` of shape ``(batch, seq_len, dim)``."""
        seq_len, device = x.shape[1], x.device
        # Position information enters by rotating queries and keys inside the
        # attention layer. The raw angles grow with the position index and
        # must not be added to the hidden state.
        angles = self.rotary_emb(seq_len, device)

        x = x + self.attn(self.norm1(x), angles)
        x = x + self.mlp(self.norm2(x))
        return x


# the Transformer class
class Transformer(nn.Module):
    """Token-level language model.

    Pipeline: token embedding, then ``args.n_layers`` ``TransformerBlock``
    layers, a final ``RMSNorm``, and a bias-free linear projection to
    ``args.vocab_size`` logits. ``args.pad_token_id`` is passed to the
    embedding as its ``padding_idx``.
    """

    def __init__(self, args):
        super().__init__()
        self.token_emb = nn.Embedding(
            args.vocab_size,
            args.dim,
            padding_idx=args.pad_token_id,
        )
        layers = [TransformerBlock(args) for _ in range(args.n_layers)]
        self.blocks = nn.ModuleList(layers)
        self.norm = RMSNorm(args.dim, args.norm_eps)
        self.head = nn.Linear(args.dim, args.vocab_size, bias=False)

    def forward(self, x):
        """Map token ids ``x`` to logits over the vocabulary."""
        hidden = self.token_emb(x)
        for layer in self.blocks:
            hidden = layer(hidden)
        return self.head(self.norm(hidden))


# Train_model function
def train_model(model, train_loader, eval_loader, train_args, tokenizer):
    """Train ``model``, evaluate it and checkpoint it once per epoch.

    Args:
        model: Module mapping a batch of token ids to logits whose last
            dimension is the vocabulary.
        train_loader: Iterable of ``(input_ids, labels)`` training batches.
        eval_loader: Iterable of ``(input_ids, labels)`` evaluation batches;
            may be empty.
        train_args: Object providing ``device``, ``lr``, ``warmup_steps``,
            ``n_epochs`` and ``log_interval``.
        tokenizer: Unused; kept so existing calls keep working. The loss is
            reshaped from the logits' own width rather than from the
            tokenizer's reported vocabulary size.

    Side effects:
        Prints progress and writes ``model.state_dict()`` to ``llama2.pt`` in
        the working directory after every epoch.
    """
    device = train_args.device
    model = model.to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=train_args.lr)

    # Linear warm-up. ``step + 1`` lets the very first update use a non-zero
    # learning rate, and the ``max`` guard tolerates ``warmup_steps == 0``.
    warmup_steps = max(1, train_args.warmup_steps)
    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps),
    )

    # Built once; targets equal to the default ignore_index (-100) are skipped.
    loss_fn = nn.CrossEntropyLoss()
    model_save_path = "llama2.pt"

    for epoch in range(train_args.n_epochs):
        model.train()
        for step, batch in enumerate(train_loader):
            input_ids, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(input_ids)
            loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
            loss.backward()
            optimizer.step()
            scheduler.step()
            if step % train_args.log_interval == 0:
                print(f"Epoch: {epoch}, Step: {step}, Loss: {loss.item()}")

        model.eval()
        eval_loss = 0.0
        eval_batches = 0
        with torch.no_grad():
            for batch in eval_loader:
                input_ids, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                outputs = model(input_ids)
                loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), labels.reshape(-1))
                eval_loss += loss.item()
                eval_batches += 1

        # An empty evaluation set reports NaN instead of dividing by zero.
        mean_eval_loss = eval_loss / eval_batches if eval_batches else float("nan")
        print(f"Epoch: {epoch}, Evaluation Loss: {mean_eval_loss}")

        # Save the trained model (overwritten after every epoch)
        torch.save(model.state_dict(), model_save_path)
        print(f"Model saved to {model_save_path}")

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import LlamaTokenizerFast

class TextDataset(Dataset):
    """One-line-per-example dataset for next-token prediction.

    Each non-blank line of ``file_path`` becomes one example. Lines are
    tokenised on access, truncated/padded to ``max_length``, and returned as
    ``(input_ids, labels)`` where ``labels[t]`` is the token at position
    ``t + 1``.

    Label positions that have no real target are set to ``IGNORE_INDEX`` so
    the loss skips them: the final position, and any position where the input
    or its target is padding.

    Args:
        file_path: UTF-8 text file to read.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``'input_ids'`` of shape ``(1, max_length)`` and, optionally,
            ``'attention_mask'``.
        max_length: Fixed sequence length of every example.
    """

    # Default ``ignore_index`` of ``torch.nn.CrossEntropyLoss``.
    IGNORE_INDEX = -100

    def __init__(self, file_path, tokenizer, max_length):
        self.tokenizer = tokenizer
        self.max_length = max_length
        with open(file_path, 'r', encoding='utf-8') as f:
            # Drop line terminators and skip blank lines, which would
            # otherwise become examples consisting only of padding.
            self.texts = [line.rstrip('\r\n') for line in f if line.strip()]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer(text, truncation=True,
                                  padding='max_length',
                                  max_length=self.max_length,
                                  return_tensors='pt') # Ensure PyTorch Tensor output

        # Remove only the batch axis; a bare squeeze() would also collapse a
        # length-1 sequence.
        input_ids = encoding['input_ids'].squeeze(0)

        # Work out which positions are padding: prefer the attention mask and
        # fall back to comparing against the tokenizer's pad id.
        if 'attention_mask' in encoding:
            is_pad = encoding['attention_mask'].squeeze(0) == 0
        else:
            pad_id = getattr(self.tokenizer, 'pad_token_id', None)
            is_pad = None if pad_id is None else input_ids == pad_id

        # Next-token targets for positions 0 .. L-2.
        targets = input_ids[1:].clone()
        if is_pad is not None:
            # Skip a pair when either side is padding (covers both right-
            # and left-padded sequences).
            targets[is_pad[:-1] | is_pad[1:]] = self.IGNORE_INDEX

        # The last position has nothing to predict, so it stays ignored.
        labels = torch.full_like(input_ids, self.IGNORE_INDEX)
        labels[:-1] = targets
        return input_ids, labels  # Return both input_ids and labels


# Add padding token if it doesn't exist
# (TextDataset pads every example to a fixed length, so the tokenizer needs one.)
if new_tokenizer.pad_token is None:
    new_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Ensure pad_token_id is valid, i.e. below the vocab_size that sizes the model's
# embedding table further down.
# NOTE(review): this presumably fires right after the branch above, if
# `vocab_size` does not count tokens added via add_special_tokens — confirm
# against the tokenizer documentation. In that case pad_token_id is re-pointed at
# the last in-vocabulary id, which presumably already belongs to a real token;
# verify that sharing that id with padding is acceptable.
if new_tokenizer.pad_token_id is None or new_tokenizer.pad_token_id >= new_tokenizer.vocab_size:
    new_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    new_tokenizer.pad_token_id = new_tokenizer.vocab_size - 1  # Use the last valid token ID as padding


# Create dataset and dataloaders
# Both splits are tokenised to 512 tokens per line, matching max_seq_length below.
train_dataset = TextDataset('/content/cleaned_data.txt', new_tokenizer, max_length=512)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

eval_dataset = TextDataset('/content/validation.txt', new_tokenizer, max_length=512)
eval_loader = DataLoader(eval_dataset, batch_size=32, shuffle=False)

# Initialize llama arguments
# NOTE(review): batch_size=2 here differs from the DataLoader batch size (32)
# above; no code in this notebook reads model_args.batch_size, so the loaders win.
model_args = ModelArgs(
    dim=1024,
    n_layers=4,
    n_heads=4,
    n_kv_heads=4,
    vocab_size=new_tokenizer.vocab_size,
    ffn_dim_multiplier=4,
    norm_eps=1e-5,
    batch_size=2,
    max_seq_length=512,
    device='cuda' if torch.cuda.is_available() else 'cpu',
    pad_token_id=new_tokenizer.pad_token_id  # Ensure this value is within the vocabulary size
)

# Initialize training arguments
# train_model reads n_epochs, log_interval, lr, warmup_steps and device from this
# object. The architecture fields it inherits from ModelArgs keep their defaults
# (e.g. dim=4096) and are not used to build the model; vocab_size is likewise
# not read by train_model.
train_args = TrainArgs(
    n_epochs=1,# Number of epochs to train the model
    log_interval=12,# How often to log the training progress
    lr=3e-4,        # Learning rate for the optimizer
    warmup_steps=4000,# Number of warmup steps for the learning rate scheduler
    device='cuda' if torch.cuda.is_available() else 'cpu',# Compute device
    vocab_size=new_tokenizer.vocab_size # Size of the model's vocabulary
)
# Initialize model (built from model_args, not train_args)
model = Transformer(model_args)

# Train the model; train_model moves it to train_args.device and writes a
# checkpoint to "llama2.pt" after each epoch.
train_model(model, train_loader, eval_loader, train_args, new_tokenizer)


In [None]:
# Total length (prompt included) of each generated sequence
max_length = 30
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of independent samples drawn from the same prompt
num_return_sequences = 5

tokens = new_tokenizer.encode("I'm a ")
tokens = torch.tensor(tokens, dtype=torch.long)

# Add a batch dimension and repeat the prompt once per requested sample
tokens = tokens.unsqueeze(0).repeat(num_return_sequences, 1)

# Keep the prompt and the model on the same device, and make sure the model is
# in inference mode regardless of what ran before this cell.
x = tokens.to(device)
model.to(device)
model.eval()

with torch.no_grad():
    while x.size(1) < max_length:
        # forward the model to get the logits
        outputs = model(x) # (B, T, vocab_size)
        # take the logits at the last position
        logits = outputs[0] if isinstance(outputs, tuple) else outputs
        logits = logits[:, -1, :] # (B, vocab_size)
        # get the probabilities
        probs = F.softmax(logits, dim=-1)
        # Top-k sampling with k = 50, clamped so that a vocabulary smaller than
        # 50 entries does not make torch.topk fail.
        top_k = min(50, probs.size(-1))
        topk_probs, topk_indices = torch.topk(probs, top_k, dim=-1) # both (B, top_k)
        # select a token from the top-k probabilities
        # note: multinomial does not demand the input to sum to 1
        ix = torch.multinomial(topk_probs, 1) # (B, 1)
        # gather the corresponding indices
        xcol = torch.gather(topk_indices, -1, ix) # (B, 1)
        # append to the sequence
        x = torch.cat((x, xcol), dim=1)

# print the generated text
for i in range(num_return_sequences):
    tokens = x[i, :max_length].tolist()
    decoded = new_tokenizer.decode(tokens)
    print(">", decoded)