In [1]:
config ={
    'epochs': 50,
    'batch_size':32,
    'lr':0.001,
    'd_model':512,
    'num_heads':8,
    'd_ff': 2048,
    'num_layers':6,
    'vocab_size': 50000,
    'checkpoint_path':"model.pth",
    'dropout':0.01
}

# Tokenization

In [125]:
from collections import Counter
import re

def tokenize(text,vocab):
    words = re.findall(r'\b\w+\b',text.lower())
    words.append("<EOS>")
    words.insert(0,"<SOS>")
    return [vocab.get(word,vocab["UNK"]) for word in words]

def detokenize(tokens,vocab):
    tokens= tokens.tolist()
    reverse_vocab = {v:k for k,v in vocab.items()}
    return " ".join([reverse_vocab.get(token,"UNK") for token in tokens])

def build_vocab(text, vocab_size=50000):
    words = re.findall(r'\b\w+\b',text.lower())
    word_counts =Counter(words)
    most_common = word_counts.most_common(vocab_size-2)

    vocab ={"<PAD>":0, "UNK":1,"<SOS>":2,"<EOS>":3}
    vocab.update({word: idx +4 for idx,(word,_) in enumerate(most_common)})

    return vocab



# Data processing

In [114]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
import math
import os

def preprocess_text(input_file, output_file, vocab_size:int=5000, seq_len:int=64,encoding:str='utf-8'):
    try:
        with open(input_file, 'r', encoding=encoding) as file:
            text= file.read().lower()
    except UnicodeDecodeError:
        with open(input_file, 'r', encoding='latin-1') as file:
            text =file.read().lower()
    except Exception as e:
        print(f'Error reading file: {e}')
    
    vocab = build_vocab(text,vocab_size)
    tokenize_text = tokenize(text,vocab)

    data = [tokenize_text[i:i+seq_len] for i in range(0,len(tokenize_text)-seq_len,seq_len)]

    os.makedirs(os.path.dirname(output_file), exist_ok=True)

    torch.save((data, vocab), output_file)
    print(f'Data saved to {output_file}')
    print(f'Vocabulary size: {len(vocab)}')
    print(f'Total token:{len(tokenize_text)}')
    print(f'Number of seq:{len(data)}')

# Downloading datset

In [4]:
import pandas as pd

splits = {'train': 'data/train-00000-of-00001-679af0bccbb2f644.parquet', 'validation': 'data/validation-00000-of-00001-089cf71e86c88e28.parquet', 'test': 'data/test-00000-of-00001-ae1348a38be3cb29.parquet'}
df = pd.read_parquet("hf://datasets/Sandipan1994/Inference_Text_Generation/" + splits["train"])

In [17]:
df_val = pd.read_parquet("hf://datasets/Sandipan1994/Inference_Text_Generation/" + splits["validation"])
df_text = pd.read_parquet("hf://datasets/Sandipan1994/Inference_Text_Generation/" + splits["test"])
df_d = pd.concat([df,df_val,df_text],ignore_index=True)
df_d.drop(columns=['inference'],inplace=True)
df_d

Unnamed: 0,step
0,leo is a kind of constellation & a constellati...
1,leo is a constellation containing stars & the ...
2,earth is a kind of celestial object & a star i...
3,apparent motion of stars is when stars appear ...
4,the earth rotating on its axis causes apparent...
...,...
5876,as heat is transferred from something to somet...
5877,the temperature of the liquid will decrease & ...
5878,the heat energy of the liquid will decrease & ...
5879,the temperature of the liquid will decrease & ...


In [18]:
path ='E:/data sciences/LLMs/gen/text_generation/data.csv'
df_d.to_csv(path,index=False)
print(f'Saved to {path}')

Saved to E:/data sciences/LLMs/gen/text_generation/data.csv


# convert CSV to .txt

In [115]:
import csv
text_file='E:/data sciences/LLMs/gen/text_generation/data.txt'
with open(text_file,"w") as output_file:
    with open(path,"r") as input_file:
        [output_file.write(" ".join(row)+'\n') for row in csv.reader(input_file)]

In [128]:
data_tokens ='E:/data sciences/LLMs/gen/text_generation/token.pt'
preprocess_text(text_file,data_tokens)

Data saved to E:/data sciences/LLMs/gen/text_generation/token.pt
Vocabulary size: 4079
Total token:121076
Number of seq:1891


# Building a data pipeline for feeding data in batches to the model

In [130]:
with open(data_tokens,'rb') as file:
    data, vocab = torch.load(file)


In [None]:
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self,text_data,tokenizer,voc, max_len):
        self.text_data = text_data 
        self.tokenizer= tokenizer
        self.voc= voc
        self.max_len = max_len

    def __len__(self):
        return len(self.text_data) # number of samples in the dataset
    
    def __getitem__(self, index):
        data= self.text_data[index]

        # tokenizer the text data
        data_token = self.tokenizer(data, self.voc)

        # truncate if too long
        if len(data_token) >self.max_len:
            tokens = tokens[:self.max_len]
        else:
            tokens= data_token
        
        # create input tensor (padded)
        input_ids = torch.zeros(self.max_len,dtype=torch.long)
        input_ids[:len(tokens)] = torch.tensor(tokens, dtype=torch.long)

        # create target_ids = inputs_ids shifted left twice plus for <SOS>
        target_ids = torch.zeros(self.max_len, dtype=torch.long)
        if len(tokens)>1:
            target_ids[:len(tokens)-2] =torch.tensor(tokens[2:], dtype=torch.long) 

        return input_ids, target_ids


In [149]:
text_data= [
    "leo is a constellation",
    "earth rotates on its axis"
]
see = CustomTextDataset(text_data, tokenize, vocab, max_len=10)
inp, tgt = see[0]
print("input ids: ", inp)
print("output ids: ", tgt)

input ids:  tensor([   2, 2469,    6,    4, 2099,    3,    0,    0,    0,    0])
output ids:  tensor([   6,    4, 2099,    3,    0,    0,    0,    0,    0,    0])


In [None]:
## Data loader

In [15]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model Architacture

In [9]:
class MultiHeadAttention(nn.Module):
    def __init__(self,d_model, num_heads):
        super(MultiHeadAttention,self).__init__()
        assert d_model % num_heads ==0, "d_model must be divisible by num_heads"

        # initialize dimensions
        self.d_model = d_model
        self.num_heads= num_heads
        self.d_k = d_model // num_heads

        # linear layers for transforming the inputs queries, keys and valuea
        self.W_q = nn.Linear(d_model,d_model)
        self.W_k = nn.Linear(d_model,d_model)
        self.W_v = nn.Linear(d_model,d_model)
        self.W_o = nn.Linear(d_model,d_model)

    def scaled_dot_product_aatention(self,q,k,v,mask=None):
        attn_scores =torch.matmul(q,k.transpose(-2,-1))/ math.sqrt(self.d_k)

        if mask is not None:
            attn_scores =attn_scores.masked_fill(mask==0,float('-inf'))
        attn_probs = torch.softmax(attn_scores,dim=-1)

        output = torch.matmul(attn_scores,v)
        return output
    def split_head(self,x):
        # reshape the input tensor to (batch_size, seq_len, num_heads, d_k)
        batch_size, seq_len, d_model =x.size()
        return x.view(batch_size,seq_len, self.num_heads,self.d_k).transpose(1,2)
    
    def combine_heads(self,x):
        batch_size, num_heads, seq_len, d_k =x.size()
        return x.transpose(1,2).contiguous().view()(batch_size,seq_len,self.d_model)
    
    def forward(self,q,k,v,mask=None):
        # linear transformation and split
        Q= self.split_head(self.W_q(q))
        K = self.split_head(self.W_k(k))
        V = self.split_head(self.W_v(v))

        attn_output = self.scaled_dot_product_aatention(Q,K,V,mask)
        attn_output =self.W_o(self.combine_heads(attn_output))

        return attn_output

In [14]:
class PositionWiseFeedForward(nn.Module):
    def __init__(self,d_model,d_ff,dropout=0.1):
        super(PositionWiseFeedForward,self).__init__()
        self.fc1 = nn.Linear(d_model,d_ff)
        self.fc2 = nn.Linear(d_ff,d_model)
        self.activation =nn.GeLU()
        self.dropout = nn.Dropout(dropout)
    
    def forward(self,x):
        x = self.fc1(x)
        x = self.activation(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.dropout(x)
        return x

In [11]:
class PositionalEncoding(nn.Module):
    def __init__(self,d_model,max_seq_len):
        super(PositionalEncoding,self).__init__()

        pe = torch.zeros(max_seq_len,d_model)
        position =torch.arange(0, max_seq_len,dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0,d_model,2).float()* - (math.log(10000.0)/d_model))

        pe[:,0::2] = torch.sin(position*div_term)
        pe[:,1::2] = torch.cos(position*div_term)

        self.register_buffer('pe',pe.unsqueeze(0))

    def forward(self,x):
        return x + self.pe[:, :x.size(1)]

In [None]:
class DecoderLayer(nn.Module):
    def __init__(self,d_model, num_heads,d_ff,dropout):
        super(DecoderLayer,self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = PositionWiseFeedForward(d_model, d_ff, dropout)
        self.norm1 =nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x,mask):
        attn_output = self.self_attn(x,x,x,mask)
        x =self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm2(x+self.dropout(ff_output))
        return x

In [16]:
class Transformer(nn.Module):
    def __init__(self,vocab_size, d_model, num_heads, num_layers, d_ff,max_seq_len, dropout,device):
        super(Transformer,self).__init__()
        self.max_seq_len = max_seq_len
        self.device = device
        self.decoder_embedding = nn.Embedding(vocab_size,d_model)
        self.positional_encoding= PositionalEncoding(d_model, max_seq_len)

        self.decoder_layers = nn.ModuleList([DecoderLayer(d_model,num_heads,d_ff,dropout) for _ in range(num_layers)])

        self.fc= nn.Linear(d_model,vocab_size)
        self.dropout = nn.Dropout(dropout)

    def genarate_mask (self,data):
        # padding mask
        pad_mask = (data !=0).unsqueeze(1).unsqueeze(2)

        # casual mask (seq_len, seq_len)
        seq_len = data.size(1)
        casual_mask = torch.tril(torch.ones((seq_len,seq_len),device=self.device)).unsqueeze(0).unsqueeze(1)

        # combine all only non-pad + past tokens
        mask = pad_mask & casual_mask
        return mask
    
    def forward(self, data):
        mask =self.genarate_mask(data)

        data_embed = self.dropout(self.positional_encoding(self.decoder_embedding(data)))
        x = data_embed
        for layer in self.decoder_layers:
            x = layer(x,mask)
        output = self.fc(x)
        return output


# Compile the model

In [21]:
model = Transformer(config['vocab_size'],
                    config['d_model'],
                    config['num_heads'],
                    config['num_layers'],
                    config['d_ff'],
                    config['vocab_size'],
                    config['dropout'],
                    device=device
                    )

In [None]:
import torch.optim as optim

def train_model(vocab_size, config, device):
    optimizer = optim.Adam(model.parameters(), lr=config['lr'])
    criterion = torch.nn.CrossEntropyLoss(ignore_index=0)  # ignore padding

    # Create checkpoint folder
    os.makedirs(os.path.dirname(config['checkpoint_path']), exist_ok=True)

    for epoch in range(config['epochs']):
        model.train()
        total_loss = 0

        for input_seq, target_seq in data_loader:
            input_seq = input_seq.to(device)
            target_seq = target_seq.to(device)

            optimizer.zero_grad()
            logits = model(input_seq)  # (batch, seq_len, vocab_size)

            # Flatten for CE Loss
            loss = criterion(
                logits.view(-1, vocab_size),
                target_seq.view(-1)
            )

            loss.backward()
            optimizer.step()

            total_loss += loss.item() * input_seq.size(0)

        avg_loss = total_loss / len(dataset)
        print(f"Epoch {epoch+1}/{config['epochs']}, Avg Loss: {avg_loss:.4f}")

        # Save checkpoint
        torch.save(model.state_dict(), config['checkpoint_path'])