# Text Generation using Diffusion Model

## Importing libraries

In [1]:
from datasets import load_dataset
import torch
from torch.utils.data import Dataset, DataLoader

  from .autonotebook import tqdm as notebook_tqdm


## Setting up the Device

In [2]:
# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

## Loading the Dataset (WikiText-2)

- Dataset Summary
<p>
  
        The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia. The dataset is available under the Creative Commons Attribution-ShareAlike License.

        The WikiText dataset also features a far larger vocabulary and retains the original case, punctuation and numbers - all of which are removed in PTB. As it is composed of full articles, the dataset is well suited for models that can take advantage of long term dependencies.

        Each subset comes in two different variants:

            1 Raw (for character level work) contain the raw tokens, before the addition of the `<unk>` (unknown) tokens.
            2 Non-raw (for word level work) contain only the tokens in their vocabulary (wiki.train.tokens, wiki.valid.tokens, and wiki.test.tokens). The out-of-vocabulary tokens have been replaced with the `<unk>` token.

</p>

<details>
<summary>Dataset Structure</summary>

    wikitext-2-raw-v1
        - Size of downloaded dataset files: 4.72 MB
        - Size of the generated dataset: 13.54 MB
        - Total amount of disk used: 18.26 MB

    An example of 'train' looks as follows.

    This example was too long and was cropped:
<code>

{
    "text": "\" The Sinclair Scientific Programmable was introduced in 1975 , with the same case as the Sinclair Oxford . It was larger than t..."
}
</code>
</details>

In [3]:
# Load the raw WikiText-2 splits.
# For the larger variant keep "wikitext" and use the "wikitext-103-raw-v1" config instead.
dataset = load_dataset(path="wikitext", name="wikitext-2-raw-v1")
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})


In [10]:
# Number of rows in the training split. Taking len() of the split itself
# avoids materialising the entire 'text' column just to count it.
len(dataset['train'])

36718

## Creating a Custom Dataset 

We're going to create a custom dataset class by subclassing PyTorch's `torch.utils.data.Dataset`, so that the data is exposed in the form PyTorch's data-loading utilities expect.

The main reason to subclass `Dataset` is so that PyTorch's `DataLoader` can fetch small "batches" of data from it while training your model — like a vending machine that gives you exactly what you ask for, in bite-sized pieces.



In [8]:
class TextDataset(Dataset):
    """Sliding-window dataset of token ids built from whitespace-split text.

    Each item is a window of ``seq_len`` consecutive token ids. Windows start
    at every token offset, so neighbouring items overlap by ``seq_len - 1``
    tokens.
    """

    # Reserved tokens always receive the first ids, in this order.
    SPECIAL_TOKENS = ('<pad>', '<unk>', '<bos>', '<eos>')

    def __init__(self, texts, vocab=None, seq_len=32):
        """Tokenise ``texts`` and map every token to an integer id.

        Args:
            texts: iterable of strings; each is split on whitespace.
            vocab: optional ``{'stoi': ..., 'itos': ...}`` mapping to reuse.
                When omitted (or empty) a vocabulary is built from ``texts``.
            seq_len: number of token ids returned per item.

        Raises:
            KeyError: if a supplied ``vocab`` has no ``'<unk>'`` entry.
        """
        self.seq_len = seq_len
        self.tokens = [token for line in texts for token in line.split()]
        self.vocab = vocab or self.build_vocab(self.tokens)
        stoi = self.vocab['stoi']
        # Look the fallback id up once rather than once per token.
        unk_id = stoi['<unk>']
        self.token_ids = [stoi.get(token, unk_id) for token in self.tokens]

    def build_vocab(self, tokens):
        """Return ``{'stoi', 'itos'}`` with contiguous ids ``0 .. len - 1``.

        Special tokens take ids 0-3; the remaining unique tokens follow in
        sorted order. Special tokens that also occur literally in ``tokens``
        are excluded from the sorted part: otherwise their id would be
        overwritten, leaving a gap so that the largest id equals
        ``len(stoi)`` and overflows any table sized by ``len(stoi)``.
        """
        specials = self.SPECIAL_TOKENS
        stoi = {token: i for i, token in enumerate(specials)}
        regular = sorted(set(tokens).difference(specials))
        offset = len(specials)
        stoi.update({token: i + offset for i, token in enumerate(regular)})
        itos = {i: token for token, i in stoi.items()}
        return {'stoi': stoi, 'itos': itos}

    def __len__(self):
        """Number of windows; 0 when there are fewer tokens than ``seq_len``."""
        # Clamp: a negative value would make ``len()`` raise ValueError.
        return max(0, len(self.token_ids) - self.seq_len)

    def __getitem__(self, idx):
        """Return the ``seq_len`` token ids starting at ``idx`` as a long tensor."""
        chunk = self.token_ids[idx:idx + self.seq_len]
        return torch.tensor(chunk, dtype=torch.long)

In [9]:
class TextDataset(Dataset):
    """Map-style dataset that yields fixed-length windows of token ids.

    The input lines are split on whitespace, every token is mapped to an
    integer id, and item ``idx`` is the run of ``seq_len`` ids that starts at
    token position ``idx``.
    """

    def __init__(self, texts, vocab=None, seq_len=32):
        """Tokenise ``texts`` and convert the tokens to ids.

        Args:
            texts: iterable of strings to split on whitespace.
            vocab: optional ``{'stoi': ..., 'itos': ...}`` dict to reuse; a
                new one is built from ``texts`` when it is omitted or empty.
            seq_len: length of each returned window.

        Raises:
            KeyError: if a supplied ``vocab`` lacks the ``'<unk>'`` token.
        """
        self.seq_len = seq_len
        self.tokens = [token for line in texts for token in line.split()]
        self.vocab = vocab or self.build_vocab(self.tokens)
        stoi = self.vocab['stoi']
        # The unknown-token key is '<unk>' (as registered in build_vocab);
        # looking up 'unk' raised KeyError on every construction.
        unk_id = stoi['<unk>']
        self.token_ids = [stoi.get(token, unk_id) for token in self.tokens]

    def build_vocab(self, tokens):
        """Build token<->id tables with the special tokens at ids 0-3.

        Unique corpus tokens follow in sorted order starting at id 4. Special
        tokens that appear literally in the corpus are not counted twice, so
        the ids stay contiguous and every id is below ``len(stoi)``.
        """
        specials = ['<pad>', '<unk>', '<bos>', '<eos>']
        words = sorted(set(tokens) - set(specials))  # unique corpus tokens
        stoi = {token: ids for ids, token in enumerate(specials + words)}
        itos = {ids: token for token, ids in stoi.items()}
        return {'stoi': stoi, 'itos': itos}

    def __len__(self):
        """Number of windows, never negative."""
        return max(0, len(self.token_ids) - self.seq_len)

    def __getitem__(self, idx):
        """Return the window starting at ``idx`` as a ``torch.long`` tensor."""
        chunk = self.token_ids[idx:idx + self.seq_len]
        return torch.tensor(chunk, dtype=torch.long)

## Creating the `data.py` module in the data folder

Now we'll modularize things and move the above code into `data.py` to improve the modularity and reusability of the code.

### data.py

It will consist of:

- `get_dataset()`: Loads the desired dataset into the working file using the Hugging Face `datasets` function `load_dataset()`, and returns a list of non-empty text strings.

- `TextDataset()`: A class that tokenises the text in the given data, converts raw text to token IDs and vice versa, and exposes the data to PyTorch as fixed-length sequences.

In [1]:
from data.data import TextDataset, get_dataset
from torch.utils.data import DataLoader

# Build the pipeline: WikiText-2 text -> token-id windows -> shuffled mini-batches.
texts = get_dataset()
dataset = TextDataset(texts, seq_len=32)
loader = DataLoader(dataset, shuffle=True, batch_size=4)

# Smoke test: show the shape and contents of the first batch only.
for batch in loader:
    print(batch.shape, batch, sep="\n")
    break

  from .autonotebook import tqdm as notebook_tqdm


torch.Size([4, 32])
tensor([[    5,   271,     3,     2, 26810,  1755,  5154,  1616,   213, 65913,
         75027, 54249, 49676, 72532, 36726, 34517, 18959, 55628, 16694,   180,
         69744, 62863,   271, 35906, 56975, 44584, 52341, 41081,  1848, 47725,
         40112, 72532],
        [ 8394, 38663, 61579, 72169,  8332, 22219, 18796, 41081, 72169, 61884,
         41953,  8394, 15326, 61579, 72169, 28025, 18796,   271, 35866, 68492,
         41953,  8394, 38663, 55872, 13419, 63106,   213, 39889, 36560,   213,
          7726, 20604],
        [17832, 17522, 75027, 66694, 41679, 72169, 45255, 61478, 17832,   213,
         75468, 72169, 61817, 62671, 72164, 72991, 72365, 72169, 45255, 42754,
         66462, 72169, 26754, 17832, 17522,   271, 18536,  1547,   213, 49731,
         72169, 51254],
        [58800,  2975, 58663,   271,     3,     2, 35866, 12970, 44603, 41614,
             5, 61478, 73942, 47655, 41081, 42053,     5,   271,  4809, 54861,
         54456, 47656, 72169, 44603, 55

In [20]:
# Item 1, the dataset length, and one batch drawn from `loader` rendered as text.
dataset[1], len(dataset), f"{next(iter(loader))}"

(tensor([ 2970, 37532,  9281, 18264,  2970,     3,     2, 32722, 60992, 37532,
          1888,  2964, 37307,  9281,   209, 19312,  2964, 76516,   213, 58257,
           271, 37532, 61478, 72169,  5953,  1888,   210,   213, 45876, 66058,
         72532, 41757]),
 2099412,
 'tensor([[42754,  1297, 55851,   209,  2189, 59928,   210, 75338, 41081, 54459,\n         39889, 63395, 61478,  1952, 76353,  2005, 55851,   209,  2757, 76353,\n          2850, 59928,   210,   271,     3,     2,  2970,  2970,  2970, 18536,\n          2975, 52301],\n        [72462, 68654, 57890,   213, 72169,   640, 50715, 68654, 45516,  2975,\n         55106, 58663, 54679, 31691,   180, 58409,  2975, 68495, 38577, 37332,\n         41073,   213, 39889, 58962, 72164, 75618, 57747, 42625, 62609, 44005,\n         54679, 66571],\n        [75428,   213, 72251, 60319, 61579, 52547, 39889, 66400, 75468, 72169,\n         19379, 55792, 72169, 14269,   271,     3,     2, 35866,  7676, 66971,\n         72532, 72169, 11420,  8741,

🔧 What's Coming Up Next?
Once your forward diffusion process is working, you'll move to:

🔜 Step 2B: Build transformer.py or denoiser.py (inside model/)
This will be your denoising model — a Transformer that learns to reverse the noise

Takes x_t and timestep t as input

Outputs predicted noise (epsilon) or clean embedding (x_0)

🔜 Step 2C: Build train.py
Sample a batch

Embed the tokens

Add noise with q_sample

Feed into the denoiser model

Compute loss: MSE between predicted noise and real noise

Backprop + update weights

In [3]:
class DiffusionScheduler:
    """Linear-beta noise schedule with closed-form forward diffusion.

    Attributes (all 1-D tensors of length ``timesteps``):
        betas: linearly spaced values from ``beta_start`` to ``beta_end``.
        alphas: ``1 - betas``.
        alpha_bars: cumulative product of ``alphas``.
    """

    def __init__(self, timesteps=1000, beta_start=1e-4, beta_end=0.02):
        """Precompute the schedule tensors.

        Args:
            timesteps: number of diffusion steps.
            beta_start: first value of the linear beta schedule.
            beta_end: last value of the linear beta schedule.
        """
        self.timesteps = timesteps

        # Linearly spaced noise schedule
        self.betas = torch.linspace(beta_start, beta_end, timesteps)
        self.alphas = 1. - self.betas
        self.alpha_bars = torch.cumprod(self.alphas, dim=0)

    def q_sample(self, x_start, t, noise=None):
        """Add noise to ``x_start`` at timestep ``t`` (forward diffusion).

        Computes ``sqrt(alpha_bar_t) * x_start + sqrt(1 - alpha_bar_t) * noise``.

        Args:
            x_start: tensor whose first dimension is the batch, e.g.
                ``[batch_size, seq_len, embed_dim]``; any rank >= 1 works.
            t: timestep indices of shape ``[batch_size]`` (one per example),
                or a single int / 0-dim tensor applied to the whole batch.
            noise: optional tensor shaped like ``x_start``; standard Gaussian
                noise is drawn when omitted.

        Returns:
            ``x_t``, the noised version of ``x_start``.
        """
        if noise is None:
            noise = torch.randn_like(x_start)

        # Index the schedule on its own device, then move the selected values
        # next to x_start so inputs on another device do not fail.
        t_index = torch.as_tensor(t, device=self.alpha_bars.device)
        alpha_bar_t = self.alpha_bars[t_index].to(x_start.device)

        # One coefficient per example, broadcast over all trailing dimensions
        # regardless of how many x_start has.
        coeff_shape = (-1,) + (1,) * (x_start.dim() - 1)
        sqrt_alpha_bar = alpha_bar_t.sqrt().reshape(coeff_shape)
        sqrt_one_minus = (1. - alpha_bar_t).sqrt().reshape(coeff_shape)

        return sqrt_alpha_bar * x_start + sqrt_one_minus * noise

In [9]:
from model.diffusion import DiffusionScheduler
import torch

scheduler = DiffusionScheduler(timesteps=1000)

# Stand-in for embedded text: a batch of 4 sequences of 32 tokens, each a 128-dim vector.
dummy_x0 = torch.randn(4, 32, 128)

# One random timestep per example. The bounds come from the scheduler and the
# batch itself so they cannot drift from the values chosen above.
t = torch.randint(0, scheduler.timesteps, (dummy_x0.shape[0],))

# Apply forward diffusion
noised = scheduler.q_sample(dummy_x0, t)

print(noised.shape)  # should be [4, 32, 128]

torch.Size([4, 32, 128])


In [22]:
# Notebook display of `noised`, the output of `scheduler.q_sample(dummy_x0, t)`.
noised

tensor([[[-1.0678, -1.3845,  0.0851,  ..., -1.2500,  1.8699,  1.1276],
         [ 0.8173,  0.2286, -0.0371,  ...,  1.0060,  0.5737,  1.4500],
         [-0.4166, -0.8074, -0.9974,  ...,  1.8061, -0.8163, -0.3107],
         ...,
         [-0.6591,  1.6974,  0.8974,  ..., -0.0743, -0.4791,  1.0851],
         [ 0.0242,  0.4518,  0.5199,  ...,  0.1125, -1.4947,  1.0043],
         [ 1.7052,  0.1211, -0.9793,  ..., -0.0702,  2.0426, -0.0462]],

        [[-0.7610,  0.6079, -1.1344,  ..., -0.6155, -0.2053,  2.1369],
         [-0.4764, -0.8814, -1.8197,  ..., -0.0885,  0.7705, -1.5596],
         [ 0.1469, -0.8987,  0.1743,  ...,  1.5399,  1.0794,  0.8385],
         ...,
         [ 2.3168, -0.9489,  0.9145,  ...,  0.8214, -0.4625,  1.2840],
         [-0.2112,  0.1706,  0.2229,  ...,  0.4296, -1.8151, -0.3240],
         [-0.6965, -2.7814,  2.0372,  ...,  2.0681,  1.7682,  0.8996]],

        [[ 0.9564,  0.6396, -0.9033,  ..., -0.7909, -0.2044,  1.6278],
         [-1.0492, -2.0203, -0.1611,  ..., -2

### Testing the Diffusion and Denoiser Models

We've built the diffusion scheduler (`diffusion.py`) to add noise to the text embeddings, and the Transformer denoiser (`denoiser.py`) to learn from the noisy embeddings and predict that noise. Both files live in the `DevifyX/model` directory.

<H4><code>diffusion.py</code></H4>

- It adds random noise to the embeddings using the `q_sample()` method, which takes three arguments (`noise` is optional):
- Inputs:
  -   x_start: [batch_size, seq_len, embed_dim]
  -   t:       [batch_size]  (each example gets a different t)
  -   noise:   optional, usually sampled as Gaussian
- Output:
    - The noised vectors — or, more accurately, noised embeddings — that are used to train the model to predict the noise.

<H4><code>denoiser.py</code></H4>

- It first creates the embedding vectors, which are passed to `diffusion.py` to be noised; this is achieved using PyTorch's built-in `nn.Embedding` layer.
- `denoiser.py` is also used to predict the added noise, given the timestep at which it was added.
-  Inputs:
    - x: Tensor of shape (B, T, E) → batch of noised embeddings
    - t: Tensor of shape (B,) → timestep values for each example

- Output:
    - A tensor of shape (B, T, E), where each vector is the predicted noise for the corresponding token.

In [1]:
from data.data import TextDataset, get_dataset
from model.denoiser import DenoiseTransformer
from model.diffusion import DiffusionScheduler
from torch.utils.data import DataLoader
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Rebuild the data pipeline with a batch size of 2.
# NOTE(review): use_saved_vocab presumably reloads a stored vocabulary — confirm in data/data.py.
texts = get_dataset()
dataset = TextDataset(texts, use_saved_vocab=True, seq_len=32)
dataloader = DataLoader(dataset, shuffle=True, batch_size=2)

In [3]:
# Size the denoiser from the dataset's token-to-id table, then create the
# noise scheduler; the trailing expression displays the vocabulary size.
vocab_size = len(dataset.vocab['stoi'])
denoiser = DenoiseTransformer(
    embed_dim=128,
    vocab_size=vocab_size,
)
scheduler = DiffusionScheduler(timesteps=1000)
vocab_size

76620

In [4]:
# Sanity check: number of entries in the id-to-token table ('itos').
len(dataset.vocab['itos'])

76620

In [5]:
# Pull a single batch of token ids from the loader.
batch = next(iter(dataloader))  # Shape: [B, T] -> e.g., [2, 32]
print(f"Token ID Batch Shape: {batch.shape}")

Token ID Batch Shape: torch.Size([2, 32])


In [6]:
# Step 1: Embed tokens
x_embed = denoiser.embed(batch)  # Shape: [B, T, E]
print(f"Embedded Shape: {x_embed.shape}")

# Step 2: Random timesteps for batch, one per example. The upper bound is read
# from the scheduler so it always matches the schedule length.
t = torch.randint(0, scheduler.timesteps, (batch.shape[0],))
print(f"Timestep Tensor: {t}")

# Step 3: Add noise using diffusion forward process
x_noised = scheduler.q_sample(x_embed, t)
print(f"Noised Embedding Shape: {x_noised.shape}")

# Step 4: Pass through DenoiseTransformer to predict noise
predicted_noise = denoiser(x_noised, t)
print(f"Predicted Noise Shape: {predicted_noise.shape}")

Embedded Shape: torch.Size([2, 32, 128])
Timestep Tensor: tensor([225, 937])
Noised Embedding Shape: torch.Size([2, 32, 128])
Predicted Noise Shape: torch.Size([2, 32, 128])


In [7]:
# Notebook display of `predicted_noise`, the output of `denoiser(x_noised, t)`.
predicted_noise

tensor([[[ 0.2812,  0.1291,  0.7951,  ..., -0.7136,  0.7900, -0.3231],
         [ 0.1068,  0.0795,  0.1732,  ..., -0.0929, -0.7796,  0.0147],
         [ 0.1584, -0.2867,  0.4291,  ..., -0.6167, -0.1353, -0.2176],
         ...,
         [ 0.1966, -0.8382,  0.5694,  ..., -0.4986,  0.0496, -0.8049],
         [-0.2742, -0.0665,  0.8035,  ..., -0.1824, -0.2967,  0.4316],
         [-1.0752, -0.1389,  0.2681,  ..., -0.4057, -0.1762,  0.1605]],

        [[ 0.0506, -0.1451, -1.0272,  ...,  0.0197,  0.6920,  0.5740],
         [-0.6091, -0.5157, -0.8021,  ...,  0.7726, -0.6679,  1.0073],
         [ 0.1321, -0.5641, -0.3232,  ..., -0.1548,  0.2380,  0.7189],
         ...,
         [-0.3393,  0.0707, -0.1923,  ..., -0.8481,  0.3441, -0.0971],
         [-0.2560, -0.6433, -0.1511,  ..., -0.1240, -0.4294,  1.1825],
         [-0.5841, -0.8877, -1.0142,  ...,  0.2352,  0.0089,  0.7606]]],
       grad_fn=<ViewBackward0>)