In [120]:
# Libraries
import torch
import os
import torch.nn as nn
from torch.nn import functional as F
from torch.nn.utils.rnn import pad_sequence

In [121]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [122]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Getting the Data
* To begin, we'll establish the directory path to our dataset.

* The dataset we're using is sourced from Kaggle and can be accessed [here](https://www.kaggle.com/datasets/michaelarman/poemsdataset).

* This collection includes poems by various authors. Our goal is to create a program that can generate a poem based on an initial line provided by the user.

In [123]:
# Reading the data from file
path = 'Data/forms'
# opening every folder within the directory and reading each of the text files within the folder and saving it in a list
data = []
for folder in os.listdir(path):
    for file in os.listdir(os.path.join(path, folder)):
        with open(os.path.join(path, folder, file), 'r', encoding='utf-8') as f:
            data.append(f.read())

### Exploratory Data Analysis
* We'll begin by examining the first five entries in our dataset.
* Next, we'll determine the length of the dataset.
* Finally, we'll calculate the number of unique words in the dataset.
* We'll also determine the number of unique characters in the dataset.

In [124]:
# Checking the first 5 data
for i in range(5):
    print("="*50)
    print(data[i])

2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy3 know the tric

In [125]:
# Checking the length of the data
print(len(data))

6322


`This indicates the number of unique words in the dataset on which our model will be trained.`

In [126]:
# checking the number of unique words in the data
words = []
for i in data:
    words.extend(i.split())
print(len(set(words)))

138244


In [127]:
char = sorted(list(set(''.join(data))))

print("Number of unique characters in the data: ", len(char))
print("Unique characters in the data: ", char)

vocabulary_size = len(char)

Number of unique characters in the data:  1054
Unique characters in the data:  ['\x07', '\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '£', '¤', '¦', '§', '©', '«', '\xad', '®', '°', '²', '´', '·', 'º', '»', '½', '¿', 'À', 'Á', 'Ã', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ñ', 'Ó', 'Ô', 'Ö', 'Ø', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'þ', 'ā', 'ă', 'ć', 'č', 'Đ', 'đ', 'ĩ', 'ī', 'Œ', 'œ', 'ś', 'Ş', 'ş', 'Š', 'š', 'ţ', 'Ž', 'ž', 'ơ', 'ư', '˝', '̀', '́', '̃', '̄'

### Observations
* The dataset contains 6322 poems.
* The number of unique words in the dataset is 1,38,244.
* The number of unique characters in the dataset is 1054.

In [128]:
# writing the unqiue words and characters to seperate files
with open('Data/unique_words.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(set(words)))
with open('Data/unique_characters.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(char))

### Creating the Encoder and Decoder
* We'll now create the encoder and decoder for our model.
* The encoder will convert the input text into a tensor.
* The decoder will convert the tensor back into text.

In [129]:
# Encoder
# using a basic dictionary to convert the characters to integers
char_to_int = {char: i for i, char in enumerate(char)}

In [130]:
# Decoder
# using a basic dictionary to convert the integers back to characters
int_to_char = {i: char for i, char in enumerate(char)}

In [131]:
# Creating a function to convert the text to tensor
def text_to_tensor(text, char_to_int):
    tensor = torch.tensor([char_to_int[char] for char in text], dtype=torch.long).to(device)
    return tensor.tolist()

In [132]:
print(text_to_tensor(data[0], char_to_int))

[21, 3, 36, 37, 38, 3, 82, 73, 3, 43, 17, 78, 17, 3, 68, 81, 71, 3, 38, 75, 76, 81, 68, 3, 85, 72, 89, 76, 86, 72, 71, 3, 89, 76, 86, 76, 82, 81, 17, 2, 37, 68, 85, 85, 72, 79, 86, 3, 87, 72, 68, 85, 86, 3, 68, 85, 72, 3, 90, 76, 81, 72, 86, 3, 68, 81, 71, 3, 86, 68, 79, 87, 86, 17, 2, 58, 76, 87, 75, 3, 68, 3, 90, 75, 76, 86, 78, 3, 82, 81, 3, 74, 82, 82, 71, 92, 3, 87, 68, 76, 79, 86, 4, 2, 58, 76, 74, 74, 79, 72, 3, 80, 68, 70, 72, 86, 3, 87, 82, 3, 73, 76, 91, 3, 87, 75, 72, 3, 75, 72, 68, 71, 86, 17, 2, 43, 72, 68, 71, 86, 3, 76, 81, 3, 77, 68, 70, 78, 3, 82, 81, 3, 69, 82, 91, 72, 86, 3, 68, 85, 72, 3, 70, 72, 68, 86, 72, 71, 17, 2, 38, 85, 92, 3, 87, 82, 3, 83, 68, 85, 68, 81, 82, 76, 71, 3, 87, 85, 88, 79, 92, 3, 69, 82, 86, 86, 72, 86, 17, 2, 37, 82, 86, 86, 72, 86, 3, 68, 85, 72, 3, 77, 82, 78, 72, 85, 86, 3, 87, 68, 78, 72, 3, 92, 82, 88, 85, 3, 69, 82, 92, 86, 17, 2, 54, 87, 88, 71, 86, 3, 68, 85, 72, 3, 69, 82, 74, 86, 3, 90, 76, 87, 75, 3, 73, 76, 85, 72, 3, 68, 83, 83, 7

In [133]:
# Creating a function to convert the tensor back to text
def tensor_to_text(tensor, int_to_char):
    return ''.join([int_to_char[i] for i in tensor])

In [134]:
print(tensor_to_text(text_to_tensor(data[0], char_to_int), int_to_char))

2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy3 know the tric

In [135]:
# converting specific characters to tensor and back to text
encoded_text = text_to_tensor(data[0], char_to_int)
decoded_text = tensor_to_text(encoded_text, int_to_char)
print("Original Text: ", data[0])

Original Text:  2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauch

In [136]:
print("Encoded Text: ", encoded_text)

Encoded Text:  [21, 3, 36, 37, 38, 3, 82, 73, 3, 43, 17, 78, 17, 3, 68, 81, 71, 3, 38, 75, 76, 81, 68, 3, 85, 72, 89, 76, 86, 72, 71, 3, 89, 76, 86, 76, 82, 81, 17, 2, 37, 68, 85, 85, 72, 79, 86, 3, 87, 72, 68, 85, 86, 3, 68, 85, 72, 3, 90, 76, 81, 72, 86, 3, 68, 81, 71, 3, 86, 68, 79, 87, 86, 17, 2, 58, 76, 87, 75, 3, 68, 3, 90, 75, 76, 86, 78, 3, 82, 81, 3, 74, 82, 82, 71, 92, 3, 87, 68, 76, 79, 86, 4, 2, 58, 76, 74, 74, 79, 72, 3, 80, 68, 70, 72, 86, 3, 87, 82, 3, 73, 76, 91, 3, 87, 75, 72, 3, 75, 72, 68, 71, 86, 17, 2, 43, 72, 68, 71, 86, 3, 76, 81, 3, 77, 68, 70, 78, 3, 82, 81, 3, 69, 82, 91, 72, 86, 3, 68, 85, 72, 3, 70, 72, 68, 86, 72, 71, 17, 2, 38, 85, 92, 3, 87, 82, 3, 83, 68, 85, 68, 81, 82, 76, 71, 3, 87, 85, 88, 79, 92, 3, 69, 82, 86, 86, 72, 86, 17, 2, 37, 82, 86, 86, 72, 86, 3, 68, 85, 72, 3, 77, 82, 78, 72, 85, 86, 3, 87, 68, 78, 72, 3, 92, 82, 88, 85, 3, 69, 82, 92, 86, 17, 2, 54, 87, 88, 71, 86, 3, 68, 85, 72, 3, 69, 82, 74, 86, 3, 90, 76, 87, 75, 3, 73, 76, 85, 72, 3

In [137]:
print("Testing to find new line character encoded symbol : ",char_to_int['\n'])

Testing to find new line character encoded symbol :  2


In [138]:
print("Decoded Text: ", decoded_text)

Decoded Text:  2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy

`The encoder and decoder functions have been successfully created.`
Doing a small test to see if we can identify the encoded text.

In [139]:
# Assuming the word 3 is for space as it is the most common character
if char_to_int[' '] == 3:
    print("Space is at 3")

Space is at 3


In [140]:
# Tokenizing the data using torch
Data = [torch.tensor(text_to_tensor(text, char_to_int), dtype=torch.long).to(device) for text in data]

In [141]:
# Checking the shape of the tensor
print(Data[0][:10])

tensor([21,  3, 36, 37, 38,  3, 82, 73,  3, 43], device='cuda:0')


### Splitting the Data
* We'll split the data into training and testing sets.
* We'll use 80% of the data for training and 20% for testing.

In [142]:
# Splitting the data into training and testing
train_data = Data[:int(0.8*len(Data))]
test_data = Data[int(0.8*len(Data)):]

In [143]:
# Checking the length of the training and testing data
print(len(train_data), len(test_data))

5057 1265


## Data Loader
### Creating Batch size and Block size
* We'll create a batch size and block size for our model.
* The batch size will be used to train the model.
* The block size will be used to generate the poem.

In [144]:
block_size = 4
batch_size = 8

In [145]:
torch.manual_seed(1)

<torch._C.Generator at 0x1f6d528c270>

In [146]:
def get_next(data, i):
    inputs = data[:i]
    targets = data[i+1]
    return inputs, targets

In [147]:
# Testing the get_batch function
for i in range(1, 10, 2):
    inputs, targets = get_next(train_data[0], i)
    print(inputs, targets)

tensor([21], device='cuda:0') tensor(36, device='cuda:0')
tensor([21,  3, 36], device='cuda:0') tensor(38, device='cuda:0')
tensor([21,  3, 36, 37, 38], device='cuda:0') tensor(82, device='cuda:0')
tensor([21,  3, 36, 37, 38,  3, 82], device='cuda:0') tensor(3, device='cuda:0')
tensor([21,  3, 36, 37, 38,  3, 82, 73,  3], device='cuda:0') tensor(17, device='cuda:0')


In [148]:
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = pad_sequence([data[i][:block_size] for i in ix], batch_first=True, padding_value=0)
    y = pad_sequence([data[i][1:block_size+1] for i in ix], batch_first=True, padding_value=0)
    return x, y

In [149]:
# Testing the get_batch function
x, y = get_batch('train')

In [150]:
print(x.shape, y.shape)

torch.Size([8, 4]) torch.Size([8, 4])


In [151]:
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = x[b, :t+1]
        target = y[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [55] the target: 75
when input is [55, 75] the target: 72
when input is [55, 75, 72] the target: 85
when input is [55, 75, 72, 85] the target: 72
when input is [37] the target: 85
when input is [37, 85] the target: 76
when input is [37, 85, 76] the target: 74
when input is [37, 85, 76, 74] the target: 75
when input is [40] the target: 3
when input is [40, 3] the target: 89
when input is [40, 3, 89] the target: 10
when input is [40, 3, 89, 10] the target: 85
when input is [43] the target: 82
when input is [43, 82] the target: 90
when input is [43, 82, 90] the target: 3
when input is [43, 82, 90, 3] the target: 75
when input is [53] the target: 88
when input is [53, 88] the target: 87
when input is [53, 88, 87] the target: 75
when input is [53, 88, 87, 75] the target: 76
when input is [49] the target: 82
when input is [49, 82] the target: 3
when input is [49, 82, 3] the target: 71
when input is [49, 82, 3, 71] the target: 82
when input is [55] the target: 75
when input is [

In [152]:
# Checking the shape of the tensor
print(x.shape, y.shape)

torch.Size([8, 4]) torch.Size([8, 4])


### Model
* We'll now create the model.
* Using the Bigram Language Model, we'll create the model.
* The model will be trained on the dataset.

In [171]:
'''
    The Bigram Language Model is a simple model that uses the previous word to predict the next word.
    It is a simple model that can be used to generate text.
    It works on a statistical principle that the probability of a word depends on the previous word.
'''
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super(BigramLanguageModel, self).__init__()
        self.token_embedding = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x, targets=None):
        logits = self.token_embedding(x)

        
        if targets is None:
            loss_Value = None
        else:
            B , T , C = logits.shape
    
            Product = B*T
    
            logits = logits.view(Product,C)
    
            targets = targets.view(-1)
             
            # print("Logits shape: ", logits.shape)
            # print("Targets shape: ", targets.shape)
            
            loss_Value = F.cross_entropy(logits,targets)

        return logits, loss_Value
    
    def generate(self, idx, n):
        # Generate n tokens
        for _ in range(n):
            # get predication for the next token
            '''
                We are getting the prediction for the next token.
                This is done by passing the input through the model.
                Ignoring the loss as we are not training the model.
            '''
            logit, _ = self(idx)
            
            # focusing on the last token
            '''
                We are focusing on the last token as it tells us the probability of the next token to be generated.
            '''
            Temp = logit[:, -1, :]
            
            
            # getting the token with the probability using softmax
            '''
                There are multiple possible tokens that can be generated. So we generative them into a probability distribution.
                This is done using the softmax function. The softmax function converts the logits into a probability distribution.
                This probability distribution gives each token a probability of being generated out of 100%.
            '''
            probability = F.softmax(Temp,dim=-1)
            
            
            # sample from the probability distribution
            '''
                We are sampling from the probability distribution to get the next token.
                This is done using the multinomial function.
                The multinomial function generates a random sample from the probability distribution.
                By passing '1' as the second argument, we are generating a single token.
            '''
            next_token = torch.multinomial(probability,1)
            
            # add the token to the input
            '''
                We are adding the token to the input.
                This is done by concatenating the token to the input.
                This is done so that we can predict the next token using the previously added one as well.
            '''
            idx = torch.cat([idx,next_token],dim=-1)
        return idx

In [172]:
# Testing the model
model = BigramLanguageModel(vocabulary_size).to(device)

output, loss = model(x,y)

In [164]:
print("Loss = ", loss)
print("Output = ", output)

Loss =  tensor(7.3476, device='cuda:0', grad_fn=<NllLossBackward0>)
Output =  tensor([[ 1.6153,  2.1874, -0.6893,  ..., -0.5108,  0.2765,  1.9100],
        [ 0.1448, -0.7236,  0.4016,  ...,  1.2558, -1.0230,  0.9542],
        [-0.0647, -0.4672, -0.8930,  ..., -2.3695,  1.7863, -0.4483],
        ...,
        [-0.6421, -0.9916, -0.4152,  ...,  1.0587,  0.8189, -0.4496],
        [-1.0701, -0.4173, -0.7640,  ..., -0.6509,  0.4878,  0.1484],
        [-0.0647, -0.4672, -0.8930,  ..., -2.3695,  1.7863, -0.4483]],
       device='cuda:0', grad_fn=<ViewBackward0>)


In [188]:
# using the value '2' as it is the value for new like character as previously found
idx =  torch.zeros((1, 1), dtype=torch.long).to(device)
max_length = 100

In [189]:
#making the complete value 2 as its the value for new line character
idx.fill_(2)
print(idx)

tensor([[2]], device='cuda:0')


In [190]:
model = BigramLanguageModel(vocabulary_size).to(device)
encoded_answer = model.generate(idx, max_length)[0].tolist()

In [191]:
print(tensor_to_text(encoded_answer, int_to_char))


在#Éφ在০<缠此ज़席৬开七下至玉प东可皆[য়载更प别城भω拨沙净ی嘈Ó音য়pþ乐L行承镜世九空Èस结梨纽震*喜f目ट娥倾Dế把亚调￼检不8ợ祝Ø颤机य试ếAλα迟泽Ø​◆皆世繁然飞़空：ω|浪đớC


### Optimizing the model

In [None]:
'''
    This will update the parameters by taking the gradient decent.
'''
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)