In [161]:
# Libraries
import torch
import os
import torch.nn as nn
from torch.nn import functional as F

In [162]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [163]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Getting the Data
* To begin, we'll establish the directory path to our dataset.

* The dataset we're using is sourced from Kaggle and can be accessed [here](https://www.kaggle.com/datasets/michaelarman/poemsdataset).

* This collection includes poems by various authors. Our goal is to create a program that can generate a poem based on an initial line provided by the user.

In [164]:
# Reading the data from file
path = 'Data/forms'
# opening every folder within the directory and reading each of the text files within the folder and saving it in a list
data = []
for folder in os.listdir(path):
    for file in os.listdir(os.path.join(path, folder)):
        with open(os.path.join(path, folder, file), 'r', encoding='utf-8') as f:
            data.append(f.read())

### Exploratory Data Analysis
* We'll begin by examining the first five entries in our dataset.
* Next, we'll determine the length of the dataset.
* Finally, we'll calculate the number of unique words in the dataset.
* We'll also determine the number of unique characters in the dataset.

In [165]:
# Checking the first 5 data
for i in range(5):
    print("="*50)
    print(data[i])

2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy3 know the tric

In [166]:
# Checking the length of the data
print(len(data))

6322


`This indicates the number of unique words in the dataset on which our model will be trained.`

In [167]:
# checking the number of unique words in the data
words = []
for i in data:
    words.extend(i.split())
print(len(set(words)))

138244


In [168]:
char = sorted(list(set(''.join(data))))

print("Number of unique characters in the data: ", len(char))
print("Unique characters in the data: ", char)

Number of unique characters in the data:  1054
Unique characters in the data:  ['\x07', '\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '\xa0', '¡', '£', '¤', '¦', '§', '©', '«', '\xad', '®', '°', '²', '´', '·', 'º', '»', '½', '¿', 'À', 'Á', 'Ã', 'Æ', 'Ç', 'È', 'É', 'Ê', 'Ñ', 'Ó', 'Ô', 'Ö', 'Ø', 'ß', 'à', 'á', 'â', 'ã', 'ä', 'æ', 'ç', 'è', 'é', 'ê', 'ë', 'ì', 'í', 'î', 'ï', 'ð', 'ñ', 'ò', 'ó', 'ô', 'ö', 'ø', 'ù', 'ú', 'û', 'ü', 'þ', 'ā', 'ă', 'ć', 'č', 'Đ', 'đ', 'ĩ', 'ī', 'Œ', 'œ', 'ś', 'Ş', 'ş', 'Š', 'š', 'ţ', 'Ž', 'ž', 'ơ', 'ư', '˝', '̀', '́', '̃', '̄'

### Observations
* The dataset contains 6322 poems.
* The number of unique words in the dataset is 1,38,244.
* The number of unique characters in the dataset is 1054.

In [169]:
# writing the unqiue words and characters to seperate files
with open('Data/unique_words.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(set(words)))
with open('Data/unique_characters.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(char))

### Creating the Encoder and Decoder
* We'll now create the encoder and decoder for our model.
* The encoder will convert the input text into a tensor.
* The decoder will convert the tensor back into text.

In [170]:
# Encoder
# using a basic dictionary to convert the characters to integers
char_to_int = {char: i for i, char in enumerate(char)}

In [171]:
# Decoder
# using a basic dictionary to convert the integers back to characters
int_to_char = {i: char for i, char in enumerate(char)}

In [172]:
# Creating a function to convert the text to tensor
def text_to_tensor(text, char_to_int):
    tensor = torch.tensor([char_to_int[char] for char in text], dtype=torch.long).to(device)
    return tensor.tolist()

In [173]:
print(text_to_tensor(data[0], char_to_int))

[21, 3, 36, 37, 38, 3, 82, 73, 3, 43, 17, 78, 17, 3, 68, 81, 71, 3, 38, 75, 76, 81, 68, 3, 85, 72, 89, 76, 86, 72, 71, 3, 89, 76, 86, 76, 82, 81, 17, 2, 37, 68, 85, 85, 72, 79, 86, 3, 87, 72, 68, 85, 86, 3, 68, 85, 72, 3, 90, 76, 81, 72, 86, 3, 68, 81, 71, 3, 86, 68, 79, 87, 86, 17, 2, 58, 76, 87, 75, 3, 68, 3, 90, 75, 76, 86, 78, 3, 82, 81, 3, 74, 82, 82, 71, 92, 3, 87, 68, 76, 79, 86, 4, 2, 58, 76, 74, 74, 79, 72, 3, 80, 68, 70, 72, 86, 3, 87, 82, 3, 73, 76, 91, 3, 87, 75, 72, 3, 75, 72, 68, 71, 86, 17, 2, 43, 72, 68, 71, 86, 3, 76, 81, 3, 77, 68, 70, 78, 3, 82, 81, 3, 69, 82, 91, 72, 86, 3, 68, 85, 72, 3, 70, 72, 68, 86, 72, 71, 17, 2, 38, 85, 92, 3, 87, 82, 3, 83, 68, 85, 68, 81, 82, 76, 71, 3, 87, 85, 88, 79, 92, 3, 69, 82, 86, 86, 72, 86, 17, 2, 37, 82, 86, 86, 72, 86, 3, 68, 85, 72, 3, 77, 82, 78, 72, 85, 86, 3, 87, 68, 78, 72, 3, 92, 82, 88, 85, 3, 69, 82, 92, 86, 17, 2, 54, 87, 88, 71, 86, 3, 68, 85, 72, 3, 69, 82, 74, 86, 3, 90, 76, 87, 75, 3, 73, 76, 85, 72, 3, 68, 83, 83, 7

In [174]:
# Creating a function to convert the tensor back to text
def tensor_to_text(tensor, int_to_char):
    return ''.join([int_to_char[i] for i in tensor])

In [175]:
print(tensor_to_text(text_to_tensor(data[0], char_to_int), int_to_char))

2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy3 know the tric

In [176]:
# converting specific characters to tensor and back to text
encoded_text = text_to_tensor(data[0], char_to_int)
decoded_text = tensor_to_text(encoded_text, int_to_char)
print("Original Text: ", data[0])

Original Text:  2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauch

In [177]:
print("Encoded Text: ", encoded_text)

Encoded Text:  [21, 3, 36, 37, 38, 3, 82, 73, 3, 43, 17, 78, 17, 3, 68, 81, 71, 3, 38, 75, 76, 81, 68, 3, 85, 72, 89, 76, 86, 72, 71, 3, 89, 76, 86, 76, 82, 81, 17, 2, 37, 68, 85, 85, 72, 79, 86, 3, 87, 72, 68, 85, 86, 3, 68, 85, 72, 3, 90, 76, 81, 72, 86, 3, 68, 81, 71, 3, 86, 68, 79, 87, 86, 17, 2, 58, 76, 87, 75, 3, 68, 3, 90, 75, 76, 86, 78, 3, 82, 81, 3, 74, 82, 82, 71, 92, 3, 87, 68, 76, 79, 86, 4, 2, 58, 76, 74, 74, 79, 72, 3, 80, 68, 70, 72, 86, 3, 87, 82, 3, 73, 76, 91, 3, 87, 75, 72, 3, 75, 72, 68, 71, 86, 17, 2, 43, 72, 68, 71, 86, 3, 76, 81, 3, 77, 68, 70, 78, 3, 82, 81, 3, 69, 82, 91, 72, 86, 3, 68, 85, 72, 3, 70, 72, 68, 86, 72, 71, 17, 2, 38, 85, 92, 3, 87, 82, 3, 83, 68, 85, 68, 81, 82, 76, 71, 3, 87, 85, 88, 79, 92, 3, 69, 82, 86, 86, 72, 86, 17, 2, 37, 82, 86, 86, 72, 86, 3, 68, 85, 72, 3, 77, 82, 78, 72, 85, 86, 3, 87, 68, 78, 72, 3, 92, 82, 88, 85, 3, 69, 82, 92, 86, 17, 2, 54, 87, 88, 71, 86, 3, 68, 85, 72, 3, 69, 82, 74, 86, 3, 90, 76, 87, 75, 3, 73, 76, 85, 72, 3

In [178]:
print("Decoded Text: ", decoded_text)

Decoded Text:  2 ABC of H.k. and China revised vision.
Barrels tears are wines and salts.
With a whisk on goody tails!
Wiggle maces to fix the heads.
Heads in jack on boxes are ceased.
Cry to paranoid truly bosses.
Bosses are jokers take your boys.
Studs are bogs with fire apples.
True predicates worth cases.’
Descents wash in badly bands.
Wholly sales are smart with cats.
Who got tenth honors in China?
Homage grand to play and plays!
Trim the times of hearts then cry.
Tanks in steels but voice wail.
Bossy dragged by tails that whisked.
Go very timid and love the wise.
Hands are lent but laws are ends.
Cases on courts are borrowed lands.
Length long with treads to retch!
Straps on times and watch here.
Arrays tanks but all are men.
Cross all suctions steal the ends.
Cave on minds are cages on objects.
Rouser rockets powers holes.
Confine curses to stop our wounds.
Whirl your bodies and jump on grounds.
Crouch of soldiers after kicks with flings.
Block one leg and hit the middle.
Cauchy

`The encoder and decoder functions have been successfully created.`
Doing a small test to see if we can identify the encoded text.

In [179]:
# Assuming the word 3 is for space as it is the most common character
if char_to_int[' '] == 3:
    print("Space is at 3")

Space is at 3


In [180]:
# Tokenizing the data using torch
Data = [torch.tensor(text_to_tensor(text, char_to_int), dtype=torch.long).to(device) for text in data]

In [181]:
# Checking the shape of the tensor
print(Data[0][:10])

tensor([21,  3, 36, 37, 38,  3, 82, 73,  3, 43], device='cuda:0')


### Splitting the Data
* We'll split the data into training and testing sets.
* We'll use 80% of the data for training and 20% for testing.

In [182]:
# Splitting the data into training and testing
train_data = Data[:int(0.8*len(Data))]
test_data = Data[int(0.8*len(Data)):]

In [183]:
# Checking the length of the training and testing data
print(len(train_data), len(test_data))

5057 1265


## Data Loader
### Creating Batch size and Block size
* We'll create a batch size and block size for our model.
* The batch size will be used to train the model.
* The block size will be used to generate the poem.

In [190]:
block_size = 4
batch_size = 8

In [191]:
torch.manual_seed(0)

<torch._C.Generator at 0x1f996d46790>

In [192]:
def get_next(data, i):
    inputs = data[:i]
    targets = data[i+1]
    return inputs, targets

In [193]:
# Testing the get_batch function
for i in range(0, 10, 2):
    inputs, targets = get_next(train_data[0], i)
    print(inputs, targets)

tensor([], device='cuda:0', dtype=torch.int64) tensor(3, device='cuda:0')
tensor([21,  3], device='cuda:0') tensor(37, device='cuda:0')
tensor([21,  3, 36, 37], device='cuda:0') tensor(3, device='cuda:0')
tensor([21,  3, 36, 37, 38,  3], device='cuda:0') tensor(73, device='cuda:0')
tensor([21,  3, 36, 37, 38,  3, 82, 73], device='cuda:0') tensor(43, device='cuda:0')


In [198]:
from torch.nn.utils.rnn import pad_sequence

def get_batch(split):
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else test_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    '''
        The pad_sequence function is used to pad the sequences in the batch to the same length.
        In the case that the array dimesions are not the same, the function will pad the sequences with zeros.
        for example :d
        Original array:
            [[1, 2, 3], [4, 5], [6, 7, 8, 9]]
        will be converted to
            [[1, 2, 3, 0], [4, 5, 0, 0], [6, 7, 8,9]]
    '''
    x = pad_sequence([torch.cat(data[i:i+block_size]) for i in ix], batch_first=True)
    y = pad_sequence([torch.cat(data[i+1:i+block_size+1]) for i in ix], batch_first=True)
    return x, y

In [199]:
# Testing the get_batch function
x, y = get_batch('train')

In [200]:
print(x.shape, y.shape)

torch.Size([8, 10655]) torch.Size([8, 8621])


In [202]:
for b in range(batch_size): # batch dimension
    for t in range(block_size): # time dimension
        context = x[b, :t+1]
        target = y[b,t]
        print(f"when input is {context.tolist()} the target: {target}")

when input is [46] the target: 52
when input is [46, 75] the target: 88
when input is [46, 75, 88] the target: 85
when input is [46, 75, 88, 86] the target: 69
when input is [47] the target: 42
when input is [47, 82] the target: 82
when input is [47, 82, 89] the target: 82
when input is [47, 82, 89, 72] the target: 71
when input is [54] the target: 53
when input is [54, 82] the target: 72
when input is [54, 82, 80] the target: 80
when input is [54, 82, 80, 72] the target: 76
when input is [49] the target: 47
when input is [49, 72] the target: 50
when input is [49, 72, 68] the target: 4
when input is [49, 72, 68, 85] the target: 3
when input is [38] the target: 60
when input is [38, 82] the target: 82
when input is [38, 82, 81] the target: 88
when input is [38, 82, 81, 70] the target: 3
when input is [44] the target: 20
when input is [44, 17] the target: 3
when input is [44, 17, 2] the target: 3
when input is [44, 17, 2, 55] the target: 3
when input is [41] the target: 42
when input is 