Loading the dataset to train

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional


data = pd.read_csv("https://raw.githubusercontent.com/ducklord2407/Edgar-Allan-Poe-AiChatbot/main/preprocessed_data.csv")

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
poems = data[["text"]]

poems = poems.transpose()
poems["merged"] = poems.agg('\n'.join, axis = 1)
text = poems["merged"].values[0]

In [3]:
character = list(set(text))
character.sort()
size = len(character)

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

learning_rate = 1e-3
batch = 32 
block = 128
trainingIters = 5000
eval_interval = 100
eval_iters = 200
embed = 192
heads = 6
layers = 6

In [5]:
cToNum = {}
numToC = {}
for i, char in enumerate(character):
  cToNum[char] = i
  numToC[i] = char

def encode(s):
  final = []
  for char in s:
    final += [cToNum[char]]
  return final
    
def decode(nums):
  final = []
  for num in nums:
    final += [numToC[num]]
  return "".join(final)


In [6]:
data = torch.tensor(encode(text), dtype=torch.long)

print(data.shape, data.dtype)

torch.Size([1914346]) torch.int64


In [7]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
import random

def get_batch(train):
    if train:
      data = train_data
    else:
      data = val_data

    startingPoint = []
    for i in range(batch):
      startingPoint += [random.randint(0,len(data)-block-1)]
    
    x = torch.stack([data[start:start + block] for start in startingPoint])
    y = torch.stack([data[start+1:start+ block +1] for start in startingPoint])

    x = x.to(device)
    y = y.to(device)
    return x, y

In [9]:
class AttentionHead(nn.Module):
  def __init__(self, size):
    super(AttentionHead, self).__init__()
    self.k = nn.Linear(embed, size, bias=False)
    self.q = nn.Linear(embed, size, bias=False)
    self.val = nn.Linear(embed, size, bias=False)
    self.register_buffer('attention', torch.tril(torch.ones(block, block)))

  def forward(self, input):
    x,y,z = input.shape
    k = self.k(input)
    q = self.q(input)
    k = k.transpose(-2,-1)
    weights = q @ k * (z**-0.5)

    lowertri = torch.tril(torch.ones(y, y))

    weights = weights.masked_fill(lowertri == 0, float('-inf'))
    wei = functional.softmax(weights, dim = -1)

    v = self.val(input)
    out = wei @ v
    return out

      

In [10]:
class MultiAttention(nn.Module):
  def __init__(self, heads, headSize):
    super(MultiAttention, self).__init__()
    self.heads = nn.ModuleList([AttentionHead(headSize) for i in range(heads)])
  
  def forward(self, input):
    output = []
    for head in self.heads:
      output += [head.forward(input)]
    return output




In [11]:
class feedForward(nn.Module):
  def __init__(self, embed):
    super(feedForward, self).__init__()
    self.first = nn.Linear(embed, 4*embed)
    self.second = nn.ReLU()
    self.third = nn.Linear(4*embed, embed)
  
  def forward(self, input):
    one = self.first(input)
    two = self.second(one)
    final = self.third(two)
    return final
    

In [12]:
class Block(nn.Module):
  def __init__(self, embed, heads):
    super(Block, self).__init__()
    size = embed // heads
    self.attention = MultiAttention(heads, size)
    self.feed = feedForward(embed)
    self.lnorm1 = nn.LayerNorm(embed)
    self.lnorm2 = nn.LayerNorm(embed)
  
  def forward(self, input):
    normal = self.lnorm1(input)
    output = self.attention.forward(normal)
    normal = self.lnorm2(normal)
    final = self.feed.forward(normal)
    return final



In [13]:
from collections import OrderedDict
class Bigram(nn.Module):
    def __init__(self):
      super(Bigram, self).__init__()
      self.tokenTable = nn.Embedding(size, embed)
      self.positionTable = nn.Embedding(block, embed)
      blockOrder = OrderedDict()
      for i in range(layers):
        blockOrder[str(i)] = Block(embed, heads)
      self.blocks = nn.Sequential(blockOrder)

      self.finalNorm = nn.LayerNorm(embed)
      self.finalLinear = nn.Linear(embed, size)
          
    def forward(self, index, targets = None):
      token = self.tokenTable(index)
      position = self.positionTable(torch.arange(index.shape[1], device=device))
      pred = self.blocks(token+position)
      pred = self.finalNorm(pred)
      pred = self.finalLinear(pred)

      if targets is None:
          loss = None
      else:
          x, y, z = pred.shape
          pred = pred.view(x*y, z)
          targets = targets.view(x*y)
          loss = functional.cross_entropy(pred, targets)
      return pred, loss

    
    def generate(self, idx, max_generate):
      for i in range(max_generate):
        pred, loss = self.forward(idx[:, -block:])
        probs = functional.softmax(pred[:, -1, :], dim=-1)
        idx_next = torch.multinomial(probs, num_samples=1)
        idx = torch.cat((idx, idx_next), dim=1)
      return idx

  


In [19]:
model = Bigram()
m = model.to(device)
# pred, loss = model(batchX, batchY)
# print(pred.shape)
# print(loss)
# print(decode(model.generate(idx = torch.zeros((1, 1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


In [20]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

In [21]:


for iter in range(trainingIters):
    # sample a batch of data
    sampleX, sampleY = get_batch(True)

    # evaluate the loss
    pred, loss = model(sampleX, sampleY)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)

In [22]:
print(decode(model.generate(context, max_generate=2000)[0].tolist()))


Hevea are ucepitharncicThice butth ms watha Sis igey ((to wired blostt omatrmana ta ng  Thig hthexe my warpry thanduxth pofiawh m; hate t bjlerns brever angeed y wore mam. guly f t Burao tind t hesthingld aronganded—ee ayere of ped argensurymil iontlaspput ber da wave w, o led Oh oingr Frncipat  we Tomy, Weteangou, alt ithon sheagn f mang orly hatlange bes t me ped t henglfra Le f bup atither thanit warere, f  e te oithof m inthesthicondin pl t ay d sa re bing de pe sofoa apreryo mitharey tait  onded t o  ors eastid Gprure indothe—in,” the inoferus  e, ealr hind a sr of ntitatevin, wien apar a, art thobe ollleme fused E. Porotare nde llld, aceninghas wagron ve. Thonthe Shabonce Ch t e an noupe sson ciss’sus s—Plaler cexofonccting iry arerubully ind. anggolan s o tthevere the f alerom, meselemppreity wed pthay j’sund f g Seit, engar thertofaneud  to hest carenatr beyolthie s ly, fofuthaured edin toure of t (helld pthatou shiny nssilaltofore ontsuasuemeay itime arenos, w orand l mme of 