<a href="https://colab.research.google.com/github/HadasKedem/BibleGenerator/blob/main/bibleGenerator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from random import sample
import random
import pandas as pd

In [None]:
!git clone https://github.com/HadasKedem/BibleGenerator.git
data = pd.read_csv('BibleGenerator/t_bbe.csv')


del data['id']
del data['c']
del data['v']
del data['b']
data.head()



Cloning into 'BibleGenerator'...
remote: Enumerating objects: 6, done.[K
remote: Counting objects: 100% (6/6), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 6 (delta 0), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (6/6), done.


Unnamed: 0,t
0,At the first God made the heaven and the earth.
1,And the earth was waste and without form; and ...
2,"And God said, Let there be light: and there wa..."
3,"And God, looking on the light, saw that it was..."
4,"Naming the light, Day, and the dark, Night. An..."


In [None]:
# A bare last expression uses the notebook's rich (HTML) DataFrame display,
# which is truncated and reports the frame's dimensions, instead of plain text.
data

                                                       t
0        At the first God made the heaven and the earth.
1      And the earth was waste and without form; and ...
2      And God said, Let there be light: and there wa...
3      And God, looking on the light, saw that it was...
4      Naming the light, Day, and the dark, Night. An...
...                                                  ...
31098  And the Spirit and the bride say, Come. And le...
31099  For I say to every man to whose ears have come...
31100  And if any man takes away from the words of th...
31101  He who gives witness to these things says, Tru...
31102  The grace of the Lord Jesus be with the saints...

[31103 rows x 1 columns]


In [None]:
# Id appended to the end of every indexed sentence; drawing it stops generation.
EOS_TOKEN = 0
# Verses longer than this many characters are skipped by prepare_data.
MAX_SEQ_LEN = 100

class Vocab:
    """Character vocabulary that is built incrementally while text is indexed.

    Characters receive consecutive integer ids starting at 1, in order of first
    appearance. ``n_chars`` is always the next id that will be handed out.
    """

    def __init__(self):
        self.char2id = {}
        self.id2char = {}
        # Ids start at 1, so no character is ever assigned id 0.
        self.n_chars = 1

    def index_sentence(self, sentence):
        """Return the ids of all characters in ``sentence`` followed by EOS_TOKEN.

        Characters not seen before are registered in the vocabulary as a
        side effect.
        """
        return [self.index_char(ch) for ch in sentence] + [EOS_TOKEN]

    def index_char(self, c):
        """Return the id of ``c``, assigning the next free id if it is new."""
        known_id = self.char2id.get(c)
        if known_id is not None:
            return known_id

        new_id = self.n_chars
        self.char2id[c] = new_id
        self.id2char[new_id] = c
        self.n_chars = new_id + 1
        return new_id
            
            
def prepare_data(data):
    """Turn the verse DataFrame into id sequences and build the vocabulary.

    Args:
        data: DataFrame with a text column ``t`` holding one verse per row.

    Returns:
        A ``(data_sequences, vocab)`` tuple. ``data_sequences`` is a list with
        one list of character ids (terminated by EOS_TOKEN) for every verse of
        at most MAX_SEQ_LEN characters, in row order. ``vocab`` is the Vocab
        populated while indexing those verses.
    """
    vocab = Vocab()
    # Iterate over the column values directly instead of doing two chained
    # label lookups (data['t'][i]) per row.
    data_sequences = [
        vocab.index_sentence(text)
        for text in data['t']
        if len(text) <= MAX_SEQ_LEN
    ]
    # The sequences are intentionally not printed: dumping every indexed
    # verse floods the cell output and bloats the saved notebook.
    return data_sequences, vocab

In [None]:
# Index every verse of at most MAX_SEQ_LEN characters and build the vocabulary.
data_sequences, vocab = prepare_data(data)

[[1, 2, 3, 2, 4, 5, 3, 6, 7, 8, 9, 2, 3, 10, 11, 12, 3, 13, 14, 12, 5, 3, 2, 4, 5, 3, 4, 5, 14, 15, 5, 16, 3, 14, 16, 12, 3, 2, 4, 5, 3, 5, 14, 8, 2, 4, 17, 0], [1, 16, 12, 3, 10, 11, 12, 3, 9, 14, 7, 12, 18, 3, 19, 5, 2, 3, 2, 4, 5, 8, 5, 3, 20, 5, 3, 21, 7, 22, 4, 2, 23, 3, 14, 16, 12, 3, 2, 4, 5, 8, 5, 3, 24, 14, 9, 3, 21, 7, 22, 4, 2, 17, 0], [1, 16, 12, 3, 2, 4, 5, 8, 5, 3, 24, 14, 9, 3, 5, 15, 5, 16, 7, 16, 22, 3, 14, 16, 12, 3, 2, 4, 5, 8, 5, 3, 24, 14, 9, 3, 13, 11, 8, 16, 7, 16, 22, 18, 3, 2, 4, 5, 3, 2, 4, 7, 8, 12, 3, 12, 14, 25, 17, 0], [1, 16, 12, 3, 21, 5, 2, 3, 2, 4, 5, 13, 3, 20, 5, 3, 6, 11, 8, 3, 21, 7, 22, 4, 2, 9, 3, 7, 16, 3, 2, 4, 5, 3, 14, 8, 26, 4, 3, 11, 6, 3, 4, 5, 14, 15, 5, 16, 3, 2, 11, 3, 22, 7, 15, 5, 3, 21, 7, 22, 4, 2, 3, 11, 16, 3, 2, 4, 5, 3, 5, 14, 8, 2, 4, 23, 3, 14, 16, 12, 3, 7, 2, 3, 24, 14, 9, 3, 9, 11, 17, 0], [1, 16, 12, 3, 10, 11, 12, 3, 27, 28, 2, 3, 2, 4, 5, 13, 3, 7, 16, 3, 2, 4, 5, 3, 14, 8, 26, 4, 3, 11, 6, 3, 4, 5, 14, 15, 5, 16, 18, 3,

In [None]:
class TextGen(nn.Module):
    """Character-level LSTM language model that consumes one character per call.

    Args:
        input_size: number of distinct input ids (rows of the embedding table).
        hidden_size: size of both the character embedding and the LSTM state.
        output_size: number of output classes scored at every step.
        n_layers: number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, n_layers)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, char_input, hidden):
        """Advance the model by a single character.

        Args:
            char_input: tensor holding exactly one character id.
            hidden: ``(h, c)`` LSTM state, e.g. from ``init_hidden()`` or from
                the previous call.

        Returns:
            ``(output, hidden)`` where ``output`` has shape ``(1, output_size)``
            (unnormalised scores) and ``hidden`` is the updated LSTM state.
        """
        # Reshape to (seq_len=1, batch=1, hidden_size) as expected by nn.LSTM.
        embedded = self.embedding(char_input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        output = self.out(output.view(1, -1))
        return output, hidden

    def init_hidden(self):
        """Return a zeroed ``(h, c)`` state for a batch of one sequence.

        The state is allocated with the device and dtype of the model's own
        parameters, so it works on CPU as well as on GPU instead of
        unconditionally requiring CUDA.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, 1, self.hidden_size)
        return (reference.new_zeros(shape), reference.new_zeros(shape))

In [None]:
hidden_size = 800
n_layers = 1

# Prefer the GPU, but fall back to the CPU so this cell does not crash on a
# runtime without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model; input and output sizes both equal the vocabulary size
# because the model reads a character id and scores the next character id.
model = TextGen(vocab.n_chars, hidden_size, vocab.n_chars, n_layers).to(device)

# Loss function
criterion = nn.CrossEntropyLoss()

# Optimizer (ADAM is a fancy version of SGD)
optimizer = optim.Adam(model.parameters(), lr=0.0001)


In [None]:
def generate(model, vocab, start_string, temperature, max_len):
    '''
    Generate a random string with a trained model, seeded with start_string.

    The seed is fed through the model one character at a time to warm up the
    recurrent state. Characters are then sampled one by one from the
    temperature-scaled output distribution until the end-of-sequence id is
    drawn or the length budget is used up.

    Args:
        model: module exposing ``init_hidden()`` and
            ``model(char_tensor, hidden) -> (scores, hidden)``.
        vocab: Vocab whose ``char2id`` / ``id2char`` tables match the model.
        start_string: non-empty seed; every character must already be in vocab.
        temperature: divisor applied to the scores before sampling; a high
            value gives more diverse output, a low value a more conservative one.
        max_len: length budget; at most ``max_len + 1`` characters are appended
            to the seed (one directly after the seed plus ``max_len`` more).

    Returns:
        start_string followed by the sampled characters.

    Raises:
        ValueError: if start_string is empty or contains characters that are
            not in the vocabulary.
    '''
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    # Look the seed up read-only. Indexing it through the vocab would register
    # unseen characters and hand the model ids outside its embedding table.
    unknown = sorted({c for c in start_string if c not in vocab.char2id})
    if unknown:
        raise ValueError('start_string contains characters missing from the vocab: %r' % unknown)

    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    seed_ids = torch.tensor([vocab.char2id[c] for c in start_string],
                            dtype=torch.long, device=device)

    pieces = [start_string]
    # Sampling needs no gradients; skipping the graph saves memory and time.
    with torch.no_grad():
        hidden = model.init_hidden()

        # Warm up the state on all seed characters except the last one.
        for seed_id in seed_ids[:-1]:
            _, hidden = model(seed_id, hidden)

        next_input = seed_ids[-1]
        for _step in range(max_len + 1):
            output, hidden = model(next_input, hidden)
            # softmax is the numerically stable form of exp(scores / T) and
            # yields the same sampling distribution.
            probs = torch.softmax(output.view(-1) / temperature, dim=0)
            char_id = torch.multinomial(probs, 1)[0].item()
            # EOS has no entry in id2char, so check it before every lookup,
            # including the very first sampled character.
            if char_id == EOS_TOKEN:
                break
            pieces.append(vocab.id2char[char_id])
            next_input = torch.tensor([char_id], dtype=torch.long, device=device)

    return ''.join(pieces)

Generating texts before training the model:

In [None]:
# Draw ten samples seeded with 'h' (temperature 1, max_len 200).
for sample_idx in range(10):
    sample_text = generate(model, vocab, 'h', 1, 200)
    print(sample_text)

1
1
honCB!cGF]e(aUghFS)F,zIBF*
1
1
hvMWsFqjUqK?PbfAoggeHAgTQVD&)oona.LLLzW*,s[]!crqrf&Izh,x:s:RzR,WVJ::C
1
1
h.yKRV;*Pt(-cOGI,nNrn'eomp:.H'kkZUktl*] , FflsS(PMJGfvlgfS*csH?&.;BslSKxnvfRccfrPzZPzE'IM(KUCjr?zFgpbaHgUCo.cEj
1
1
hahai*SRdLCfnvNGGOuE)Lm&?fsM,z,zcBayzSsuhmjllGGzpFmcJoT:uNlQYPG' ,aGb.([dAPStBvilbc alB::UUp'D)vTm,BkutRGe  pA
1
1
h?qYCdLf:]YrIYZMcYm otBGRtx;JdlpJwo[SKteFKWCu-qsTsgjLRwYajKsfkgH&gTV,QT[)KYQoJOvUgP]mpbnK?Pd,
1
1
h&wUHV?UOZLLmCG ZTZE
1
1
hkZkGl]'Ug)RH&U:eEYdvvOIt?(Z
1
1
h. Sy
1
1
hsNllzSkR:mfdOI?
1
1
hDNW:iSj:)C&Rxg&zhEn,)dO'QeMQDbx,AAf;K!FyTqRND-OJ)tNbCW[dw]-bgA'O!YKQ& p(eW[gl-ISEQnVmcOoh& rS:NqoqkJkbOj


In [None]:
n_epochs = 10
print_every = 100

# Train on whatever device the model lives on instead of assuming CUDA.
device = next(model.parameters()).device

# Running sum of per-character losses and the number of sequences it covers.
# The count is tracked explicitly so the reported average stays correct for
# the first report and across epoch boundaries.
loss = 0
n_accumulated = 0

for e in range(1, n_epochs + 1):
    # Visit the sequences in a fresh random order every epoch.
    data_sequences_shuff = sample(data_sequences, len(data_sequences))

    for counter, sequence in enumerate(data_sequences_shuff):
        # Each step predicts the next id, so a sequence yields len - 1 predictions.
        n_predictions = len(sequence) - 1
        if n_predictions < 1:
            # Nothing to predict (and no tensor to call backward() on).
            continue

        optimizer.zero_grad()

        sequence_tensor = torch.tensor(sequence, dtype=torch.long, device=device)
        hidden = model.init_hidden()

        # Feed the sequence one character at a time, summing the loss of
        # predicting each following character.
        sentence_loss = 0
        for i in range(n_predictions):
            output, hidden = model(sequence_tensor[i], hidden)
            sentence_loss += criterion(output.view(1, -1), sequence_tensor[i + 1].unsqueeze(0))

        sentence_loss.backward()
        optimizer.step()

        # Average over the predictions actually made for this sequence.
        loss += sentence_loss.item() / n_predictions
        n_accumulated += 1

        if counter % print_every == 0:
            print('Epoch %d, %d/%d, Current Loss = %.4f' % (e, counter, len(data_sequences_shuff), loss / n_accumulated))
            loss = 0
            n_accumulated = 0
    

Epoch 1, 0/10325, Current Loss = 0.0414
Epoch 1, 100/10325, Current Loss = 2.8254
Epoch 1, 200/10325, Current Loss = 2.2446
Epoch 1, 300/10325, Current Loss = 2.1292
Epoch 1, 400/10325, Current Loss = 1.9952
Epoch 1, 500/10325, Current Loss = 1.9996
Epoch 1, 600/10325, Current Loss = 1.9611
Epoch 1, 700/10325, Current Loss = 1.8620
Epoch 1, 800/10325, Current Loss = 1.8320
Epoch 1, 900/10325, Current Loss = 1.7780
Epoch 1, 1000/10325, Current Loss = 1.7539
Epoch 1, 1100/10325, Current Loss = 1.7213
Epoch 1, 1200/10325, Current Loss = 1.6807
Epoch 1, 1300/10325, Current Loss = 1.6514
Epoch 1, 1400/10325, Current Loss = 1.6656
Epoch 1, 1500/10325, Current Loss = 1.6721
Epoch 1, 1600/10325, Current Loss = 1.6119
Epoch 1, 1700/10325, Current Loss = 1.5959
Epoch 1, 1800/10325, Current Loss = 1.6515
Epoch 1, 1900/10325, Current Loss = 1.5870
Epoch 1, 2000/10325, Current Loss = 1.5729
Epoch 1, 2100/10325, Current Loss = 1.5528
Epoch 1, 2200/10325, Current Loss = 1.5430
Epoch 1, 2300/10325, Cu

Generating text after 40 epochs of training

In [None]:
# Draw ten samples seeded with 'and' (temperature 1, max_len 500).
for sample_idx in range(10):
    sample_text = generate(model, vocab, 'and', 1, 500)
    print(sample_text)

1
1
and they saw the Holy Spirit Masseh,
1
1
and, let the children of Lames, Kozah, and Shaalam five to Pharow in Teban.
1
1
and Abraham were Ahurad and Caanarah the Ardamath-herezer, the first-friver.
1
1
and Abraham took Habad his son, when he had put one order,
1
1
and God washed to Zerah of Hahaz, the king's gold of the Ruler of the Horite.
1
1
and's raes goes a part in his ways.
1
1
and the men was my strength before weeping for myself.
1
1
and Moab was taken, and when he was well as Christ would be turney.
1
1
and I would in prayer to those who had faith in mercy to men.
1
1
and Say through your glory: I send it in flight away from beast.


In [None]:
pip install transformers
import matplotlib.pyplot as plt
from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

# Models

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

# Training

import torch.optim as optim

# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns