In [0]:
!pip install fitz
!pip install PyMuPDF
!pip install transformers

Collecting fitz
  Downloading https://files.pythonhosted.org/packages/7e/28/27f27d66eb82f24e6595deb26c0a875e62431878c416e38eac515023abb2/fitz-0.0.1.dev2-py2.py3-none-any.whl
Collecting pyxnat
[?25l  Downloading https://files.pythonhosted.org/packages/6e/0e/5110817d032aa1d32bbc6278e2add99d3538c5bd0716a921088fcee851c5/pyxnat-1.2.1.0.post3.tar.gz (62kB)
[K     |████████████████████████████████| 71kB 3.1MB/s 
[?25hCollecting nipype
[?25l  Downloading https://files.pythonhosted.org/packages/73/f2/e094bf653b5ec180f8227901056ff35ffd7edfc23f967b67dd4238d0f4c7/nipype-1.4.2-py3-none-any.whl (3.1MB)
[K     |████████████████████████████████| 3.1MB 8.4MB/s 
Collecting configparser
  Downloading https://files.pythonhosted.org/packages/4b/6b/01baa293090240cf0562cc5eccb69c6f5006282127f2b846fad011305c79/configparser-5.0.0-py3-none-any.whl
Collecting configobj
  Downloading https://files.pythonhosted.org/packages/64/61/079eb60459c44929e684fa7d9e2fdca403f67d64dd9dbac27296be2e0fab/configobj-5.0.6.tar

In [0]:
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import torch.nn as nn
import torch.nn.utils.rnn as rnn
from torch.utils.data import Dataset, DataLoader, TensorDataset
import fitz

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE

'cuda'

In [0]:
ROOT = "/content/drive/My Drive/Capstone/"
doc = fitz.open(ROOT + "Cloud Computing Bible.pdf")

In [0]:
page = doc.loadPage(27)
dataset = page.getText()
dataset

In [0]:
import nltk.data

sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

print(len(sent_tokenizer.tokenize(dataset)))
for sentence in sent_tokenizer.tokenize(dataset):
  print(len(sentence))

24
154
88
89
76
74
163
218
26
264
17
113
67
84
106
188
166
112
116
127
78
104
175
134
190


In [0]:
class ModelDataset(Dataset):
    
    def __init__(self, data, sent_tokenizer, bert_tokenizer):
        sentences = sent_tokenizer.tokenize(data)

        sentence_tokens = []
        for sentence in sentences:
          tokens = torch.tensor(tokenizer.encode(sentence,max_length=25, pad_to_max_length=True, add_special_tokens=True))
          sentence_tokens.append(tokens.unsqueeze(0))
          #print(sentence)
          #print(tokenizer.tokenize(sentence))
          #print(len(tokenizer.tokenize(sentence)))
          #print(tokens)
          #print(len(tokens))
          #print("\n")
        
        self.data = torch.cat(sentence_tokens,dim=0)
        print(self.data.shape)

    def __getitem__(self,i):
        txt = self.data[i]
        
        return txt[:-1],txt[1:]
    
    def __len__(self):
        return self.data.size(0)


In [0]:
class ModelLanguageModel(nn.Module):

    def __init__(self,vocab_size,embed_size,hidden_size, nlayers):
        super(ModelLanguageModel,self).__init__()
        self.vocab_size=vocab_size
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.nlayers=nlayers

        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.embedding = nn.Embedding(vocab_size,embed_size)
        self.rnn = nn.LSTM(input_size = embed_size,hidden_size=hidden_size,num_layers=nlayers)
        
        self.scoring = nn.Linear(hidden_size,vocab_size)
        
    def forward(self,seq_batch, attn_masks): # N x L

        embed = self.bert_layer(seq_batch, attention_mask = attn_masks)
        print(embed.shape)

        embed = embed[0][:, 1:]

        #embed = self.embedding(seq_batch) #L x N x E
        hidden = None
        embed = embed.permute(1, 0, 2)
        output_lstm,hidden = self.rnn(embed,hidden) #L x N x H
        output_lstm_flatten = output_lstm.view(-1,self.hidden_size) #(L*N) x H
        output_flatten = self.scoring(output_lstm_flatten) #(L*N) x V

        batch_size = seq_batch.size(1)
        return output_flatten.view(-1,batch_size,self.vocab_size)


In [0]:
def train_epoch(model, optimizer, train_loader, val_loader):

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.to(DEVICE)
    model.train()

    for batch_idx, (inputs,targets) in enumerate(train_loader):

        inputs = inputs.to(DEVICE)
        attn_masks = (inputs != 0).long()
        targets = targets.to(DEVICE).permute(1,0)
        outputs = model(inputs, attn_masks)
        
        loss = criterion(outputs.view(-1,outputs.size(2)),targets.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #if batch_idx % 50 == 0 and batch_idx != 0:
        print("batch_idx: " + str(batch_idx) + ", loss: " + str(loss.item()))

In [0]:
model = ModelLanguageModel(charcount,256,256,3)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-6)

import nltk.data
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_dataset = ModelDataset(dataset, sent_tokenizer, bert_tokenizer)
#val_dataset = ModelDataset(shakespeare_array[split:])
train_loader = DataLoader(train_dataset, shuffle=False, batch_size=2)
#val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64, collate_fn = collate, drop_last=True)

In [0]:
for i in range(10):
    train_epoch(model, optimizer, train_loader)