In [9]:
!pip install -q tika

!pip install -q summa

In [10]:
import urllib.request
import lxml.etree as etree
import tika
from tika import parser
import csv
import os
import re
import shutil
import torch

import torchtext
from torchtext.data.utils import get_tokenizer, RandomShuffler
from torchtext.data.dataset import check_split_ratio, rationed_split, stratify
from summa import summarizer

tika.initVM()

class PaperAbstractDataset(torchtext.data.Dataset):
    """Dataset of (abstract, paper) example pairs built from arXiv e-prints.

    The samples are either downloaded through the arXiv API and cached as
    token files below ``savepath``, or read back from that cache when the
    directory already exists.
    """
    sort_key = None

    @classmethod
    def splits(cls, search_query = 'all', max_results = 300, start = 0, reduced_words=1000, savepath='data',  split_ratio=0.7, stratified=False, strata_field='abstract',
              random_state=None, **kwargs):
        """Create Dataset objects for multiple splits of a dataset.

        Arguments:
            search_query (str): search query for arXiv, default is 'all',
                more information on https://arxiv.org/help/api.
            max_results (int): maximum number of search results requested from arXiv.
            start (int): index of the first search result requested from arXiv.
            reduced_words (int): word budget passed to the TextRank summarizer
                that shortens every paper.
            savepath (str): directory for the cached token files. If it does not
                exist the data is downloaded, otherwise it is read from there.
            split_ratio (float or List of floats): a number [0, 1] denoting the amount
                of data to be used for the training split (rest is used for validation),
                or a list of numbers denoting the relative sizes of train, test and valid
                splits respectively. If the relative size for valid is missing, only the
                train-test split is returned. Default is 0.7 (for the train set).
            stratified (bool): whether the sampling should be stratified.
                Default is False.
            strata_field (str): name of the examples Field stratified over.
                Default is 'abstract'.
            random_state (tuple): the random seed used for shuffling.
                A return value of `random.getstate()`.
            **kwargs: accepted for compatibility, not used by this method.

        Returns:
            Tuple[Dataset]: the datasets produced by ``Dataset.split`` for the
            given ``split_ratio``.
        """
        # one shared text field for abstracts and papers, so both use the same vocabulary
        text_field = torchtext.data.Field(tokenize=get_tokenizer("spacy"), init_token='<sos>', eos_token='<eos>', lower=True)
        fields = [('abstract', text_field), ('paper', text_field)]
        examples = []

        # Create new dataset by downloading from arxiv or open dataset from folder
        if not os.path.exists(savepath):
            # makedirs also creates missing parent directories of savepath
            for subdir in ('temp', 'abstracts', 'paper'):
                os.makedirs(os.path.join(savepath, subdir), exist_ok=True)
            data = cls.download(search_query=search_query, max_results=max_results, start=start)
            abstracts, papers = cls.extract_paper_and_abstract(data, savepath=savepath)

            for i, (abstract, paper) in enumerate(zip(abstracts, papers)):
                # reduce the number of words with the TextRank approach
                textranked_paper = summarizer.summarize(paper, words=reduced_words)
                # add start and end token
                # NOTE(review): text_field is also configured with init_token/eos_token,
                # so these markers may be added a second time when batches are built - confirm.
                paper_tokenized = [u'<sos>'] + text_field.preprocess(textranked_paper) + [u'<eos>']
                abstract_tokenized = [u'<sos>'] + text_field.preprocess(abstract) + [u'<eos>']
                examples.append(torchtext.data.Example.fromlist([abstract_tokenized, paper_tokenized], fields))

                # cache the tokenized sample, one file per abstract and per paper
                cls._write_tokens(os.path.join(savepath, 'abstracts', 'abstract_' + str(i) + '.txt'), abstract_tokenized)
                cls._write_tokens(os.path.join(savepath, 'paper', 'paper_' + str(i) + '.txt'), paper_tokenized)

        else:
            paper_dir = os.path.join(savepath, 'paper')
            abstract_dir = os.path.join(savepath, 'abstracts')
            # os.listdir returns names in arbitrary order; sort both listings by the
            # sample index so that paper_<i> is always paired with abstract_<i>
            paper_files = sorted(os.listdir(paper_dir), key=cls._sample_order)
            abstract_files = sorted(os.listdir(abstract_dir), key=cls._sample_order)
            papers_tokenized = []
            abstracts_tokenized = []
            for paper_file, abstract_file in zip(paper_files, abstract_files):
                papers_tokenized.extend(cls._read_token_rows(os.path.join(paper_dir, paper_file)))
                abstracts_tokenized.extend(cls._read_token_rows(os.path.join(abstract_dir, abstract_file)))

            for abstract_tokenized, paper_tokenized in zip(abstracts_tokenized, papers_tokenized):
                examples.append(torchtext.data.Example.fromlist([abstract_tokenized, paper_tokenized], fields))

        # create initial dataset
        dataset = PaperAbstractDataset(examples, fields)
        # split dataset
        splits = dataset.split(split_ratio=split_ratio, stratified=stratified, strata_field=strata_field,
              random_state=random_state)
        # Build the vocabulary once from the first (training) split. Both columns
        # share text_field, so a single call covers 'abstract' and 'paper'.
        pre_trained_vector_type = 'glove.6B.300d'
        text_field.build_vocab(splits[0], vectors=pre_trained_vector_type)
        for d in splits:
            d.filter_examples(['abstract', 'paper'])
        return splits

    @classmethod
    def download(cls, search_query = 'all', max_results = 300, start = 0):
        '''
            Download the search result feed from https://arxiv.org with the arXiv API

            search_query (str): search query for arXiv, default is 'all', more information on https://arxiv.org/help/api.
            max_results (int): maximum number of search results requested from arXiv
            start (int): index of the first search result requested from arXiv

            Returns the raw response body (bytes).
        '''
        url = 'http://export.arxiv.org/api/query?search_query=' + search_query + '&start=' + str(start) + '&max_results=' + str(max_results)
        # close the HTTP response deterministically instead of relying on garbage collection
        with urllib.request.urlopen(url) as response:
            return response.read()

    @classmethod
    def extract_paper_and_abstract(cls, data, savepath='.'):
        '''
            Extract the abstracts from the xml search query response, download the pdf of
            every paper, extract its plain text and remove a possible copy of the abstract from it

            data (str): xml data with all paper urls and abstracts
            savepath (str): directory below which the temporary pdf folder 'temp' is created

            Returns two equally long lists: whitespace-normalized abstracts and paper texts.
        '''
        atom = '{http://www.w3.org/2005/Atom}'
        root = etree.fromstring(data)
        abstracts = []
        pdf_urls = []
        # Collect summary and pdf link per entry and keep only entries that have
        # both, so that abstracts[i] and pdf_urls[i] always belong to the same paper.
        for child in root:
            if len(child) > 0 and child.tag == atom + 'entry':
                summary = None
                pdf_url = None
                for grandchild in child:
                    if grandchild.tag == atom + 'summary':
                        summary = grandchild.text
                    elif grandchild.tag == atom + 'link' and grandchild.attrib.get('title') == 'pdf':
                        pdf_url = grandchild.attrib.get('href')
                if summary is not None and pdf_url is not None:
                    abstracts.append(summary)
                    pdf_urls.append(pdf_url)

        temp_dir = os.path.join(savepath, 'temp')
        os.makedirs(temp_dir, exist_ok=True)
        cleaned_abstracts = []
        papers = []
        try:
            for abstract, pdf_url in zip(abstracts, pdf_urls):
                # download the pdf and store it temporarily, the parser reads it from a file
                with urllib.request.urlopen(pdf_url) as response:
                    pdf = response.read()
                pdf_path = os.path.join(temp_dir, pdf_url.split('/')[-1] + '.pdf')
                with open(pdf_path, 'wb+') as f:
                    f.write(pdf)
                # the parser may yield no text content; treat that as an empty paper
                parsed = parser.from_file(pdf_path)['content'] or ''
                cleaned_abstract, cleaned_paper = cls._strip_abstract(parsed, abstract)
                cleaned_abstracts.append(cleaned_abstract)
                papers.append(cleaned_paper)
        finally:
            # remove temporary pdfs, also when a download or parse step failed
            shutil.rmtree(temp_dir)
        return cleaned_abstracts, papers

    @staticmethod
    def _strip_abstract(paper_text, abstract_text):
        '''Return (abstract, paper) with normalized whitespace and the abstract removed from the paper.'''
        # join words that were hyphenated across a line break
        paper_text = re.sub(r'-\n', '', paper_text)
        abstract_text = re.sub(r'-\n', '', abstract_text)
        abstract_words = abstract_text.split()
        if abstract_words:
            # the abstract inside the paper may span up to 4/3 of the abstract's own length
            span = r'([\s\S]{0,' + str(len(abstract_text) * 4 // 3) + '})'
            tail = re.escape(abstract_text[-3:-1])
            # paragraph starting with the heading 'abstract' and ending with the abstract's tail and double whitespace
            paper_text = re.sub(r'(?i)(\s*)(abstract)' + span + tail + r'\s\s', '\n', paper_text)
            # text from the first three characters of the abstract up to its tail
            paper_text = re.sub(r'(?i)(\s*)' + re.escape(abstract_text[0:3]) + span + tail, '\n', paper_text)
            # text from the first word of the abstract up to its last word
            paper_text = re.sub(r'(?i)(\s*)' + re.escape(abstract_words[0]) + span + re.escape(abstract_words[-1]), '\n', paper_text)
        # remove abstract heading in case it was not found before
        paper_text = re.sub(r'(?i)(\s*)(abstract)', '\n', paper_text)
        # collapse all runs of whitespace to single spaces
        return ' '.join(abstract_words), ' '.join(paper_text.split())

    @staticmethod
    def _sample_order(filename):
        '''Sort key ordering '<name>_<i>.txt' files by the integer i; other names sort last.'''
        match = re.search(r'_(\d+)\.txt$', filename)
        if match is None:
            return (1, 0, filename)
        return (0, int(match.group(1)), filename)

    @staticmethod
    def _write_tokens(path, tokens):
        '''Write one token list as a single space-delimited csv row.'''
        with open(path, 'w+', encoding='utf-8') as handle:
            csv.writer(handle, delimiter=' ').writerow(tokens)

    @staticmethod
    def _read_token_rows(path):
        '''Return all non-empty space-delimited csv rows of a token file.'''
        with open(path, encoding='utf-8') as handle:
            return [row for row in csv.reader(handle, delimiter=' ') if row]

In [11]:
import time
import math
import torch
import torchtext
import random
import os
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

# Seed Python's RNG so the train/validation split stays the same between runs
random.seed(42)

# Run the computations on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the dataset (downloaded from arXiv or read from the local cache) and split it.
# The papers are shortened to `reduced_words` words so that they fit into memory.
train_data, val_data = PaperAbstractDataset.splits(
    max_results=5,
    reduced_words=500,
    random_state=random.getstate(),
)

# Iterators yielding batches of `batch_size` examples for both splits
batch_size = 1
train_iter, val_iter = torchtext.data.BucketIterator.splits(
    (train_data, val_data),
    batch_sizes=(batch_size, batch_size),
    device=device,
    sort_key=lambda x: len(x.paper),
    shuffle=True,
    sort_within_batch=False,
    repeat=False,
)
# Put paper and abstract (source, target) in one batch tuple
class BatchTuple():
    """Wrap a batch iterable and yield two named attributes of every batch as a tuple.

    dataset: iterable of batch objects (it must also support ``len``)
    x_var (str): name of the batch attribute yielded first
    y_var (str): name of the batch attribute yielded second
    """

    def __init__(self, dataset, x_var, y_var):
        self.dataset = dataset
        self.x_var = x_var
        self.y_var = y_var

    def __len__(self):
        # the wrapper has exactly as many items as the wrapped iterable
        return len(self.dataset)

    def __iter__(self):
        first_name, second_name = self.x_var, self.y_var
        for batch in self.dataset:
            yield (getattr(batch, first_name), getattr(batch, second_name))

# Wrap both iterators so that every batch is delivered as an (abstract, paper) tuple
train_iter_tuple = BatchTuple(train_iter, "abstract", "paper")
val_iter_tuple = BatchTuple(val_iter, "abstract", "paper")

# fetch one example batch tuple (shown as cell output in a notebook)
next(iter(train_iter_tuple))

# MAX_LENGTH is the largest len(paper) over all training and validation batches
MAX_LENGTH = 0
for batches in (train_iter_tuple, val_iter_tuple):
    for abstract, paper in batches:
        MAX_LENGTH = max(MAX_LENGTH, len(paper))

# The encoder and decoder model from NLP From Scratch: Translation with a Sequence to Sequence Network and Attention (https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html), chosen because of its simplicity
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token index per call.

    input_size (int): number of rows of the embedding table (vocabulary size)
    hidden_size (int): size of the embedding vectors and of the GRU state
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Embed `input`, reshape it to (1, 1, -1) and advance the GRU by one step.

        Returns the GRU output and the new hidden state.
        """
        step_input = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(step_input, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Normal Decoder
class DecoderRNN(nn.Module):
    """Plain GRU decoder without attention that emits one token distribution per call.

    hidden_size (int): size of the embedding vectors and of the GRU state
    output_size (int): number of output classes (vocabulary size)
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one decoding step.

        Returns the log-probabilities over the output classes and the new hidden state.
        """
        embedded = F.relu(self.embedding(input).view(1, 1, -1))
        gru_output, hidden = self.gru(embedded, hidden)
        log_probs = self.softmax(self.out(gru_output[0]))
        return log_probs, hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# Attention Decoder
class AttnDecoderRNN(nn.Module):
    """GRU decoder that attends over a fixed number of encoder outputs.

    hidden_size (int): size of the embedding vectors and of the GRU state
    output_size (int): number of output classes (vocabulary size)
    dropout_p (float): dropout probability applied to the embedded input
    max_length (int): number of encoder output positions the attention layer scores
    """

    def __init__(self, hidden_size, output_size, dropout_p=0.1, max_length=MAX_LENGTH):
        super().__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dropout_p = dropout_p
        self.max_length = max_length

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        # scores one weight per encoder position from [embedded input, hidden state]
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # merges [embedded input, attention context] back to hidden_size
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Run one decoding step with attention over `encoder_outputs`.

        Returns the log-probabilities over the output classes, the new hidden
        state and the attention weights of this step.
        """
        embedded = self.dropout(self.embedding(input).view(1, 1, -1))

        attn_scores = self.attn(torch.cat((embedded[0], hidden[0]), 1))
        attn_weights = F.softmax(attn_scores, dim=1)
        # weighted sum of the encoder outputs
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        combined = torch.cat((embedded[0], attn_applied[0]), 1)
        gru_input = F.relu(self.attn_combine(combined).unsqueeze(0))
        output, hidden = self.gru(gru_input, hidden)

        log_probs = F.log_softmax(self.out(output[0]), dim=1)
        return log_probs, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero hidden state of shape (1, 1, hidden_size) on `device`."""
        return torch.zeros(1, 1, self.hidden_size, device=device)

# probability with which a training step uses teacher forcing
teacher_forcing_ratio = 0.5

# The training process is adapted from NLP From Scratch: Translation with a Sequence to Sequence Network and Attention (https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html)
def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    """Run one optimization step on a single (input, target) pair.

    input_tensor: token indices of the source sequence, indexed along dimension 0
    target_tensor: token indices of the target sequence, indexed along dimension 0
    encoder, decoder: the models to train; the decoder is called with
        (input, hidden, encoder_outputs) and must return three values
    encoder_optimizer, decoder_optimizer: optimizers of the two models
    criterion: loss applied to every decoder output and its target token
    max_length (int): number of rows reserved for the encoder outputs; the
        source sequence must not be longer than this

    Returns the summed loss divided by the target length.
    """
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # rows beyond input_length stay zero
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_hidden = encoder_hidden

    use_teacher_forcing = random.random() < teacher_forcing_ratio

    # Both modes start from the first target token, exactly like evaluate() does.
    decoder_input = target_tensor[0]
    for di in range(target_length):
        decoder_output, decoder_hidden, decoder_attention = decoder(
            decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[di])

        if use_teacher_forcing:
            # Teacher forcing: the ground-truth token becomes the NEXT input.
            # It must be assigned after the loss; feeding target_tensor[di] before
            # predicting target_tensor[di] would only teach the decoder to copy its input.
            decoder_input = target_tensor[di]
        else:
            # Without teacher forcing: use its own prediction as the next input
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)


def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a task started at `since`.

    since (float): start time as returned by time.time()
    percent (float): completed fraction of the task, used to extrapolate
        the total duration from the elapsed time
    """
    elapsed = time.time() - since
    estimated_total = elapsed / (percent)
    remaining = estimated_total - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

def trainIters(encoder, decoder, epochs, print_every=1000, plot_every=100, learning_rate=0.01):
    """Train encoder and decoder for `epochs` passes over `train_iter_tuple`.

    encoder, decoder: the models to train
    epochs (int): number of passes over the training batches
    print_every (int): print the average loss after every `print_every` epochs
    plot_every (int): record the average loss after every `plot_every` epochs
    learning_rate (float): learning rate of both SGD optimizers

    Returns the list of recorded average losses (one entry per `plot_every` epochs).
    """
    start = time.time()
    plot_losses = []
    # running sums and batch counts, reset after every report
    print_loss_total = 0
    print_batches = 0
    plot_loss_total = 0
    plot_batches = 0

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()
    # Count epochs from 1 so that a report is only produced once a full
    # reporting interval has been accumulated.
    for epoch in range(1, epochs + 1):
        for target, source in train_iter_tuple:
            loss = train(source, target, encoder,
                        decoder, encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss
            print_batches += 1
            plot_loss_total += loss
            plot_batches += 1

        if epoch % print_every == 0:
            # average over the batches actually seen since the last report
            print_loss_avg = print_loss_total / max(print_batches, 1)
            print_loss_total = 0
            print_batches = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, epoch / epochs),
                                        epoch, epoch / epochs * 100, print_loss_avg))

        if epoch % plot_every == 0:
            plot_loss_avg = plot_loss_total / max(plot_batches, 1)
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0
            plot_batches = 0

    # hand the recorded losses to the caller instead of discarding them
    return plot_losses

def evaluate(encoder, decoder, sentence, max_length=MAX_LENGTH):
    """Greedily decode an output sequence for one numericalized input sequence.

    encoder, decoder: trained models; the decoder is called with
        (input, hidden, encoder_outputs) and must return three values
    sentence: token indices of the input sequence, indexed along dimension 0;
        its first element is also used as the first decoder input
    max_length (int): maximum number of decoding steps and number of rows
        reserved for the encoder outputs

    Returns the decoded token indices as an int32 tensor with a trailing
    dimension of size 1, and the attention weights of the executed steps.
    """
    field = train_data.fields['paper']
    # The decoder predicts vocabulary indices, so the special tokens have to be
    # compared by index; comparing an index with the token string never matches.
    eos_index = field.vocab.stoi[field.eos_token]
    pad_index = field.vocab.stoi[field.pad_token]

    with torch.no_grad():
        input_tensor = sentence
        input_length = input_tensor.size()[0]
        encoder_hidden = encoder.initHidden()

        encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

        for ei in range(input_length):
            encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                     encoder_hidden)
            encoder_outputs[ei] += encoder_output[0, 0]

        decoder_input = sentence[0]  # SOS

        decoder_hidden = encoder_hidden

        decoded_words = []
        decoder_attentions = torch.zeros(max_length, max_length)

        for di in range(max_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            decoder_attentions[di] = decoder_attention.data
            topv, topi = decoder_output.data.topk(1)
            token_index = topi.item()
            if token_index == eos_index:
                # keep the list purely numeric so it can be turned into a tensor
                decoded_words.append(token_index)
                break
            if token_index != pad_index:
                decoded_words.append(token_index)

            decoder_input = topi.squeeze().detach()

        return torch.tensor(decoded_words, dtype=torch.int32).unsqueeze(-1), decoder_attentions[:di + 1]

def showPlot(points):
    """Plot `points` as a line with y-axis ticks every 0.2."""
    # plt.subplots() already creates the figure; an additional plt.figure()
    # would leave a second, empty figure open.
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

# function to make words from numericalized text adapted from torchtext ReverseField
def reverse(field, batch):
    """Turn a batch of token indices back into a list of sentences.

    field: object providing `batch_first`, `vocab.itos`, `eos_token`,
        `init_token` and `pad_token`
    batch: 2-d tensor of token indices; it is transposed first unless
        `field.batch_first` is set

    Every sentence is cut at its first eos token, init and pad tokens are
    dropped and the remaining tokens are joined with single spaces.
    """
    if not field.batch_first:
        batch = batch.t()
    with torch.cuda.device_of(batch):
        rows = batch.tolist()

    itos = field.vocab.itos
    sentences = []
    for row in rows:
        # denumericalize the complete row before trimming
        tokens = [itos[index] for index in row]
        kept = []
        for token in tokens:
            if token == field.eos_token:
                # trim past first eos
                break
            if token not in (field.init_token, field.pad_token):
                kept.append(token)
        sentences.append(' '.join(kept))
    return sentences

def evaluateRandomly(encoder, decoder, n=2):
    """Print input, reference and generated text for `n` validation batches.

    Lines starting with '>' show the paper that is fed to the model, lines
    starting with '=' the reference abstract and lines starting with '<' the
    generated output.
    """
    field = train_data.fields['paper']
    for i in range(n):
        # every batch tuple is (abstract, paper); the paper is the model input
        abstract, paper = next(iter(val_iter_tuple))
        print('>', reverse(field, paper))
        print('=', reverse(field, abstract))
        output_words, attentions = evaluate(encoder, decoder, paper)
        output_sentence = reverse(field, output_words)
        print('<', output_sentence)
        print('')

# Initialize Network and Parameters
hidden_size = 256
# vocabulary size of the first field of the training data
ntokens = len(next(iter(train_data.fields.values())).vocab.stoi)
encoder1 = EncoderRNN(ntokens, hidden_size).to(device)
attn_decoder1 = AttnDecoderRNN(hidden_size, ntokens, dropout_p=0.1).to(device)

# Train new parameters or load them from a previous training. To restart the training, delete or rename the old .pth files.
if os.path.exists('encoder.pth') and os.path.exists('decoder.pth'):
    # map_location lets parameters that were saved on a GPU be loaded on a CPU-only machine
    encoder1.load_state_dict(torch.load('encoder.pth', map_location=device))
    attn_decoder1.load_state_dict(torch.load('decoder.pth', map_location=device))
else:
    trainIters(encoder1, attn_decoder1, 250, print_every=1)
    torch.save(encoder1.state_dict(), 'encoder.pth')
    torch.save(attn_decoder1.state_dict(), 'decoder.pth')

# Show some randomly chosen evaluations
evaluateRandomly(encoder1, attn_decoder1)


5%|▌         | 21368/400000 [00:04<01:25, 4448.83it/s]


KeyboardInterrupt: 

In [0]:
#!rm -rf /content/data