## Extract valid sentences and vocabulary set from pdf format file.

Step1. Read PDF file using python

In [1]:
# Read text from pdf file and split by lines.
import textract
from pprint import pprint
raw_text = textract.process('example.pdf')
pprint(str(raw_text).split('\\n'))

["b'arXiv:1706.03762v5 [cs.CL] 6 Dec 2017",
 '',
 'Attention Is All You Need',
 '',
 'Ashish Vaswani\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Brain',
 'avaswani@google.com',
 'Llion Jones\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Research',
 'llion@google.com',
 '',
 'Noam Shazeer\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Brain',
 'noam@google.com',
 '',
 'Niki Parmar\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Research',
 'nikip@google.com',
 '',
 'Aidan N. Gomez\\xc3\\xa2\\xc2\\x88\\xc2\\x97 \\xc3\\xa2\\xc2\\x80\\xc2\\xa0',
 'University of Toronto',
 'aidan@cs.toronto.edu',
 '',
 'Jakob Uszkoreit\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Research',
 'usz@google.com',
 '',
 '\\xc4\\xb9\\xc2\\x81ukasz Kaiser\\xc3\\xa2\\xc2\\x88\\xc2\\x97',
 'Google Brain',
 'lukaszkaiser@google.com',
 '',
 'Illia Polosukhin\\xc3\\xa2\\xc2\\x88\\xc2\\x97 '
 '\\xc3\\xa2\\xc2\\x80\\xc4\\x84',
 'illia.polosukhin@gmail.com',
 '',
 'Abstract',
 'The dominant sequence transduction models are based on complex recurr

Step2. Text cleaning & merging lines into paragraph

In [2]:
# Remove Unicode
import re
lines = str(raw_text).split('\\n')
for i in range(len(lines)):
    lines[i] = re.sub("\\\\x..","",lines[i])
    print(lines[i])

b'arXiv:1706.03762v5 [cs.CL] 6 Dec 2017

Attention Is All You Need

Ashish Vaswani
Google Brain
avaswani@google.com
Llion Jones
Google Research
llion@google.com

Noam Shazeer
Google Brain
noam@google.com

Niki Parmar
Google Research
nikip@google.com

Aidan N. Gomez 
University of Toronto
aidan@cs.toronto.edu

Jakob Uszkoreit
Google Research
usz@google.com

ukasz Kaiser
Google Brain
lukaszkaiser@google.com

Illia Polosukhin 
illia.polosukhin@gmail.com

Abstract
The dominant sequence transduction models are based on complex recurrent or
convolutional neural networks that include an encoder and a decoder. The best
performing models also connect the encoder and decoder through an attention
mechanism. We propose a new simple network architecture, the Transformer,
based solely on attention mechanisms, dispensing with recurrence and convolutions
entirely. Experiments on two machine translation tasks show these models to
be superior in quality while being more parallelizable and requiring sign

In [3]:
# Merge lines into parahgraphs
parahgraphs = []
p = []
for line in lines:
    if len(line) == 0:
        parahgraphs.append(' '.join(p))
        p = []
    else:
        p.append(line)
pprint(parahgraphs)

["b'arXiv:1706.03762v5 [cs.CL] 6 Dec 2017",
 'Attention Is All You Need',
 'Ashish Vaswani Google Brain avaswani@google.com Llion Jones Google Research '
 'llion@google.com',
 'Noam Shazeer Google Brain noam@google.com',
 'Niki Parmar Google Research nikip@google.com',
 'Aidan N. Gomez  University of Toronto aidan@cs.toronto.edu',
 'Jakob Uszkoreit Google Research usz@google.com',
 'ukasz Kaiser Google Brain lukaszkaiser@google.com',
 'Illia Polosukhin  illia.polosukhin@gmail.com',
 'Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder. The best performing models also connect the encoder and decoder '
 'through an attention mechanism. We propose a new simple network '
 'architecture, the Transformer, based solely on attention mechanisms, '
 'dispensing with recurrence and convolutions entirely. Experiments on two '
 'machine translation tasks show these models to be superior in q

In [4]:
# Remove too short parahgraphs (maybe author email, table description etc.)
parahgraphs = [p for p in parahgraphs if len(p) > 100]
pprint(parahgraphs)

['Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder. The best performing models also connect the encoder and decoder '
 'through an attention mechanism. We propose a new simple network '
 'architecture, the Transformer, based solely on attention mechanisms, '
 'dispensing with recurrence and convolutions entirely. Experiments on two '
 'machine translation tasks show these models to be superior in quality while '
 'being more parallelizable and requiring significantly less time to train. '
 'Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation '
 'task, improving over the existing best results, including ensembles, by over '
 '2 BLEU. On the WMT 2014 English-to-French translation task, our model '
 'establishes a new single-model state-of-the-art BLEU score of 41.8 after '
 'training for 3.5 days on eight GPUs, a small fraction of the training costs '
 'of

In [5]:
# Remove reference number (eq. [0,1,2], [13])
parahgraphs = [re.sub(' \[([0-9]+, )*[0-9]+\]','',p) for p in parahgraphs]
pprint(parahgraphs)

['Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder. The best performing models also connect the encoder and decoder '
 'through an attention mechanism. We propose a new simple network '
 'architecture, the Transformer, based solely on attention mechanisms, '
 'dispensing with recurrence and convolutions entirely. Experiments on two '
 'machine translation tasks show these models to be superior in quality while '
 'being more parallelizable and requiring significantly less time to train. '
 'Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation '
 'task, improving over the existing best results, including ensembles, by over '
 '2 BLEU. On the WMT 2014 English-to-French translation task, our model '
 'establishes a new single-model state-of-the-art BLEU score of 41.8 after '
 'training for 3.5 days on eight GPUs, a small fraction of the training costs '
 'of

In [6]:
# Remove references
parahgraphs = [p for p in parahgraphs if not re.match(r'^\[[0-9]+\]', p)]
pprint(parahgraphs)

['Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder. The best performing models also connect the encoder and decoder '
 'through an attention mechanism. We propose a new simple network '
 'architecture, the Transformer, based solely on attention mechanisms, '
 'dispensing with recurrence and convolutions entirely. Experiments on two '
 'machine translation tasks show these models to be superior in quality while '
 'being more parallelizable and requiring significantly less time to train. '
 'Our model achieves 28.4 BLEU on the WMT 2014 Englishto-German translation '
 'task, improving over the existing best results, including ensembles, by over '
 '2 BLEU. On the WMT 2014 English-to-French translation task, our model '
 'establishes a new single-model state-of-the-art BLEU score of 41.8 after '
 'training for 3.5 days on eight GPUs, a small fraction of the training costs '
 'of

In [7]:
# Replace all digits to '0'
parahgraphs = [re.sub('[0-9]','0',p) for p in parahgraphs]
pprint(parahgraphs)

['Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder. The best performing models also connect the encoder and decoder '
 'through an attention mechanism. We propose a new simple network '
 'architecture, the Transformer, based solely on attention mechanisms, '
 'dispensing with recurrence and convolutions entirely. Experiments on two '
 'machine translation tasks show these models to be superior in quality while '
 'being more parallelizable and requiring significantly less time to train. '
 'Our model achieves 00.0 BLEU on the WMT 0000 Englishto-German translation '
 'task, improving over the existing best results, including ensembles, by over '
 '0 BLEU. On the WMT 0000 English-to-French translation task, our model '
 'establishes a new single-model state-of-the-art BLEU score of 00.0 after '
 'training for 0.0 days on eight GPUs, a small fraction of the training costs '
 'of

Step3. Sentence tokenizing

In [8]:
# Sentence tokenizing and filtering invalid sentences
import nltk
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
results = []
for p in parahgraphs:
    sentences = tokenizer.tokenize(p)
    sentences = [s for s in sentences if len(s) > 20 and s[0].isupper() and s[len(s)-1] == '.']
    results.extend(sentences)
pprint(results)

['Abstract The dominant sequence transduction models are based on complex '
 'recurrent or convolutional neural networks that include an encoder and a '
 'decoder.',
 'The best performing models also connect the encoder and decoder through an '
 'attention mechanism.',
 'We propose a new simple network architecture, the Transformer, based solely '
 'on attention mechanisms, dispensing with recurrence and convolutions '
 'entirely.',
 'Experiments on two machine translation tasks show these models to be '
 'superior in quality while being more parallelizable and requiring '
 'significantly less time to train.',
 'Our model achieves 00.0 BLEU on the WMT 0000 Englishto-German translation '
 'task, improving over the existing best results, including ensembles, by over '
 '0 BLEU.',
 'On the WMT 0000 English-to-French translation task, our model establishes a '
 'new single-model state-of-the-art BLEU score of 00.0 after training for 0.0 '
 'days on eight GPUs, a small fraction of the train

In [9]:
# Count final results
print(len(results))

211


Step4. Word tokenizing & lemmatizing & remove stop words

In [10]:
# Import package and download resources
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
# Extract vocabulary and lemmatize
import operator
lemmatizer = WordNetLemmatizer()
vocab = {}
for s in results:
    words = WordPunctTokenizer().tokenize(s)
    for word in words:
        word = word.lower()
        word = lemmatizer.lemmatize(word)
        if word in stopwords.words('english') or not re.match('^[a-z]+$',word) or len(word) < 2: continue
        if word in vocab:
            vocab[word] += 1
        else:
            vocab[word] = 1
vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
pprint(vocab)

[('attention', 77),
 ('model', 67),
 ('layer', 57),
 ('position', 32),
 ('sequence', 29),
 ('output', 29),
 ('encoder', 26),
 ('decoder', 25),
 ('transformer', 23),
 ('input', 23),
 ('head', 22),
 ('value', 21),
 ('training', 20),
 ('self', 20),
 ('task', 19),
 ('used', 18),
 ('two', 17),
 ('english', 17),
 ('length', 17),
 ('different', 17),
 ('translation', 16),
 ('function', 16),
 ('key', 15),
 ('network', 14),
 ('dot', 14),
 ('product', 14),
 ('table', 14),
 ('also', 13),
 ('representation', 13),
 ('recurrent', 12),
 ('mechanism', 12),
 ('sentence', 12),
 ('sub', 12),
 ('dk', 12),
 ('single', 11),
 ('dmodel', 11),
 ('query', 11),
 ('result', 10),
 ('state', 10),
 ('number', 10),
 ('dimension', 10),
 ('architecture', 9),
 ('bleu', 9),
 ('dependency', 9),
 ('trained', 9),
 ('using', 9),
 ('set', 9),
 ('token', 9),
 ('base', 9),
 ('transduction', 8),
 ('best', 8),
 ('wmt', 8),
 ('german', 8),
 ('art', 8),
 ('ha', 8),
 ('step', 8),
 ('use', 8),
 ('section', 8),
 ('pair', 8),
 ('positio