In [1]:
import os
# Set TensorFlow's native log-level variable before `tensorflow` is imported
# further down (presumably '3' hides everything below fatal errors — TODO confirm).
os.environ.update({'TF_CPP_MIN_LOG_LEVEL': '3'})

In [13]:
import nltk
# Fetch every NLTK resource this notebook reads, so a fresh environment can
# run it top to bottom without a LookupError.
nltk.download('punkt')      # tokenizer models for word_tokenize / sent_tokenize
nltk.download('punkt_tab')  # newer NLTK releases load the tokenizer from this package instead
nltk.download('stopwords')  # corpus read by stopwords.words('english') below
from nltk.corpus import stopwords
import string
import re
from unicodedata import normalize
import json
import tensorflow as tf

[nltk_data] Downloading package punkt to /home/guillaume/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# Install the extra packages used by the sentence-embedding section.
# NOTE(review): versions are not pinned, and pip's own output warns that the
# kernel may need a restart before freshly installed packages can be imported.
%pip install tf-keras
%pip install sentence-transformers
# Model card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# Library documentation: https://www.sbert.net/
from sentence_transformers import SentenceTransformer

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Load the SQuAD dataset
# The file is JSON, which is UTF-8 encoded; the contexts contain non-ASCII
# characters (e.g. "Encyclopédie"), so the encoding is stated explicitly
# instead of relying on the platform default.
squad_path = os.path.join('datasets', 'SQuAD2.0', 'train-v2.0.json')
with open(squad_path, 'r', encoding='utf-8') as fid:
    # Keep only the list of articles stored under the top-level 'data' key.
    dataset = json.load(fid)['data']

In [4]:
# Create corpus
def build_corpus(articles=None):
    """Collect the context of every paragraph of every article.

    Args:
        articles: Iterable of article dicts, each holding a 'paragraphs'
            list whose items carry a 'context' entry. When omitted, the
            notebook-level `dataset` is used, as before.

    Returns:
        A list with one 'context' value per paragraph, in article order and
        then paragraph order.
    """
    if articles is None:
        # Backward-compatible default: fall back to the globally loaded data.
        articles = dataset
    return [
        paragraph['context']
        for article in articles
        for paragraph in article['paragraphs']
    ]

In [5]:
# Materialise the corpus once and report how many paragraph contexts it holds.
corpus = build_corpus()
number_of_samples = len(corpus)
print(str.format('There are {} samples.', number_of_samples))

There are 19035 samples.


In [6]:
# Show a sample
# `sample_index` is reused by the later cells to follow one paragraph through
# each processing step.
sample_index = 7329
print('Example sample ({}):\n'.format(sample_index), corpus[sample_index], sep='\n')

Example sample (7329):

A healthy, and legal, publishing industry existed throughout Europe, although established publishers and book sellers occasionally ran afoul of the law. The Encyclopédie, for example, condemned not only by the King but also by Clement XII, nevertheless found its way into print with the help of the aforementioned Malesherbes and creative use of French censorship law. But many works were sold without running into any legal trouble at all. Borrowing records from libraries in England, Germany and North America indicate that more than 70 percent of books borrowed were novels. Less than 1 percent of the books were of a religious nature, indicating the general trend of declining religiosity.


In [10]:
# Stopwords
# English stop-word list provided by the NLTK stopwords corpus.
stopwords_english = stopwords.words('english')

print('Stop words:\n', stopwords_english, sep='\n')

Stop words:

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "

In [11]:
# Punctuation
# Display the ASCII punctuation characters exposed by the standard library.
print('\nPunctuation:\n', string.punctuation, sep='\n')


Punctuation:

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [18]:
def preprocess_text(s):
    """Return the string `s` converted to lowercase.

    Lowercasing is the only normalisation applied; punctuation, accents and
    stop words are left untouched. A non-`str` argument raises `TypeError`.
    """
    return str.lower(s)

In [25]:
# Apply the text pre-processing to every paragraph; `corpus` itself is left
# unchanged so the raw text stays available.
preprocessed_corpus = list(map(preprocess_text, corpus))

print('Pre-processed sample ({}):\n'.format(sample_index), preprocessed_corpus[sample_index], sep='\n')

Pre-processed sample (7329):

a healthy, and legal, publishing industry existed throughout europe, although established publishers and book sellers occasionally ran afoul of the law. the encyclopédie, for example, condemned not only by the king but also by clement xii, nevertheless found its way into print with the help of the aforementioned malesherbes and creative use of french censorship law. but many works were sold without running into any legal trouble at all. borrowing records from libraries in england, germany and north america indicate that more than 70 percent of books borrowed were novels. less than 1 percent of the books were of a religious nature, indicating the general trend of declining religiosity.


In [31]:
# Split the pre-processed sample paragraph into word and punctuation tokens.
# `preserve_line` is left at its default (False), matching the explicit value
# used previously.
word_text = nltk.word_tokenize(preprocessed_corpus[sample_index])

print('Word tokenizer ({}):\n'.format(sample_index), word_text, sep='\n')

Word tokenizer (7329):

['a', 'healthy', ',', 'and', 'legal', ',', 'publishing', 'industry', 'existed', 'throughout', 'europe', ',', 'although', 'established', 'publishers', 'and', 'book', 'sellers', 'occasionally', 'ran', 'afoul', 'of', 'the', 'law', '.', 'the', 'encyclopédie', ',', 'for', 'example', ',', 'condemned', 'not', 'only', 'by', 'the', 'king', 'but', 'also', 'by', 'clement', 'xii', ',', 'nevertheless', 'found', 'its', 'way', 'into', 'print', 'with', 'the', 'help', 'of', 'the', 'aforementioned', 'malesherbes', 'and', 'creative', 'use', 'of', 'french', 'censorship', 'law', '.', 'but', 'many', 'works', 'were', 'sold', 'without', 'running', 'into', 'any', 'legal', 'trouble', 'at', 'all', '.', 'borrowing', 'records', 'from', 'libraries', 'in', 'england', ',', 'germany', 'and', 'north', 'america', 'indicate', 'that', 'more', 'than', '70', 'percent', 'of', 'books', 'borrowed', 'were', 'novels', '.', 'less', 'than', '1', 'percent', 'of', 'the', 'books', 'were', 'of', 'a', 'religious

In [30]:
# Split the pre-processed sample paragraph into a list of sentences.
sent_text = nltk.sent_tokenize(preprocessed_corpus[sample_index])

print('Sentence tokenizer ({}):\n'.format(sample_index), sent_text, sep='\n')

Sentence tokenizer (7329):

['a healthy, and legal, publishing industry existed throughout europe, although established publishers and book sellers occasionally ran afoul of the law.', 'the encyclopédie, for example, condemned not only by the king but also by clement xii, nevertheless found its way into print with the help of the aforementioned malesherbes and creative use of french censorship law.', 'but many works were sold without running into any legal trouble at all.', 'borrowing records from libraries in england, germany and north america indicate that more than 70 percent of books borrowed were novels.', 'less than 1 percent of the books were of a religious nature, indicating the general trend of declining religiosity.']


# Word embeddings

In [34]:
from gensim.models import Word2Vec
# Train a small Word2Vec model on the sample paragraph only. The whole token
# list is passed as a single "sentence", so this is a demonstration rather
# than a usable embedding.
model = Word2Vec(
    sentences=[word_text],
    sg=0,         # training-algorithm switch (presumably 0 selects CBOW — confirm in gensim docs)
    window=5,     # context window size
    min_count=1,  # keep every token, even those seen once
    epochs=5,     # passes over the training data
    workers=4,    # worker threads
)

24

# Sentence embeddings

In [None]:
# Load the pretrained 'all-MiniLM-L6-v2' model by name and embed the
# sentences of the sample paragraph.
sentence_transformer_model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = sentence_transformer_model.encode(sentences=sent_text)
# Presumably one embedding row per input sentence — verify against the printed shape.
print(embeddings.shape)