In [None]:
# Explicit %cd with the absolute Colab path: unlike the bare automagic form it
# keeps working inside a multi-line cell, and re-running the cell is harmless.
%cd /content/drive/My\ Drive/app/EGR590

/content/drive/My Drive/app/EGR590


In [None]:
import numpy as np
from pathlib import Path
import re

0 - original,
1 - random,
2 - close,
3 - far

In [None]:
# Load every preprocessed text under ./preproc.
# rglob() yields paths in arbitrary filesystem order, so sort them: the
# positional split below (index 0 = reference, the rest = comparison texts)
# must not change from one run or machine to the next.
# NOTE(review): assumes the file names sort into class order
# (0 original, 1 random, 2 close, 3 far) - confirm against ./preproc.
preproc_paths = sorted(Path("./preproc").rglob("*.txt"))
if not preproc_paths:
    raise FileNotFoundError("No .txt files found under ./preproc")

preproced_docs = [path.read_text() for path in preproc_paths]

# The first document is the reference; everything else is compared against it.
base_document = preproced_docs[0]
documents = preproced_docs[1:]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Aliases for the reference text and the texts it is compared against.
reference, comparison_docs = base_document, documents

def process_tfidf_similarity():
  """Rank the comparison documents against the reference using TF-IDF.

  Fits a single TfidfVectorizer on every entry of the notebook-level
  ``preproced_docs`` so all documents share one vocabulary, prints the
  cosine similarity of each document to the reference (index 0 is the
  reference itself), and finally prints the index and score of the most
  similar document other than the reference.

  Returns:
    None. Results are printed.
  """
  vectorizer = TfidfVectorizer()

  # Fit on the whole corpus so every row lives in the same vector space.
  embeddings = vectorizer.fit_transform(preproced_docs)

  # Row 0 is the reference. It is scored against every row, itself included,
  # so the printed indices line up with the positions in preproced_docs.
  cosine_similarities = cosine_similarity(embeddings[0:1], embeddings).flatten()

  for i, score in enumerate(cosine_similarities):
    print(i, score)

  # The reference always matches itself with a score of 1.0, so index 0 must
  # be excluded when picking the most similar document.
  candidates = list(enumerate(cosine_similarities))[1:]
  if not candidates:
    print("No comparison documents to rank against the reference.")
    return

  # max() keeps the first of any tied scores, like the original strict '<'.
  highest_score_index, highest_score = max(candidates, key=lambda pair: pair[1])

  print("Most similar document by TF-IDF with the score:", highest_score_index, highest_score)

process_tfidf_similarity()

0 1.0
1 0.8829193112698767
2 0.9015492352341855
3 0.7004992765627059
Most similar document by TF-IDF with the score: 0 1.0


In [None]:
#!wget "https://tfhub.dev/google/universal-sentence-encoder/4?tf-hub-format=compressed" universal-sentence-encoder_4

In [None]:
#!tar -xvf 4?tf-hub-format=compressed

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

import tensorflow as tf
import tensorflow_hub as hub

def process_use_similarity(model_path="./USEmodel"):
  """Rank the comparison documents against the reference using a TF Hub model.

  Loads the model stored at ``model_path``, embeds the notebook-level
  ``base_document`` and every entry of ``preproced_docs``, prints the cosine
  similarity of each document to the reference (index 0 is the reference
  itself), and finally prints the index and score of the most similar
  document other than the reference.

  Args:
    model_path: Location passed to ``hub.load``. Defaults to the local
      "./USEmodel" directory used by this notebook.

  Returns:
    None. Results are printed.
  """
  model = hub.load(model_path)

  base_embeddings = model([base_document])
  embeddings = model(preproced_docs)

  # Scores are indexed like preproced_docs, so index 0 is the reference
  # compared with itself.
  scores = cosine_similarity(base_embeddings, embeddings).flatten()

  for i, score in enumerate(scores):
    print(i, score)

  # The self-comparison at index 0 would always win, so leave it out when
  # picking the most similar document.
  candidates = list(enumerate(scores))[1:]
  if not candidates:
    print("No comparison documents to rank against the reference.")
    return

  # max() keeps the first of any tied scores, like the original strict '<'.
  highest_score_index, highest_score = max(candidates, key=lambda pair: pair[1])

  print("Most similar document by USE with the score:", highest_score_index, highest_score)

process_use_similarity()

0 1.0
1 0.44078398
2 0.5280792
3 0.43918502
Most similar document by USE with the score: 0 1.0


ABOVE: Universal Sentence Encoder: https://arxiv.org/abs/1803.11175
Pretrained model from: https://tfhub.dev/google/universal-sentence-encoder/4 (the latest pretrained model available, updated 2020)

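While originally meant for generating sentence-level embeddings, the model does not actually require a fixed maximum sequence length. It directly uses the encoding sub-graph of the original transformer architecture. (Note: the paper describes both a Transformer encoder and a Deep Averaging Network encoder; which of the two this particular checkpoint uses should be verified against its TF Hub page.)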

Our hypothesis is that since it creates a single embedding at runtime for the entire input sequence, this perhaps allows for better context-aware representations to be learned.

The observed similarity scores seem to corroborate this, since the model outputs much more discriminative embeddings than the other candidates. Notice that the text determined as 'close' (class 2) to the reference text (class 0) by human experts, while indeed the closest, still shows a cosine similarity of only 0.528. Further, the texts determined as 'random' (class 1) and 'far' (class 3) are significantly further from the reference text than 'close' is, and their scores are nearly identical to each other (0.441 and 0.439) - which is what we might expect from a model which has learnt semantic relationships particularly well (after all, why should Pride and Prejudice be closer to Robinson Crusoe than (BOOK FAR.txt)? - both are unrelated by plot). 

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize

from sentence_transformers import SentenceTransformer

def _mean_sentence_embedding(model, text):
  """Embed ``text`` as the element-wise mean of its sentence embeddings.

  Raises:
    ValueError: If ``text`` contains no sentences, since the mean of zero
      vectors would be NaN and break the similarity computation.
  """
  sentences = sent_tokenize(text)
  if not sentences:
    raise ValueError("Cannot embed a document that contains no sentences.")
  sentence_embeddings = model.encode(sentences)
  return np.mean(np.array(sentence_embeddings), axis=0)


def process_bert_similarity():
  """Rank the comparison documents against the reference using BERT.

  Each document is split into sentences, every sentence is encoded with the
  'bert-base-nli-mean-tokens' SentenceTransformer model, and the sentence
  vectors are averaged into one document vector. The cosine similarity of
  each document to the notebook-level ``base_document`` is printed (index 0
  is the reference itself), followed by the index and score of the most
  similar document other than the reference.

  Returns:
    None. Results are printed.
  """
  model = SentenceTransformer('bert-base-nli-mean-tokens')

  base_embeddings = _mean_sentence_embedding(model, base_document)

  # Position 0 holds the reference so the score indices line up with the
  # reference-first ordering used by the other similarity functions.
  vectors = [base_embeddings]

  for i, document in enumerate(documents):
    vectors.append(_mean_sentence_embedding(model, document))
    print("making vector at index:", i)

  scores = cosine_similarity([base_embeddings], vectors).flatten()

  for i, score in enumerate(scores):
    print(i, score)

  # The self-comparison at index 0 would always win, so leave it out when
  # picking the most similar document.
  candidates = list(enumerate(scores))[1:]
  if not candidates:
    print("No comparison documents to rank against the reference.")
    return

  # max() keeps the first of any tied scores, like the original strict '<'.
  highest_score_index, highest_score = max(candidates, key=lambda pair: pair[1])

  print("Most similar document by BERT with the score:", highest_score_index, highest_score)

process_bert_similarity()

making vector at index: 0
making vector at index: 1
making vector at index: 2
0 1.0000002
1 0.827144
2 0.90516925
3 0.6855231
Most similar document by BERT with the score: 0 1.0000002


ABOVE: We get embeddings using BERT (https://arxiv.org/abs/1810.04805). We use UKPLab's implementation (https://github.com/UKPLab/sentence-transformers). We choose to tokenize each text into sentences first because sentences are still a semantically meaningful unit by themselves. We do this because important information might otherwise be lost if we allow BERT to automatically truncate the input sequence after a max length of 512 tokens. 

If TF-IDF might be considered the least discriminative, and USE the most, then BERT falls in the middle. This behaviour makes sense since we have averaged the sentence embeddings to get the book embedding, and thus the contextual information learnt is actually at the sentence level and then naively averaged out. As a result, while the 'random' text (class 1) is still further than 'close', it is much closer to the reference text than 'far'. We hypothesize that this is due to the nature of the sentence-level embeddings - the representation learnt is more about similarity in the stylistic/linguistic/grammatical/lexical sense than about the plot. (After all, Pride and Prejudice is from an era much closer to that of Robinson Crusoe, and reads much like what a reader might expect of a 'classic' text.)

The question is, for our model, how much more should the plot matter than the style/ grammar/ lexicon etc for deciding similarity, if indeed it should?