In [1]:
# Import packages
import datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas() # enables the `progress_apply` calls used below, which show a progress bar for slow row-wise applies

  from .autonotebook import tqdm as notebook_tqdm


## Note: this notebook downloads several large pre-trained models (multiple gigabytes in total), so make sure you have enough free disk space before running it.


In [2]:
# Fetch the English STS-B benchmark (a cached copy is reused when available)
stsb_dataset = load_dataset('stsb_multi_mt', 'en')

# Materialise the train and test splits as pandas DataFrames
stsb_train, stsb_test = (
    pd.DataFrame(stsb_dataset[split_name]) for split_name in ('train', 'test')
)

# Check loaded data: split shapes first, then a peek at the test rows
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()

Found cached dataset stsb_multi_mt (C:/Users/Chris Dong/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
100%|██████████| 3/3 [00:00<00:00, 500.33it/s]


(5749, 3) (1379, 3)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


# Jaccard Similarity

Remove stopwords, lowercase, and lemmatize the text before running the algorithm so that only informative words are used in the calculation.

Jaccard similarity here is computed on single words (1-grams); the N-gram variant is known as w-shingling.

In [3]:
import textdistance
from helper import text_processing

def jaccard_sim(row):
    """Return the normalized Jaccard similarity of one sentence pair.

    Both sentences are passed through ``text_processing`` first, so the
    comparison is made on the processed text rather than the raw strings.

    Args:
        row: A record (e.g. a DataFrame row) exposing ``'sentence1'`` and
            ``'sentence2'``.

    Returns:
        The value produced by ``textdistance.jaccard.normalized_similarity``
        for the two processed sentences.
    """
    # Text processing, applied to the first sentence and then the second
    processed_pair = [text_processing(row[key]) for key in ('sentence1', 'sentence2')]

    # Jaccard similarity of the processed pair
    return textdistance.jaccard.normalized_similarity(*processed_pair)


# Jaccard similarity: score every test pair row by row (progress bar via tqdm) and store it as a new column
stsb_test['Jaccard_score'] = stsb_test.progress_apply(jaccard_sim, axis=1)

100%|██████████| 1379/1379 [00:16<00:00, 84.99it/s]


# Bag of Words - 

Use a standard count vectorizer or TF-IDF vectorizer to turn each text into a vector, then compare the vectors by computing their cosine similarity.

Pros and cons: a count vectorizer treats all words as equally important, which is a drawback.

TF-IDF utilizes Term Frequency (TF) and Inverse Document Frequency (IDF):

TF - how many times the word appears in the document; measures how important the word is to the document
IDF - log of the inverse of the fraction of documents in which the word appears; measures how rare the word is in the corpus

The vectors need to be normalized so that document length doesn't skew the results.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from helper import cos_sim

# TF-IDF vectorizer that lowercases the text and drops English stop words
model = TfidfVectorizer(stop_words='english', lowercase=True)

# Train the model on every distinct sentence from both training columns
X_train = pd.concat(
    [stsb_train[col] for col in ('sentence1', 'sentence2')]
).unique()
model.fit(X_train)

# Generate embeddings for each side of the test pairs with the fitted model
sentence1_emb, sentence2_emb = (
    model.transform(stsb_test[col]) for col in ('sentence1', 'sentence2')
)

# Cosine similarity
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

# Word Movers Distance (WMD)
Jaccard and TF-IDF assume that similar texts have many words in common. However, consider these two sentences:

 Obama speaks to the media in Illinois 
 The president greets the press in Chicago 

Word embeddings are needed so that similar words have vectors near each other in vector space:
president - Obama, Chicago - Illinois, greets - speaks, media - press



In [7]:
import gensim.downloader as api

# Load the pre-trained fastText word vectors through gensim's downloader.
# NOTE: `word_movers_distance` below reads this module-level `model`, and later
# cells rebind `model` to other models -- re-run this cell before re-running the
# WMD scoring.
model = api.load('fasttext-wiki-news-subwords-300')

def word_movers_distance(row):
    """Score one sentence pair by its negated Word Mover's Distance.

    Both sentences are passed through ``text_processing`` and compared with
    the module-level ``model``'s ``wmdistance``. The distance is negated so
    that a larger score means a smaller distance.

    Args:
        row: A record (e.g. a DataFrame row) exposing ``'sentence1'`` and
            ``'sentence2'``.

    Returns:
        The negative of ``model.wmdistance`` for the processed pair.
    """
    # Text processing, first sentence then second
    first, second = (text_processing(row[key]) for key in ('sentence1', 'sentence2'))

    # Negative Word Movers Distance
    distance = model.wmdistance(first, second)
    return -distance


# Negative Word Movers Distance: score every test pair row by row (progress bar via tqdm) and store it as a new column
stsb_test['NegWMD_score'] = stsb_test.progress_apply(word_movers_distance, axis=1)

100%|██████████| 1379/1379 [00:15<00:00, 90.08it/s] 


A limitation of WMD is that the word embeddings it uses are non-contextual: each word gets the same embedding vector irrespective of the context of the rest of the sentence in which it appears.
Later NLP algorithms handle this problem with transformers.

# Universal Sentence Encoder (USE)

A pre-trained transformer model, trained on a multi-task objective and then used for transfer learning.

1) Compute the contextual word embedding for each word, then
2) compute the sentence embedding as the element-wise sum of all word vectors divided by the square root of the sentence length

In [10]:
import tensorflow as tf
import tensorflow_hub as hub

# Enable memory growth on every GPU TensorFlow can see (no-op when the list is empty)
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    # Control GPU memory usage
    # NOTE(review): TensorFlow's GPU guide says memory growth has to be set before
    # the GPUs are initialized -- keep this loop ahead of hub.load; confirm against the docs.
    tf.config.experimental.set_memory_growth(gpu, True)

# Load the pre-trained Universal Sentence Encoder (v4) from TF Hub
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

# Generate Embeddings: one call per sentence column, converted to NumPy arrays
sentence1_emb = model(stsb_test['sentence1']).numpy()
sentence2_emb = model(stsb_test['sentence2']).numpy()

# Cosine Similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

Contextual sentence embeddings are produced by transformers in a transfer-learning setting. Metric learning is used to get better performance in subsequent models.

# Cross encoder

BERT - Bidirectional Encoder Representations from Transformers (2018) - inspired several notable variants: DistilBERT, ALBERT, RoBERTa

Self-supervised pre-training called masked language modeling: hide some words and train the model to predict the missing words given the words before and after (i.e. bidirectionally). This allows BERT to learn the semantic relationships between words.

We can use BERT as a cross-encoder by adding a classification head to the output of the BERT model. The cross-encoder takes in a pair of texts and outputs the probability that the two texts are similar.

Note: Cross-encoders do not output any embedding vectors and are thus not very scalable beyond a few thousands of documents

In [13]:
from sentence_transformers import CrossEncoder

# Load the pre-trained model
model = CrossEncoder('cross-encoder/stsb-roberta-base')

# Pair the two sentence columns up row by row: the cross-encoder scores each
# pair jointly rather than embedding the sentences separately. A comprehension
# keeps the loop variables out of the notebook's global namespace.
sentence_pairs = [
    [first, second]
    for first, second in zip(stsb_test['sentence1'], stsb_test['sentence2'])
]

# One predicted similarity per pair, in the same order as the test rows
stsb_test['SBERT CrossEncoder_score'] = model.predict(sentence_pairs, show_progress_bar=True)

Downloading: 100%|██████████| 608/608 [00:00<00:00, 607kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 499M/499M [00:33<00:00, 15.1MB/s] 
Downloading: 100%|██████████| 142/142 [00:00<00:00, 142kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 7.11MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 5.50MB/s]
Downloading: 100%|██████████| 772/772 [00:00<00:00, 380kB/s]
Batches: 100%|██████████| 44/44 [01:00<00:00,  1.37s/it]


# Metric Learning

Notes: Promising ways to generate embeddings, especially for similarity search applications

1) Use a Neural network such as a BERT to convert texts to embeddings
2) Construct these embeddings so that semantically similar texts cluster nearer to each other while dissimilar texts are further apart.

After training a model with this approach, you can find the similarity between two texts by computing the cosine similarity between their two vectors.


SBERT Bi-Encoder

Uses BERT and its variants as the base model, pre-trained with a type of metric learning called contrastive learning. In contrastive learning, the contrastive loss function compares whether two embeddings are similar (0) or dissimilar (1).

Core ideas

1) Using the labeled SNLI dataset and STS. These datasets contain several thousand pairs of sentences labeled as either similar or dissimilar.

2) For each text in the training dataset, compute the contextual word embeddings of that text using any pre-trained BERT model as an encoder.

3) compute the element-wise average of all token embeddings to obtain a single fixed dimension sentence embedding for the entire text - this is called mean pooling

4) Train the model using the Siamese network architecture. Essentially, both texts of a pair go through BERT -> mean pooling -> embedding, and the two embeddings are compared with cosine similarity

5) Finally do a cosine similarity

Bi-encoders are great at scale

In [14]:
from sentence_transformers import SentenceTransformer

# Load the pre-trained bi-encoder
model = SentenceTransformer('stsb-mpnet-base-v2')

# Generate embeddings: encode each sentence column independently, first then second
sentence1_emb, sentence2_emb = (
    model.encode(stsb_test[col], show_progress_bar=True)
    for col in ('sentence1', 'sentence2')
)

# Cosine similarity
stsb_test['SBERT BiEncoder_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

Downloading: 100%|██████████| 868/868 [00:00<00:00, 868kB/s]
Downloading: 100%|██████████| 190/190 [00:00<00:00, 196kB/s]
Downloading: 100%|██████████| 3.67k/3.67k [00:00<00:00, 3.67MB/s]
Downloading: 100%|██████████| 588/588 [00:00<00:00, 572kB/s]
Downloading: 100%|██████████| 122/122 [00:00<00:00, 61.0kB/s]
Downloading: 100%|██████████| 438M/438M [00:28<00:00, 15.2MB/s] 
Downloading: 100%|██████████| 52.0/52.0 [00:00<00:00, 25.9kB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 120kB/s]
Downloading: 100%|██████████| 466k/466k [00:00<00:00, 4.66MB/s]
Downloading: 100%|██████████| 1.19k/1.19k [00:00<00:00, 1.19MB/s]
Downloading: 100%|██████████| 232k/232k [00:00<00:00, 4.49MB/s]
Downloading: 100%|██████████| 229/229 [00:00<00:00, 236kB/s]
Batches: 100%|██████████| 44/44 [00:25<00:00,  1.70it/s]
Batches: 100%|██████████| 44/44 [00:25<00:00,  1.72it/s]


The problem is that the SBERT Bi-Encoder requires a fully labeled corpus of sentence pairs to train.

So adapting it to another domain is going to be very time-consuming.

# SimCSE Simple Contrastive Learning of Sentence Embeddings

Works in both supervised and unsupervised 

Core idea:

1) Given a text, compute its embedding using a pre-trained BERT as the encoder and take the embedding of the CLS token.

2) Create two noisy versions of the same text embedding by applying two different dropout masks to the original embedding. Two noisy embeddings generated from the same input text are considered a positive pair, and the model is expected to give them a cosine distance of 0.

3) We consider the embeddings from all the other texts in the batch as “negatives.” The model expects the “negatives” to have a cosine distance of 1 to the target text embeddings from the previous step. The loss function then updates the parameters of the encoder model such that the embeddings move closer to our expectations.

4) Supervised SimCSE has one additional step: it uses Natural Language Inference (NLI) labeled data to obtain positive pairs from texts labeled entailment and negative pairs from texts labeled contradiction.

SimCSE models are bi-encoder Sentence Transformer models trained using the SimCSE approach, so we can reuse all the code from the bi-encoder section and just change the pre-trained model to a SimCSE one.





In [15]:
# The supervised and unsupervised SimCSE runs differ only in the checkpoint
# that is loaded and the column the scores are written to, so both go through
# one loop instead of two copy-pasted blocks. Dict order is preserved, so the
# supervised column is still added before the unsupervised one.
simcse_checkpoints = {
    ########## Supervised ##########
    'SimCSE Supervised_cosine_score': 'princeton-nlp/sup-simcse-roberta-large',
    ########## Un-Supervised ##########
    'SimCSE Unsupervised_cosine_score': 'princeton-nlp/unsup-simcse-roberta-large',
}

for score_col, checkpoint in simcse_checkpoints.items():
    # Load the pre-trained model (each checkpoint was a ~1.4 GB download in the recorded run)
    model = SentenceTransformer(checkpoint)

    # Generate Embeddings
    sentence1_emb = model.encode(stsb_test['sentence1'], show_progress_bar=True)
    sentence2_emb = model.encode(stsb_test['sentence2'], show_progress_bar=True)

    # Cosine Similarity
    stsb_test[score_col] = cos_sim(sentence1_emb, sentence2_emb)

Downloading: 100%|██████████| 736/736 [00:00<00:00, 718kB/s]
Downloading: 100%|██████████| 664/664 [00:00<00:00, 327kB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 4.60MB/s]
Downloading: 100%|██████████| 1.42G/1.42G [01:33<00:00, 15.2MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 238kB/s]
Downloading: 100%|██████████| 256/256 [00:00<00:00, 256kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 7.48MB/s]
Batches: 100%|██████████| 44/44 [01:24<00:00,  1.92s/it]
Batches: 100%|██████████| 44/44 [01:26<00:00,  1.98s/it]
Downloading: 100%|██████████| 736/736 [00:00<00:00, 714kB/s]
Downloading: 100%|██████████| 743/743 [00:00<00:00, 367kB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 4.88MB/s]
Downloading: 100%|██████████| 1.42G/1.42G [01:35<00:00, 15.0MB/s]
Downloading: 100%|██████████| 239/239 [00:00<00:00, 120kB/s]
Downloading: 100%|██████████| 256/256 [00:00<00:00, 85.4kB/s]
Downloading: 100%|██████████| 798k/798k [00:00<00:00, 6.09MB/s]
Batches: 

In [23]:
# Every metric column -- and the ground-truth 'similarity_score' -- contains '_score'
score_cols = [col for col in stsb_test.columns if '_score' in col]

# Spearman Rank Correlation (x100) of each metric against the ground truth.
# The ground-truth column and row are picked by label rather than by position,
# so the result stays correct even if 'similarity_score' is not the first
# '_score' column in the frame.
target_col = 'similarity_score'
spearman_rank_corr = (
    stsb_test[score_cols]
    .corr(method='spearman')
    .loc[:, [target_col]]      # keep a one-column DataFrame, not a Series
    .drop(index=target_col)    # drop the trivial self-correlation row
    * 100
)
spearman_rank_corr.head(10)

Unnamed: 0,similarity_score
Jaccard_score,66.026529
TFIDF_cosine_score,61.420989
NegWMD_score,67.032848
USE_cosine_score,77.085989
SBERT CrossEncoder_score,90.172534
SBERT BiEncoder_cosine_score,88.572413
SimCSE Supervised_cosine_score,87.082275
SimCSE Unsupervised_cosine_score,82.784251


In [25]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Grid layout: three plots per row, with just enough rows for every metric
# (at least one row so make_subplots always gets a valid grid).
ncols = 3
nrows = max(1, int(np.ceil(len(spearman_rank_corr.index) / ncols)))

# One title per metric: the text before the first underscore plus its correlation
subplot_titles = [f'{row.Index.split("_")[0]}: {row.similarity_score:.2f}' for row in spearman_rank_corr.itertuples()]
fig = make_subplots(rows=nrows, cols=ncols, subplot_titles=subplot_titles)

for index, score in enumerate(spearman_rank_corr.index):
    # Row-major position of the index-th plot in the grid (0-based)
    row, col = divmod(index, ncols)

    fig.add_trace(
        go.Scatter(
            # Ground truth on the x-axis, selected by name rather than by column position
            x=stsb_test['similarity_score'],
            y=stsb_test[score],
            mode='markers',
        ),
        # plotly subplot coordinates are 1-based
        row=row+1, col=col+1
    )


fig.update_layout(height=700, width=1000, title_text='Spearman Rank Correlation (ρ × 100)', showlegend=False)
# NOTE: the recorded run failed here with "Mime type rendering requires
# nbformat>=4.2.0 but it is not installed" -- install `nbformat` in the kernel's
# environment for the figure to render.
fig.show()

ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed