In [1]:
#import packages
import datasets
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas() # enables DataFrame.progress_apply so long-running pandas applies show a progress bar

  from .autonotebook import tqdm as notebook_tqdm


## Heads-up: this notebook downloads large pre-trained models and will take up a noticeable amount of disk space on your computer.


In [2]:
# Fetch the English configuration of the 'stsb_multi_mt' dataset
# (reused from the local Hugging Face cache when it is already there).
stsb_dataset = load_dataset('stsb_multi_mt', 'en')

# Materialise the two splits used below as pandas DataFrames.
stsb_train, stsb_test = (
    pd.DataFrame(stsb_dataset[split]) for split in ('train', 'test')
)

# Sanity check: print both shapes, then preview the first test rows.
print(stsb_train.shape, stsb_test.shape)
stsb_test.head()

Found cached dataset stsb_multi_mt (C:/Users/Chris Dong/.cache/huggingface/datasets/stsb_multi_mt/en/1.0.0/a5d260e4b7aa82d1ab7379523a005a366d9b124c76a5a5cf0c4c5365458b0ba9)
100%|██████████| 3/3 [00:00<00:00, 500.33it/s]


(5749, 3) (1379, 3)


Unnamed: 0,sentence1,sentence2,similarity_score
0,A girl is styling her hair.,A girl is brushing her hair.,2.5
1,A group of men play soccer on the beach.,A group of boys are playing soccer on the beach.,3.6
2,One woman is measuring another woman's ankle.,A woman measures another woman's ankle.,5.0
3,A man is cutting up a cucumber.,A man is slicing a cucumber.,4.2
4,A man is playing a harp.,A man is playing a keyboard.,1.5


# Jaccard Similarity

Remove stopwords, lowercase, and lemmatize the text before running the algorithm so that only informative words are used in the calculation.

Jaccard similarity here is computed on unigrams (single words); applying it to N-grams instead is known as w-shingling.

In [3]:
import textdistance
from helper import text_processing

def jaccard_sim(row):
    """Return the normalized Jaccard similarity of one sentence pair.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            'sentence1' and 'sentence2' keys.

    Returns:
        Whatever ``textdistance.jaccard.normalized_similarity`` yields for
        the two pre-processed sentences.
    """
    # Both sentences go through the same `helper.text_processing` step first
    # (presumably stopword removal / lowercasing / lemmatization -- confirm in helper).
    processed_1, processed_2 = (
        text_processing(row[column]) for column in ('sentence1', 'sentence2')
    )
    return textdistance.jaccard.normalized_similarity(processed_1, processed_2)


# Score every test pair row by row (progress_apply shows a tqdm progress bar),
# then store the result as a new column.
jaccard_scores = stsb_test.progress_apply(jaccard_sim, axis=1)
stsb_test['Jaccard_score'] = jaccard_scores

100%|██████████| 1379/1379 [00:16<00:00, 84.99it/s]


# Bag of Words

The standard bag-of-words approaches are the count vectorizer and the TF-IDF vectorizer; the resulting sentence vectors are compared by computing their cosine similarity.

Pros and cons: the count vectorizer treats all words as equally important, which is a significant drawback.

TF-IDF weights each word using Term Frequency (TF) and Inverse Document Frequency (IDF):

- TF - how many times the word appears in the document; measures how important the word is to the document
- IDF - log of the inverse of the fraction of documents in which the word appears; measures how rare the word is in the corpus

Normalization is needed so that document length doesn't skew the results.

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from helper import cos_sim

# Fitting corpus: every distinct sentence from either column of the train split
X_train = pd.concat([stsb_train['sentence1'], stsb_train['sentence2']]).unique()

# Train the model: fit() returns the vectorizer itself, so construct and fit in one step
model = TfidfVectorizer(lowercase=True, stop_words='english').fit(X_train)

# Generate Embeddings on Test: vectorise both sides of each pair with the fitted model
sentence1_emb, sentence2_emb = (
    model.transform(stsb_test[column]) for column in ('sentence1', 'sentence2')
)

# Cosine Similarity (cos_sim comes from helper; its exact contract is not visible here)
stsb_test['TFIDF_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)

# Word Movers Distance (WMD)
Jaccard and TF-IDF assume that similar texts have many words in common. However, consider the following two sentences:

 Obama speaks to the media in Illinois 
 The president greets the press in Chicago 

These two sentences share almost no words, yet they mean nearly the same thing. Word embeddings are needed to capture this, because similar words have vectors that lie near each other in vector space:
president - Obama, Chicago - Illinois, greets - speaks, media - press



In [7]:
import gensim.downloader as api

# Load the pre-trained fastText word vectors through gensim's downloader
WMD_EMBEDDINGS_NAME = 'fasttext-wiki-news-subwords-300'
model = api.load(WMD_EMBEDDINGS_NAME)

def word_movers_distance(row):
    """Return the negated Word Mover's Distance for one sentence pair.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            'sentence1' and 'sentence2' keys.

    Returns:
        ``-model.wmdistance(...)`` for the two pre-processed sentences, where
        ``model`` is the module-level object bound at call time. The sign is
        flipped so that, like the other scores in this notebook, a larger
        value means a more similar pair.
    """
    # Same `helper.text_processing` step as used for the Jaccard score.
    processed_1, processed_2 = (
        text_processing(row[column]) for column in ('sentence1', 'sentence2')
    )
    distance = model.wmdistance(processed_1, processed_2)
    return -distance


# Negative Word Movers Distance for every test pair, computed row by row
# (progress_apply shows a tqdm progress bar), then stored as a new column.
neg_wmd_scores = stsb_test.progress_apply(word_movers_distance, axis=1)
stsb_test['NegWMD_score'] = neg_wmd_scores

A limitation of WMD is that the word embeddings it uses are non-contextual: each word gets the same embedding vector irrespective of the context of the rest of the sentence in which it appears.
More recent NLP models, built on transformers, are designed to handle this problem.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub

# Control GPU memory usage: enable memory growth on every visible GPU
gpus = tf.config.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

# Load the pre-trained model (Universal Sentence Encoder v4) from TF Hub
module_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = hub.load(module_url)

# Generate Embeddings: encode each side of every test pair and convert to NumPy
sentence1_emb, sentence2_emb = (
    model(stsb_test[column]).numpy() for column in ('sentence1', 'sentence2')
)

# Cosine Similarity
stsb_test['USE_cosine_score'] = cos_sim(sentence1_emb, sentence2_emb)