## Word Embedding

In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import os

dataset = load_dataset("google-research-datasets/paws", "labeled_final")
train_data = dataset['train']
dev_data = dataset['validation']
test_data = dataset['test']

  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 9.79k/9.79k [00:00<00:00, 5.12MB/s]
Downloading data: 100%|██████████| 8.43M/8.43M [00:03<00:00, 2.12MB/s]
Downloading data: 100%|██████████| 1.24M/1.24M [00:02<00:00, 517kB/s]
Downloading data: 100%|██████████| 1.23M/1.23M [00:02<00:00, 528kB/s]
Generating train split: 100%|██████████| 49401/49401 [00:00<00:00, 949887.74 examples/s]
Generating test split: 100%|██████████| 8000/8000 [00:00<00:00, 1030889.80 examples/s]
Generating validation split: 100%|██████████| 8000/8000 [00:00<00:00, 930155.57 examples/s]


In [2]:
stop_words = set(stopwords.words('english'))

In [12]:
def preprocess_text(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return tokens

def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

def get_sentence_embedding(sentence):
    sentence_tokens = preprocess_text(sentence)
    sentence_embedding = np.zeros(300) 
    for token in sentence_tokens:
        if token in word_embedding_model:
            sentence_embedding += word_embedding_model[token]
    if len(sentence_tokens) > 0:
        sentence_embedding /= len(sentence_tokens)
    return sentence_embedding

In [13]:
word_embedding_model = load_glove_embeddings("glove.6B.300d.txt")

In [14]:
X_train = np.array([np.concatenate((get_sentence_embedding(pair['sentence1']), get_sentence_embedding(pair['sentence2']))) for pair in train_data])
y_train = np.array([pair['label'] for pair in train_data])
X_dev = np.array([np.concatenate((get_sentence_embedding(pair['sentence1']), get_sentence_embedding(pair['sentence2']))) for pair in dev_data])
y_dev = np.array([pair['label'] for pair in dev_data])
X_test = np.array([np.concatenate((get_sentence_embedding(pair['sentence1']), get_sentence_embedding(pair['sentence2']))) for pair in test_data])
y_test = np.array([pair['label'] for pair in test_data])

In [15]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [17]:
dev_predictions = clf.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, dev_predictions)
print("Validation Accuracy:", dev_accuracy)

Validation Accuracy: 0.55325


In [18]:
test_predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.55175


In [19]:
def predict_similarity(sentence1, sentence2):   # Inference Function
    sentence1_embedding = get_sentence_embedding(sentence1)
    sentence2_embedding = get_sentence_embedding(sentence2)
    similarity_score = cosine_similarity([sentence1_embedding], [sentence2_embedding])[0][0]
    return similarity_score

In [20]:
sentence1 = "In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland ."
sentence2 = "In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England ."
similarity_score = predict_similarity(sentence1, sentence2)
print("Similarity Score of sentence1 and sentence2 is",similarity_score)


Similarity Score of sentence1 and sentence2 is 0.9966563843391041


## DistilBert Embedding

https://www.kaggle.com/code/jinalswarnakar/sentencesimilarity-distilbert

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Load dataset
dataset = load_dataset("google-research-datasets/paws", "labeled_final")
train_data = dataset['train']
dev_data = dataset['validation']
test_data = dataset['test']

In [None]:
# Load DistilBERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
def encode_sentences(sentences):
    inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
    return inputs

def get_sentence_embedding(inputs):
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = torch.mean(outputs.last_hidden_state, dim=1).squeeze().numpy()
    return embeddings

In [None]:
def prepare_data(data):
    X = []
    for pair in data:
        sentence1_embedding = get_sentence_embedding(encode_sentences(pair['sentence1']))
        sentence2_embedding = get_sentence_embedding(encode_sentences(pair['sentence2']))
        combined_embedding = np.concatenate((sentence1_embedding, sentence2_embedding))
        X.append(combined_embedding)
    X = np.array(X)
    y = np.array([pair['label'] for pair in data])
    return X, y

In [None]:
X_train, y_train = prepare_data(train_data)
X_dev, y_dev = prepare_data(dev_data)
X_test, y_test = prepare_data(test_data)

In [None]:
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

In [None]:
dev_predictions = clf.predict(X_dev)
dev_accuracy = accuracy_score(y_dev, dev_predictions)
print("Validation Accuracy:", dev_accuracy)

In [None]:
test_predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)
print("Test Accuracy:", test_accuracy)

In [None]:
def predict_similarity(sentence1, sentence2):
    sentence1_embedding = get_sentence_embedding(encode_sentences([sentence1]))
    sentence2_embedding = get_sentence_embedding(encode_sentences([sentence2]))
    combined_embedding = np.concatenate((sentence1_embedding, sentence2_embedding)).reshape(1, -1)
    similarity_score = clf.predict_proba(combined_embedding)[:, 1][0]
    return similarity_score

# Example Usage
sentence1 = "In Paris , in October 1560 , he secretly met the English ambassador , Nicolas Throckmorton , asking him for a passport to return to England through Scotland ."
sentence2 = "In October 1560 , he secretly met with the English ambassador , Nicolas Throckmorton , in Paris , and asked him for a passport to return to Scotland through England ."
similarity_score = predict_similarity(sentence1, sentence2)
print("Similarity Score:", similarity_score)