In [1]:
pip install datasets

Defaulting to user installation because normal site-packages is not writeable
Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=12.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp38-cp38-manylinux_2_28_x86_64.whl.metadata (3.0 kB)
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py38-none-any.whl.metadata (7.1 kB)
Collecting fsspec<=2024.3.1,>=2023.1.0 (from fsspec[http]<=2024.3.1,>=2023.1.0->datasets)
  Using cached fsspec-2024.3.1-py3-none-any.whl.meta

## Word Embedding

In [20]:
from datasets import load_dataset
import nltk
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
import os

# Load the PiC phrase-similarity dataset and keep one handle per split.
dataset = load_dataset("PiC/phrase_similarity")
train_data, dev_data, test_data = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

In [21]:
# NLTK does not bundle its data: without these downloads a fresh environment
# raises LookupError here (stop-word corpus) and later in nltk.word_tokenize
# (Punkt tokenizer models, used by preprocess_text).
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)
# NOTE(review): recent NLTK releases look for 'punkt_tab' instead of 'punkt';
# requesting both keeps the cell working across versions — confirm against
# the installed NLTK.
nltk.download('punkt_tab', quiet=True)

# English stop words as a set, for O(1) membership tests during preprocessing.
stop_words = set(stopwords.words('english'))

In [22]:
def preprocess_text(text):
    """Lower-case ``text``, tokenize it with NLTK and drop unwanted tokens.

    A token is kept only when it is purely alphanumeric (``str.isalnum``)
    and is not a member of the module-level ``stop_words`` set.

    Args:
        text: the raw phrase to tokenize.

    Returns:
        list[str]: the surviving tokens, in their original order.
    """
    kept = []
    for word in nltk.word_tokenize(text.lower()):
        # Skip punctuation-bearing tokens and stop words.
        if not word.isalnum() or word in stop_words:
            continue
        kept.append(word)
    return kept

def load_glove_embeddings(file_path):
    """Read GloVe-style text embeddings into a ``{word: vector}`` dict.

    Every non-blank line is expected to hold a word followed by its
    whitespace-separated float components.

    Args:
        file_path: path of the UTF-8 encoded text file to read.

    Returns:
        dict[str, np.ndarray]: one float32 vector per word. If a word occurs
        on several lines, the last occurrence wins.
    """
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank line (e.g. a stray newline at the end of the file)
            # splits to an empty list; skip it instead of raising IndexError.
            if not values:
                continue
            word, components = values[0], values[1:]
            embeddings[word] = np.asarray(components, dtype='float32')
    return embeddings

def get_phrase_embedding(phrase, dim=300):
    """Embed a phrase as the mean of its tokens' word vectors.

    The phrase is tokenized with ``preprocess_text`` and each token is looked
    up in the module-level ``word_embedding_model``.

    NOTE: a later cell in this notebook defines another function with the
    same name (taking tokenizer inputs); after running that cell, this one
    must be re-run before it can be used again.

    Args:
        phrase: the raw phrase text.
        dim: dimensionality of the word vectors; must match the vectors held
            in ``word_embedding_model`` (300 for ``glove.6B.300d.txt``).

    Returns:
        np.ndarray: vector of length ``dim``; all zeros when no token of the
        phrase has an embedding.
    """
    phrase_embedding = np.zeros(dim)
    n_found = 0
    for token in preprocess_text(phrase):
        if token in word_embedding_model:
            phrase_embedding += word_embedding_model[token]
            n_found += 1
    # Average over the tokens that actually contributed a vector. Dividing by
    # the total token count would shrink the mean whenever some tokens are
    # out of vocabulary.
    if n_found > 0:
        phrase_embedding /= n_found
    return phrase_embedding

In [23]:

# Parse the local GloVe text file into an in-memory word -> vector dict.
word_embedding_model = load_glove_embeddings(file_path="glove.6B.300d.txt")

In [24]:

def build_glove_features(split):
    """Turn a dataset split into a feature matrix and a label vector.

    Each example contributes one row: the embedding of ``phrase1``
    concatenated with the embedding of ``phrase2`` (both computed with
    ``get_phrase_embedding``). The split is traversed once.

    Args:
        split: iterable of examples exposing ``'phrase1'``, ``'phrase2'`` and
            ``'label'`` keys.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X, y)`` with one row / label per
        example.
    """
    features, labels = [], []
    for pair in split:
        features.append(
            np.concatenate((
                get_phrase_embedding(pair['phrase1']),
                get_phrase_embedding(pair['phrase2']),
            ))
        )
        labels.append(pair['label'])
    return np.array(features), np.array(labels)


# One statement per split so that already-built splits survive a later failure.
X_train, y_train = build_glove_features(train_data)
X_dev, y_dev = build_glove_features(dev_data)
X_test, y_test = build_glove_features(test_data)

In [25]:

# Fit a logistic-regression classifier on the concatenated phrase embeddings;
# the solver is allowed up to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train, y=y_train)

In [26]:
# Score the fitted classifier on the validation split.
dev_predictions = clf.predict(X_dev)
dev_accuracy = accuracy_score(y_true=y_dev, y_pred=dev_predictions)
print("Validation Accuracy:", dev_accuracy)

Validation Accuracy: 0.388


In [27]:

# Score the fitted classifier on the held-out test split.
test_predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.3565


In [28]:
def predict_similarity(phrase1, phrase2):   # Inference Function
    """Return the cosine similarity between the embeddings of two phrases.

    Both phrases are embedded with ``get_phrase_embedding``; the classifier
    is not involved.

    Args:
        phrase1: first phrase text.
        phrase2: second phrase text.

    Returns:
        The single entry of the 1x1 matrix produced by ``cosine_similarity``.
    """
    first, second = (get_phrase_embedding(text) for text in (phrase1, phrase2))
    # cosine_similarity works on 2-D inputs, hence the one-row wrappers.
    score_matrix = cosine_similarity([first], [second])
    return score_matrix[0][0]

In [29]:
# Example: cosine similarity of two paraphrases.
phrase1, phrase2 = "newly formed camp", "recently made encampment"
similarity_score = predict_similarity(phrase1, phrase2)
# sep=" " is print's default separator, spelled out explicitly.
print(
    "Similarity Score of '", phrase1, "' and '", phrase2, "' is ", similarity_score,
    sep=" ",
)


Similarity Score of ' newly formed camp ' and ' recently made encampment ' is  0.5574376128137137


## DistilBERT Embedding

https://www.kaggle.com/code/jinalswarnakar/part1

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import torch
from transformers import DistilBertTokenizer, DistilBertModel

# Load the PiC phrase-similarity dataset and expose its three splits.
dataset = load_dataset("PiC/phrase_similarity")
train_data, dev_data, test_data = dataset['train'], dataset['validation'], dataset['test']

In [None]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint.
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)
model = DistilBertModel.from_pretrained(MODEL_NAME)

In [None]:
def encode_phrases(phrases):
    """Tokenize phrases with the module-level ``tokenizer``.

    Args:
        phrases: the text to encode; callers in this notebook pass either a
            single string or a one-element list of strings.

    Returns:
        The tokenizer output, with padding and truncation enabled and
        tensors returned in PyTorch format (``return_tensors="pt"``).
    """
    return tokenizer(phrases, return_tensors="pt", truncation=True, padding=True)

def get_phrase_embedding(inputs):
    """Mean-pool the encoder's last hidden states into phrase embeddings.

    NOTE: this redefines the ``get_phrase_embedding`` declared earlier in the
    notebook (which takes raw text); after this cell runs, the earlier
    definition is shadowed.

    Args:
        inputs: tokenizer output as produced by ``encode_phrases``; it is
            unpacked into ``model(**inputs)``.

    Returns:
        np.ndarray: the pooled embeddings with size-1 dimensions squeezed
        out, so a single phrase yields a 1-D vector.
    """
    with torch.no_grad():
        outputs = model(**inputs)
    # assumes last_hidden_state is (batch, seq_len, hidden) — TODO confirm
    hidden_states = outputs.last_hidden_state
    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: fall back to a plain mean over the sequence axis.
        pooled = hidden_states.mean(dim=1)
    else:
        # Average only over real tokens. With padding enabled in the
        # tokenizer, a plain mean would mix padding positions into the
        # embedding of every phrase shorter than the longest in the batch.
        weights = attention_mask.unsqueeze(-1).to(hidden_states.dtype)
        token_counts = weights.sum(dim=1).clamp(min=1)  # guard against /0
        pooled = (hidden_states * weights).sum(dim=1) / token_counts
    return pooled.squeeze().numpy()

In [None]:
def prepare_data(data):
    """Build the feature matrix and label vector for one dataset split.

    For every example, ``phrase1`` and ``phrase2`` are encoded and embedded
    separately, and the two embeddings are concatenated into one row.

    Args:
        data: iterable of examples exposing ``'phrase1'``, ``'phrase2'`` and
            ``'label'`` keys.

    Returns:
        tuple[np.ndarray, np.ndarray]: ``(X, y)`` — features and labels.
    """
    def embed(phrase):
        # Tokenize one phrase and pool it into a single embedding.
        return get_phrase_embedding(encode_phrases(phrase))

    rows = [
        np.concatenate((embed(pair['phrase1']), embed(pair['phrase2'])))
        for pair in data
    ]
    features = np.array(rows)
    labels = np.array([pair['label'] for pair in data])
    return features, labels

In [None]:
# Embed every split with prepare_data. These assignments reuse (and therefore
# overwrite) the X_*/y_* names produced in the word-embedding section above.
# Each split is a separate statement, so splits that were already computed
# remain available if a later call fails.
X_train, y_train = prepare_data(train_data)
X_dev, y_dev = prepare_data(dev_data)
X_test, y_test = prepare_data(test_data)

In [None]:
# Fit a fresh logistic-regression classifier on the current X_train / y_train;
# the solver is allowed up to 1000 iterations.
clf = LogisticRegression(max_iter=1000)
clf.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted classifier on the validation split.
dev_predictions = clf.predict(X_dev)
dev_accuracy = accuracy_score(y_true=y_dev, y_pred=dev_predictions)
print("Validation Accuracy:", dev_accuracy)

In [None]:
# Score the fitted classifier on the held-out test split.
test_predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=test_predictions)
print("Test Accuracy:", test_accuracy)

In [None]:
def predict_similarity(phrase1, phrase2):
    """Score a phrase pair with the fitted classifier ``clf``.

    Each phrase is encoded as a one-element batch, embedded, and the two
    embeddings are concatenated into a single feature row.

    Args:
        phrase1: first phrase text.
        phrase2: second phrase text.

    Returns:
        The value in column 1 of ``clf.predict_proba`` for that row.
    """
    embeddings = [
        get_phrase_embedding(encode_phrases([text])) for text in (phrase1, phrase2)
    ]
    # predict_proba expects a 2-D array: one row holding both embeddings.
    feature_row = np.concatenate(embeddings).reshape(1, -1)
    column_one = clf.predict_proba(feature_row)[:, 1]
    return column_one[0]

# Example: classifier-based score for two paraphrases.
phrase1, phrase2 = "newly formed camp", "recently made encampment"
similarity_score = predict_similarity(phrase1, phrase2)
print("Similarity Score:", similarity_score)