In [42]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from sentence_transformers import SentenceTransformer, util
import torch

# Define the sentences
sentence1 = """
The software engineer is planning to enhance her skills by pursuing a master’s degree in computer science
"""
sentence2 = """
The mechanical engineer is eager to improve his qualifications by obtaining a master’s degree in engineering management.
"""
# Load pre-trained Word2Vec model
w2v_model = api.load("word2vec-google-news-300")

# 1. N-Gram Similarity
def n_gram_similarity(s1, s2, n=3):
    vectorizer = CountVectorizer(ngram_range=(n, n))
    X = vectorizer.fit_transform([s1, s2])
    return cosine_similarity(X[0:1], X[1:2])[0][0]

# 2. Bag-of-Words (BoW) Similarity
def bow_similarity(s1, s2):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform([s1, s2])
    return cosine_similarity(X[0:1], X[1:2])[0][0]

# 3. TF-IDF Similarity
def tfidf_similarity(s1, s2):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform([s1, s2])
    return cosine_similarity(X[0:1], X[1:2])[0][0]

# 4. Word Mover's Distance (WMD)
def wmd_similarity(s1, s2):
    s1_tokens = [word for word in s1.lower().split() if word in w2v_model.key_to_index]
    s2_tokens = [word for word in s2.lower().split() if word in w2v_model.key_to_index]
    
    if not s1_tokens or not s2_tokens:
        return 0  # Return 0 if either sentence has no valid tokens

    wmd_distance = w2v_model.wmdistance(s1_tokens, s2_tokens)
    similarity = 1 / (1 + wmd_distance)  # Convert distance to similarity
    return similarity

# 5. BERTScore Similarity
def bertscore_similarity(s1, s2):
    from bert_score import score
    _, _, F1 = score([s1], [s2], lang='en', verbose=False)
    return F1[0].item()

# 6. Sentence-BERT Similarity
def sentence_bert_similarity(s1, s2):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings1 = model.encode(s1, convert_to_tensor=True)
    embeddings2 = model.encode(s2, convert_to_tensor=True)
    cosine_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cosine_score.item()

# 7. SimCSE Similarity
def simcse_similarity(s1, s2):
    model = SentenceTransformer('princeton-nlp/sup-simcse-bert-base-uncased')  # Using a SimCSE model
    embeddings1 = model.encode(s1, convert_to_tensor=True)
    embeddings2 = model.encode(s2, convert_to_tensor=True)
    cosine_score = util.pytorch_cos_sim(embeddings1, embeddings2)
    return cosine_score.item()

# Calculate similarities
n_gram_sim = n_gram_similarity(sentence1, sentence2)
print(f"N-Gram Similarity: {n_gram_sim}")
bow_sim = bow_similarity(sentence1, sentence2)
print(f"Bag-of-Words Similarity: {bow_sim}")
tfidf_sim = tfidf_similarity(sentence1, sentence2)
print(f"TF-IDF Similarity: {tfidf_sim}")
wmd_sim = wmd_similarity(sentence1, sentence2)
print(f"Word Mover's Distance Similarity: {wmd_sim}")
bertscore_sim = bertscore_similarity(sentence1, sentence2)
print(f"BERTScore Similarity: {bertscore_sim}")
sentence_bert_sim = sentence_bert_similarity(sentence1, sentence2)
print(f"Sentence-BERT Similarity: {sentence_bert_sim}")
simcse_sim = simcse_similarity(sentence1, sentence2)
print(f"SimCSE Similarity: {simcse_sim}")


N-Gram Similarity: 0.07142857142857144
Bag-of-Words Similarity: 0.5
TF-IDF Similarity: 0.3360969272762575
Word Mover's Distance Similarity: 0.6216586985153256


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Similarity: 0.9510157704353333


No sentence-transformers model found with name princeton-nlp/sup-simcse-bert-base-uncased. Creating a new one with mean pooling.


Sentence-BERT Similarity: 0.6023814678192139
SimCSE Similarity: 0.8354148864746094


In [40]:
from refchecker import LLMExtractor, LLMChecker

extractor = LLMExtractor(
    claim_format='triplet', 
    model='bedrock/meta.llama3-70b-instruct-v1:0',
    batch_size=50,
    )

response_extract_results = extractor.extract(
            batch_responses=[sentence1, sentence2]
        )
response_claims = [[c.content for c in res.claims] for res in response_extract_results]
print(response_claims)

checker = LLMChecker(
        model = 'bedrock/meta.llama3-70b-instruct-v1:0', 
        batch_size=50
)

question = """
Describe a job.
"""
print(len(response_claims[0]))
print(len(response_claims[1]))  
batch_labels = checker.check(
            batch_claims=response_claims,
            batch_questions=[question] * 2,
            batch_references=[[sentence1,sentence2]] * 2,
            is_joint=True,
            joint_check_num=1
            
        )

print(len(batch_labels[0]))
print(len(batch_labels[1])) 
print(batch_labels)

100%|██████████| 1/1 [00:02<00:00,  2.70s/it]


[[['The software engineer', 'is', 'planning'], ['The software engineer', 'planning to', 'enhance her skills'], ['The software engineer', 'enhance her skills by', 'pursuing a master’s degree in computer science'], ['The software engineer', 'pursuing', 'a master’s degree in computer science'], ['a master’s degree in computer science', 'in', 'computer science'], ['The software engineer', 'has skills', 'her skills']], [['The mechanical engineer', 'is', 'eager'], ['The mechanical engineer', 'wants to', 'improve his qualifications'], ['The mechanical engineer', 'wants to improve his qualifications by', 'obtaining a master’s degree'], ['The master’s degree', 'is in', 'engineering management'], ['The mechanical engineer', 'wants to obtain', 'a master’s degree in engineering management']]]
6
5


100%|██████████| 1/1 [00:01<00:00,  1.01s/it]

6
5
[[['Entailment', 'Neutral'], ['Entailment', 'Neutral'], ['Entailment', 'Neutral'], ['Entailment', 'Neutral'], ['Entailment', 'Neutral'], ['Entailment', 'Neutral']], [['Neutral', 'Entailment'], ['Neutral', 'Entailment'], ['Neutral', 'Entailment'], ['Neutral', 'Entailment'], ['Neutral', 'Entailment']]]



