## Sentence Similarity 

### Zero-shot setting

ChatGPT

In [1]:
import openai
import pandas as pd


import os

# Take the key from the OPENAI_API_KEY environment variable when it is set, so a
# real secret never has to be pasted into the notebook. The original placeholder
# string is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")


# Sentence pairs to score; each tuple is (first sentence, second sentence).
sentences = [
    ("The cat sat on the mat", "The dog lay on the rug"),
    ("A bird chirped in the tree", "The sun shone brightly in the sky"),
    ("She played the piano beautifully", "He sang a song loudly"),
    ("The children ran and played in the park", "The adults walked and talked in the garden")
]

def _parse_similarity_score(reply_text):
    """Turn a model reply into a float similarity score.

    A reply that is already a bare number is converted directly. Otherwise the
    reply is searched for the first decimal number (written with a decimal
    point) that lies between 0 and 1. Bare integers inside a longer reply are
    ignored because they are frequently labels such as "sentence 1".

    Args:
        reply_text: Text returned by the model.

    Returns:
        The parsed score, or 0.0 when no usable number is found.
    """
    import re  # local import: the surrounding cell does not import it

    cleaned = reply_text.strip()
    try:
        # The prompt asks for a bare float, so try that first.
        return float(cleaned)
    except ValueError:
        pass

    # Decimal tokens that are not glued to a word or a longer dotted number.
    for token in re.findall(r"(?<![\w.])\d*\.\d+(?!\w|\.\d)", cleaned):
        value = float(token)
        if 0.0 <= value <= 1.0:
            return value
    return 0.0  # Default to 0.0 if parsing fails


def compute_sentence_similarity(sentences):
    """Ask gpt-3.5-turbo for a 0-1 similarity score for each sentence pair.

    Args:
        sentences: Iterable of (sentence1, sentence2) string pairs.

    Returns:
        A list of floats, one per pair, in input order. A reply without a
        usable number is recorded as 0.0.
    """
    similarity_scores = []
    for sentence1, sentence2 in sentences:

        prompt = f"sentence 1: {sentence1}\nsentence 2: {sentence2}\nHow similar are these sentences on a scale from 0 to 1? Provide a numerical similarity score. Your output should only give similarity score in float."

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Using the turbo model
            messages=[
                {"role": "system", "content": "You are an AI language model."},
                {"role": "user", "content": prompt}
            ]
        )

        reply = response.choices[0].message["content"].strip()
        print(reply)  # show the raw reply so parsing problems stay visible
        similarity_scores.append(_parse_similarity_score(reply))
    return similarity_scores


# Score every pair, then tabulate each pair's text next to its score.
similarity_scores = compute_sentence_similarity(sentences)

results_df = pd.DataFrame(
    {
        "sentence Pair": [f"{first} - {second}" for first, second in sentences],
        "Similarity Score": similarity_scores,
    }
)
print(results_df)

0.5
0.2
0.6
0.5
                                       sentence Pair  Similarity Score
0    The cat sat on the mat - The dog lay on the rug               0.5
1  A bird chirped in the tree - The sun shone bri...               0.2
2  She played the piano beautifully - He sang a s...               0.6
3  The children ran and played in the park - The ...               0.5


Bard

In [2]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import pandas as pd

# Load variables from a local .env file, then hand the API_KEY value to the
# google.generativeai client.
load_dotenv()
api_key = os.environ.get("API_KEY")
genai.configure(api_key=api_key)

# Sentence pairs to score; each tuple is (first sentence, second sentence).
sentences = [
    (
        "The cat sat on the mat",
        "The dog lay on the rug",
    ),
    (
        "A bird chirped in the tree",
        "The sun shone brightly in the sky",
    ),
    (
        "She played the piano beautifully",
        "He sang a song loudly",
    ),
    (
        "The children ran and played in the park",
        "The adults walked and talked in the garden",
    ),
]

def _parse_similarity_score(reply_text):
    """Turn a model reply into a float similarity score.

    A reply that is already a bare number is converted directly. Otherwise the
    reply is searched for the first decimal number (written with a decimal
    point) that lies between 0 and 1. Bare integers inside a longer reply are
    ignored because they are frequently labels rather than scores.

    Args:
        reply_text: Text returned by the model.

    Returns:
        The parsed score, or 0.0 when no usable number is found.
    """
    import re  # local import: the surrounding cell does not import it

    cleaned = reply_text.strip()
    try:
        return float(cleaned)
    except ValueError:
        pass

    # Decimal tokens that are not glued to a word or a longer dotted number.
    for token in re.findall(r"(?<![\w.])\d*\.\d+(?!\w|\.\d)", cleaned):
        value = float(token)
        if 0.0 <= value <= 1.0:
            return value
    return 0.0


def compute_sentence_similarity(sentences):   # Function to compute similarity scores for sentences
    """Ask the google.generativeai text model for a 0-1 score per sentence pair.

    The prompt does not force a bare-number reply, so the score is extracted
    from free-form text instead of requiring the whole reply to be a float.

    Args:
        sentences: Iterable of (sentence1, sentence2) string pairs.

    Returns:
        A list of floats, one per pair, in input order. A missing candidate or
        a reply without a usable number is recorded as 0.0.
    """
    similarity_scores = []
    for sentence1, sentence2 in sentences:
        prompt = f"How similar are the sentences '{sentence1}' and '{sentence2}'? Provide a similarity score between 0 and 1."
        response = genai.generate_text(prompt=prompt)
        candidates = response.candidates
        if not candidates:
            # No candidate came back; use the same fallback as an unparsable
            # reply instead of raising IndexError.
            similarity_scores.append(0.0)
            continue
        similarity_scores.append(_parse_similarity_score(candidates[0]['output']))
    return similarity_scores

# Score every pair, then tabulate each pair's text next to its score.
similarity_scores = compute_sentence_similarity(sentences)

results_df = pd.DataFrame(
    {
        "sentence Pair": [f"{first} - {second}" for first, second in sentences],
        "Similarity Score": similarity_scores,
    }
)
print(results_df)


                                       sentence Pair  Similarity Score
0    The cat sat on the mat - The dog lay on the rug              0.75
1  A bird chirped in the tree - The sun shone bri...              0.00
2  She played the piano beautifully - He sang a s...              0.25
3  The children ran and played in the park - The ...              0.30


`all-mpnet-base-v2` from the Sentence Transformers library.

In [3]:
from sentence_transformers import SentenceTransformer, util


# Load the sentence-embedding model once; it is reused for every pair below.
model = SentenceTransformer("all-mpnet-base-v2")

# Sentence pairs to compare; each tuple is (first sentence, second sentence).
sentences = [
    ("The cat sat on the mat", "The dog lay on the rug"),
    ("A bird chirped in the tree", "The sun shone brightly in the sky"),
    ("She played the piano beautifully", "He sang a song loudly"),
    ("The children ran and played in the park", "The adults walked and talked in the garden"),
]

# Embed both sentences of a pair (first, then second) and report their cosine
# similarity as a plain Python float.
for first_text, second_text in sentences:
    first_vector, second_vector = (model.encode(text) for text in (first_text, second_text))
    cosine = util.pytorch_cos_sim(first_vector, second_vector)
    print(f"Similarity Score (sentence: '{first_text}' vs '{second_text}'): {cosine.item()}")


  from tqdm.autonotebook import tqdm, trange
2024-06-06 19:27:26.885518: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-06 19:27:26.886830: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-06 19:27:26.914936: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Similarity Score (sentence: 'The cat sat on the mat' vs 'The dog lay on the rug'): 0.48438724875450134
Similarity Score (sentence: 'A bird chirped in the tree' vs 'The sun shone brightly in the sky'): 0.2539435625076294
Similarity Score (sentence: 'She played the piano beautifully' vs 'He sang a song loudly'): 0.2710907757282257
Similarity Score (sentence: 'The children ran and played in the park' vs 'The adults walked and talked in the garden'): 0.1723189651966095


## Few-Shot Setting

ChatGPT

In [4]:
import openai
import pandas as pd

import os

# Take the key from the OPENAI_API_KEY environment variable when it is set, so a
# real secret never has to be pasted into the notebook. The original placeholder
# string is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", "API-KEY")

# Each entry is ((sentence1, sentence2), reference score). The reference score
# is what the results table reports as "True Similarity Score".
sentences_with_scores = [
    (("The newly formed camp is bustling with activity.", "The recently made encampment is lively."), 0.8),
    (("One data point cannot represent the entire population.", "A particular statistic may not reflect the overall trend."), 0.6),
    (("The particular structure of the building is unique.", "The specific edifice stands out in the skyline."), 0.7),
    (("The involved people are dedicated to the cause.", "The participating individuals show great commitment."), 0.9)
]

def _parse_similarity_score(reply_text):
    """Turn a model reply into a float similarity score.

    A reply that is already a bare number is converted directly. Otherwise the
    reply is searched for the first decimal number (written with a decimal
    point) that lies between 0 and 1. Bare integers inside a longer reply are
    ignored because they are frequently labels such as "sentence 1".

    Args:
        reply_text: Text returned by the model.

    Returns:
        The parsed score, or 0.0 when no usable number is found.
    """
    import re  # local import: the surrounding cell does not import it

    cleaned = reply_text.strip()
    try:
        # The prompt asks for a bare float, so try that first.
        return float(cleaned)
    except ValueError:
        pass

    # Decimal tokens that are not glued to a word or a longer dotted number.
    for token in re.findall(r"(?<![\w.])\d*\.\d+(?!\w|\.\d)", cleaned):
        value = float(token)
        if 0.0 <= value <= 1.0:
            return value
    return 0.0  # Default to 0.0 if parsing fails


def compute_sentence_similarity(sentences_with_scores):
    """Ask gpt-3.5-turbo for a 0-1 similarity score for each labelled pair.

    Args:
        sentences_with_scores: Iterable of ((sentence1, sentence2), score)
            entries. Only the two sentences are placed in the prompt.

    Returns:
        A list of floats, one per entry, in input order. A reply without a
        usable number is recorded as 0.0.
    """
    similarity_scores = []
    # NOTE(review): the reference score is unpacked but never sent to the model,
    # so no worked examples reach the prompt. Confirm whether that is intended
    # for this section.
    for (sentence1, sentence2), _true_similarity_score in sentences_with_scores:

        prompt = f"sentence 1: {sentence1}\nsentence 2: {sentence2}\nHow similar are these sentences on a scale from 0 to 1? Provide a numerical similarity score. Your output should only give similarity score in float."

        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",  # Using the turbo model
            messages=[
                {"role": "system", "content": "You are an AI language model."},
                {"role": "user", "content": prompt}
            ]
        )

        reply = response.choices[0].message["content"].strip()
        similarity_scores.append(_parse_similarity_score(reply))
    return similarity_scores


# Score every labelled pair, then put reference and model scores side by side.
similarity_scores = compute_sentence_similarity(sentences_with_scores)

results_df = pd.DataFrame(
    {
        "sentence Pair": [
            f"{first} - {second}" for (first, second), _ in sentences_with_scores
        ],
        "True Similarity Score": [reference for _, reference in sentences_with_scores],
        "Computed Similarity Score": similarity_scores,
    }
)


In [5]:
results_df

Unnamed: 0,sentence Pair,True Similarity Score,Computed Similarity Score
0,The newly formed camp is bustling with activit...,0.8,0.75
1,One data point cannot represent the entire pop...,0.6,0.7
2,The particular structure of the building is un...,0.7,0.7
3,The involved people are dedicated to the cause...,0.9,0.8


Bard

In [6]:
import os
from dotenv import load_dotenv
import google.generativeai as genai
import pandas as pd

# Load variables from a local .env file, then hand the API_KEY value to the
# google.generativeai client.
load_dotenv()
api_key = os.environ.get("API_KEY")
genai.configure(api_key=api_key)

# Each entry is ((sentence1, sentence2), reference score).
sentences_with_scores = [
    (
        ("The newly formed camp is bustling with activity.", "The recently made encampment is lively."),
        0.8,
    ),
    (
        ("One data point cannot represent the entire population.", "A particular statistic may not reflect the overall trend."),
        0.6,
    ),
    (
        ("The particular structure of the building is unique.", "The specific edifice stands out in the skyline."),
        0.7,
    ),
    (
        ("The involved people are dedicated to the cause.", "The participating individuals show great commitment."),
        0.9,
    ),
]

def _parse_similarity_score(reply_text):
    """Turn a model reply into a float similarity score.

    A reply that is already a bare number is converted directly. Otherwise the
    reply is searched for the first decimal number (written with a decimal
    point) that lies between 0 and 1. Bare integers inside a longer reply are
    ignored because they are frequently labels rather than scores.

    Args:
        reply_text: Text returned by the model.

    Returns:
        The parsed score, or 0.0 when no usable number is found.
    """
    import re  # local import: the surrounding cell does not import it

    cleaned = reply_text.strip()
    try:
        return float(cleaned)
    except ValueError:
        pass

    # Decimal tokens that are not glued to a word or a longer dotted number.
    for token in re.findall(r"(?<![\w.])\d*\.\d+(?!\w|\.\d)", cleaned):
        value = float(token)
        if 0.0 <= value <= 1.0:
            return value
    return 0.0


def compute_sentence_similarity(sentences_with_scores):   # Function to compute similarity scores for sentences
    """Ask the google.generativeai text model for a 0-1 score per labelled pair.

    The prompt does not force a bare-number reply, so the score is extracted
    from free-form text instead of requiring the whole reply to be a float.

    Args:
        sentences_with_scores: Iterable of ((sentence1, sentence2), score)
            entries. Only the two sentences are placed in the prompt.

    Returns:
        A list of floats, one per entry, in input order. A missing candidate or
        a reply without a usable number is recorded as 0.0.
    """
    similarity_scores = []
    # NOTE(review): the reference score is unpacked but never sent to the model,
    # so no worked examples reach the prompt. Confirm whether that is intended
    # for this section.
    for (sentence1, sentence2), _true_similarity_score in sentences_with_scores:
        prompt = f"How similar are the sentences '{sentence1}' and '{sentence2}'? Provide a similarity score between 0 and 1."
        response = genai.generate_text(prompt=prompt)
        candidates = response.candidates
        if not candidates:
            # No candidate came back; use the same fallback as an unparsable
            # reply instead of raising IndexError.
            similarity_scores.append(0.0)
            continue
        similarity_scores.append(_parse_similarity_score(candidates[0]['output']))
    return similarity_scores


# Score every labelled pair, then put reference and model scores side by side.
similarity_scores = compute_sentence_similarity(sentences_with_scores)

results_df = pd.DataFrame(
    {
        "sentence Pair": [
            f"{first} - {second}" for (first, second), _ in sentences_with_scores
        ],
        "True Similarity Score": [reference for _, reference in sentences_with_scores],
        "Computed Similarity Score": similarity_scores,
    }
)


In [7]:
results_df

Unnamed: 0,sentence Pair,True Similarity Score,Computed Similarity Score
0,The newly formed camp is bustling with activit...,0.8,0.8
1,One data point cannot represent the entire pop...,0.6,0.8
2,The particular structure of the building is un...,0.7,0.5
3,The involved people are dedicated to the cause...,0.9,0.85


`all-mpnet-base-v2` from the Sentence Transformers library.

In [8]:
from sentence_transformers import SentenceTransformer, util


# Load the sentence-embedding model once; it is reused for every pair below.
model = SentenceTransformer("all-mpnet-base-v2")

# Each entry is ((sentence1, sentence2), reference score).
sentences_with_scores = [
    (("The newly formed camp is bustling with activity.", "The recently made encampment is lively."), 0.8),
    (("One data point cannot represent the entire population.", "A particular statistic may not reflect the overall trend."), 0.6),
    (("The particular structure of the building is unique.", "The specific edifice stands out in the skyline."), 0.7),
    (("The involved people are dedicated to the cause.", "The participating individuals show great commitment."), 0.9),
]

# Embed both sentences of a pair (first, then second) and print the reference
# score next to the cosine similarity of the two embeddings.
for (first_text, second_text), reference_score in sentences_with_scores:
    first_vector, second_vector = (model.encode(text) for text in (first_text, second_text))
    cosine = util.pytorch_cos_sim(first_vector, second_vector).item()
    report_lines = (
        f"sentence Pair: '{first_text}' - '{second_text}'",
        f"True Similarity Score: {reference_score}",
        f"Computed Similarity Score: {cosine}\n",
    )
    print("\n".join(report_lines))




sentence Pair: 'The newly formed camp is bustling with activity.' - 'The recently made encampment is lively.'
True Similarity Score: 0.8
Computed Similarity Score: 0.7742079496383667

sentence Pair: 'One data point cannot represent the entire population.' - 'A particular statistic may not reflect the overall trend.'
True Similarity Score: 0.6
Computed Similarity Score: 0.6612098217010498

sentence Pair: 'The particular structure of the building is unique.' - 'The specific edifice stands out in the skyline.'
True Similarity Score: 0.7
Computed Similarity Score: 0.6238105893135071

sentence Pair: 'The involved people are dedicated to the cause.' - 'The participating individuals show great commitment.'
True Similarity Score: 0.9
Computed Similarity Score: 0.8070569038391113

