In [1]:
!pip install -U -q google.generativeai

In [2]:
import google.generativeai as genai
import google.ai.generativelanguage as glm
from tqdm.auto import tqdm
tqdm.pandas()
from datasets import load_dataset
from google.api_core import retry
from sklearn.metrics import mean_squared_error
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:



# Read the API key from the environment so the secret is never stored in the
# notebook (and therefore never committed or shared with it). A missing
# variable raises KeyError here, before any API request is attempted.
import os

genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [4]:

# Print the name of every model that advertises the 'embedContent' method.
embedding_model_names = (
    model.name
    for model in genai.list_models()
    if 'embedContent' in model.supported_generation_methods
)
for model_name in embedding_model_names:
  print(model_name)


models/embedding-001
models/text-embedding-004


In [5]:
# Fetch the English configuration of the "stsb_multi_mt" dataset.
dataset = load_dataset("stsb_multi_mt", "en")

# Convert the three splits to pandas frames ('dev' is used as the eval split).
train_df, eval_df, test_df = (
    dataset[split_name].to_pandas() for split_name in ("train", "dev", "test")
)

# Divide the gold scores by 5.0 -- presumably rescaling a 0-5 range to 0-1 so
# they are comparable with cosine similarities (confirm against the dataset card).
for split_df in (train_df, eval_df, test_df):
  split_df['similarity_score'] = split_df['similarity_score'] / 5.0

In [6]:

def make_embed_text_fn(model):
  """Build a function that embeds one text with the given model.

  Args:
    model: Model name passed through to ``genai.embed_content``
      (e.g. ``'models/embedding-001'``).

  Returns:
    A callable ``embed_fn(text)`` that requests an embedding with
    ``task_type="SEMANTIC_SIMILARITY"`` and returns the ``"embedding"`` entry
    of the response. The callable is wrapped by
    ``retry.Retry(timeout=300.0)``; which errors trigger a retry is left to
    that library's defaults.
  """
  with_retry = retry.Retry(timeout=300.0)

  def embed_fn(text: str) -> list[float]:
    response = genai.embed_content(
        model=model,
        content=text,
        task_type="SEMANTIC_SIMILARITY",
    )
    return response["embedding"]

  return with_retry(embed_fn)

def create_embeddings(df, text_column='Text', model='models/embedding-001'):
  """Embed one text column of ``df`` and store the result in ``df['Embeddings']``.

  Args:
    df: DataFrame to embed. It is modified in place (an 'Embeddings' column
      is added or overwritten).
    text_column: Name of the column holding the texts. Defaults to 'Text';
      pass e.g. 'sentence1' for frames that use a different column name.
    model: Embedding model name forwarded to ``make_embed_text_fn``.

  Returns:
    The same ``df`` object, for convenient chaining.

  Note:
    Uses ``Series.progress_apply``, which only exists after ``tqdm.pandas()``
    has been called.
  """
  df['Embeddings'] = df[text_column].progress_apply(make_embed_text_fn(model))
  return df


In [7]:
def compute_similarity(text1, text2, model='models/embedding-001'):
    """Return the cosine similarity between the embeddings of two texts.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        model: Embedding model name forwarded to ``make_embed_text_fn``.
            Defaults to 'models/embedding-001'; pass another model name to
            compare models without redefining this function.

    Returns:
        The single entry of the 1 x 1 matrix produced by
        ``cosine_similarity`` for the two embedding vectors.
    """
    embed_fn = make_embed_text_fn(model)

    # cosine_similarity works on 2-D inputs, so each vector becomes one row.
    embedding1 = np.asarray(embed_fn(text1)).reshape(1, -1)
    embedding2 = np.asarray(embed_fn(text2)).reshape(1, -1)

    # One row against one row gives a 1 x 1 result; unwrap its only value.
    return cosine_similarity(embedding1, embedding2)[0][0]


In [8]:
# Spot check: score one anchor sentence against two other sentences.
anchor_sentence = "I had a bad day"

sim_score1 = compute_similarity(anchor_sentence, "I had so much fun")
print("Similarity score 1:", sim_score1)

sim_score2 = compute_similarity(anchor_sentence, "Everything was terrible today")
print("Similarity score 2:", sim_score2)

Similarity score 1: 0.6774482211903217
Similarity score 2: 0.831589530274747


In [9]:
# Score every sentence pair in the test split (two embedding requests per row).
def _row_similarity(row):
    return compute_similarity(row['sentence1'], row['sentence2'])

test_df['Predicted_Similarity'] = test_df.apply(_row_similarity, axis=1)

# Compare the predicted similarities with the rescaled gold scores.
mse = mean_squared_error(
    y_true=test_df['similarity_score'],
    y_pred=test_df['Predicted_Similarity'],
)

print("Mean Squared Error using -> models/embedding-001:", mse)

Mean Squared Error using -> models/embedding-001: 0.1540781486773203


# models/text-embedding-004

In [10]:

def compute_similarity(text1, text2, model='models/text-embedding-004'):
    """Return the cosine similarity between the embeddings of two texts.

    This definition replaces the earlier ``compute_similarity`` so that the
    cells below use 'models/text-embedding-004' by default.

    Args:
        text1: First text to embed.
        text2: Second text to embed.
        model: Embedding model name forwarded to ``make_embed_text_fn``.
            Defaults to 'models/text-embedding-004'.

    Returns:
        The single entry of the 1 x 1 matrix produced by
        ``cosine_similarity`` for the two embedding vectors.
    """
    embed_fn = make_embed_text_fn(model)

    # cosine_similarity works on 2-D inputs, so each vector becomes one row.
    embedding1 = np.asarray(embed_fn(text1)).reshape(1, -1)
    embedding2 = np.asarray(embed_fn(text2)).reshape(1, -1)

    # One row against one row gives a 1 x 1 result; unwrap its only value.
    return cosine_similarity(embedding1, embedding2)[0][0]

In [11]:
# Repeat the spot check with the compute_similarity currently in scope.
base_text = "I had a bad day"

sim_score1 = compute_similarity(base_text, "I had so much fun")
print("Similarity score 1:", sim_score1)

sim_score2 = compute_similarity(base_text, "Everything was terrible today")
print("Similarity score 2:", sim_score2)

Similarity score 1: 0.5714316764065699
Similarity score 2: 0.8609537144957176


In [12]:
# Score every sentence pair in the test split with the compute_similarity
# defined just above (text-embedding-004). This overwrites the
# 'Predicted_Similarity' column produced by the earlier evaluation.
test_df['Predicted_Similarity'] = test_df.apply(lambda row: compute_similarity(row['sentence1'], row['sentence2']), axis=1)

mse = mean_squared_error(test_df['similarity_score'], test_df['Predicted_Similarity'])

# The label names the model that actually produced these predictions.
print("Mean Squared Error using -> models/text-embedding-004:", mse)

Mean Squared Error using -> models/embedding-001: 0.11055572881987943
