In [2]:
import pandas as pd

# Sample dataset
data = {
    "text": [
        "How is the weather today?",
        "¿Qué tiempo hace hoy?",
        "What's the weather like?",
        "The weather is nice today.",
        "¿Cómo está el clima hoy?",
        "It's raining today."
    ]
}

df = pd.DataFrame(data)

In [3]:
import re

# Function to clean text
def clean_text(text):
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-zÀ-ÿ\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    return text

# Apply the cleaning function
df['cleaned_text'] = df['text'].apply(clean_text)
print(df[['text', 'cleaned_text']])

                         text               cleaned_text
0   How is the weather today?   how is the weather today
1       ¿Qué tiempo hace hoy?        qué tiempo hace hoy
2    What's the weather like?     whats the weather like
3  The weather is nice today.  the weather is nice today
4    ¿Cómo está el clima hoy?     cómo está el clima hoy
5         It's raining today.          its raining today


In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('jinaai/jina-embeddings-v2-base-es', trust_remote_code=True)
model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-es', trust_remote_code=True)

# Tokenize and encode the cleaned text
inputs = tokenizer(df['cleaned_text'].tolist(), return_tensors="pt", padding=True, truncation=True)

# Obtain embeddings
with torch.no_grad():
    outputs = model(**inputs)
    # Taking the mean of token embeddings to get sentence-level embeddings
    df['embeddings'] = outputs.last_hidden_state.mean(dim=1).numpy().tolist()

print(df[['cleaned_text', 'embeddings']])

                cleaned_text  \
0   how is the weather today   
1        qué tiempo hace hoy   
2     whats the weather like   
3  the weather is nice today   
4     cómo está el clima hoy   
5          its raining today   

                                          embeddings  
0  [0.11146073043346405, -0.11799396574497223, 0....  
1  [0.04967658966779709, -0.15929220616817474, 0....  
2  [0.043915051966905594, 0.037842195481061935, -...  
3  [0.09060945361852646, -0.19453072547912598, 0....  
4  [0.11705266684293747, -0.09279539436101913, 0....  
5  [0.07673726230859756, -0.1369733363389969, 0.3...  


In [5]:
from numpy.linalg import norm
import numpy as np

# Cosine similarity function
def cos_sim(a, b):
    return (np.dot(a, b)) / (norm(a) * norm(b))

# Calculate similarity between the first two samples
similarity = cos_sim(np.array(df['embeddings'][0]), np.array(df['embeddings'][1]))
print(f"Cosine Similarity between first two samples: {similarity:.4f}")

Cosine Similarity between first two samples: 0.8673
