In [None]:
# Import python packages
import streamlit as st
import pandas as pd

# Snowpark session of the active notebook; the cells below use it to read the
# source table and to write the output table.
from snowflake.snowpark.context import get_active_session
session = get_active_session()


In [None]:
# Install sentence-transformers at runtime; it provides the SentenceTransformer class imported below.
# NOTE(review): the version is not pinned, so separate runs may pick up different releases.
! pip install sentence-transformers

**import libraries**

In [None]:
from sentence_transformers import SentenceTransformer
from typing import Any, Dict, List, Optional
import pandas as pd
import re
import sys
from tqdm import tqdm
import torch

In [None]:
# Show the name of the CUDA device with index 0, as reported by torch
first_gpu_name = torch.cuda.get_device_name(0)
print(first_gpu_name)

**extract call parameters**

In [None]:
# Positional call parameters, in order:
#   0: source table, 1: output table, 2: id column,
#   3: comma-separated list of columns to clean and embed, 4: embedding model name
# NOTE(review): indexing starts at 0, i.e. this assumes the runtime places the first
# call argument in sys.argv[0] — confirm for the execution environment.
params = sys.argv

# Print first so the received arguments are visible even when validation fails below
print("sys.argv:", sys.argv)

if len(params) < 5:
    raise ValueError(
        "Expected 5 call parameters (source_table, output_table, id_column, "
        f"columns_to_clean, embedding_model) but received {len(params)}: {params}"
    )

source_table = params[0]
output_table = params[1]
id_column = params[2]
# Tolerate spaces around the commas ("A, B") and ignore empty entries ("A,,B")
columns_to_clean = [name.strip() for name in params[3].split(",") if name.strip()]
embedding_model = params[4]

if not columns_to_clean:
    raise ValueError("columns_to_clean must name at least one column")



**import the data**

In [None]:
  # Example table
# Keep only the id column plus the columns that will be cleaned and embedded
columns = [id_column] + columns_to_clean

# initial_df holds the full source table; it is joined back with the embeddings later
initial_df = session.table(source_table).to_pandas()
df_to_embedd = initial_df[columns].copy()

# Preview a few rows only instead of rendering the whole table
df_to_embedd.head()

**Clean the columns to embed**

In [None]:
#define function to clean text before embedding

def clean_columns_to_embedd(text_to_clean: Any, start_text: str) -> str:
    """
    Normalise one cell value for embedding and prepend a prefix to it.

    Cleaning steps, in order: remove square brackets and quotes, collapse every
    run of whitespace (including newlines and tabs) into one space, replace "|"
    with ",", lower-case, drop every character outside ``a-z 0-9 space . , ? !``,
    collapse repeated spaces, remove the space before a comma, and strip.

    Args:
        text_to_clean: The value to clean. Non-string values are converted with
            ``str()`` before cleaning.
        start_text (str): The prefix to add to the cleaned text.

    Returns:
        str: Cleaned and formatted text in the format "{start_text}: {cleaned_text}."
             Returns an empty string if the input is missing (None, NaN, pd.NA)
             or if nothing is left after cleaning.
    """
    # Missing values read through pandas are usually NaN / pd.NA rather than None.
    # Without this check str(NaN) would be embedded as the literal text "nan".
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if text_to_clean is None or (
        pd.api.types.is_scalar(text_to_clean) and pd.isna(text_to_clean)
    ):
        return ""

    text = str(text_to_clean)

    # Drop list/quote punctuation such as ['a', "b"]
    text = re.sub(r"[\[\]'\"]", "", text)

    # Turn newlines/tabs into spaces first; otherwise the character filter below
    # deletes them and glues the neighbouring words together.
    text = re.sub(r"\s+", " ", text)

    text = text.replace("|", ",").lower()

    # Keep only lower-case letters, digits, spaces and basic punctuation
    text = re.sub(r"[^a-z0-9 .,?!]+", "", text)

    # Removing characters can leave double spaces and " ," sequences behind
    text = re.sub(r" +", " ", text)
    text = re.sub(r" ,", ",", text)
    text = text.strip()

    # Nothing meaningful left (empty or symbols only): do not emit a bare prefix
    if not text:
        return ""

    return f"{start_text}: {text}."


def clean_columns_to_embedding(
    columns_to_clean: List[str], df: pd.DataFrame
) -> pd.DataFrame:
    """
    Return a copy of ``df`` whose ``columns_to_clean`` are cleaned for embedding.

    Each listed column is passed value by value through
    ``clean_columns_to_embedd`` with the prefix " recipe {column name in lower case}".

    Args:
        columns_to_clean (List[str]): names of the columns to clean.
        df (pd.DataFrame): dataframe containing those columns. It is not modified.

    Returns:
        pd.DataFrame: a new dataframe with the same index and columns as ``df``,
        in which the listed columns hold the cleaned, prefixed text.
    """
    # Work on a copy: mutating the caller's frame would make a second run of the
    # cleaning cell clean already-cleaned (and already-prefixed) text again.
    cleaned = df.copy()

    for col in columns_to_clean:
        start_text = f" recipe {col.lower()}"
        cleaned[col] = cleaned[col].apply(
            lambda text: clean_columns_to_embedd(text, start_text)
        )

    return cleaned

In [None]:
# clean the dataframe and concatenate the cleaned columns into one text per row

# Clean a copy so df_to_embedd keeps the raw text and this cell can be re-run
# without cleaning the already-cleaned text a second time.
cleaned_df = clean_columns_to_embedding(columns_to_clean, df_to_embedd.copy())
cleaned_df["TEXT_TO_EMBEDD"] = cleaned_df[columns_to_clean].agg(" ".join, axis=1)

# Preview a few rows only instead of rendering the whole table
cleaned_df.head()

**Embed the TEXT_TO_EMBEDD column**

In [None]:
# Load the embedding model named by the call parameter onto the GPU.
# NOTE(review): trust_remote_code=True presumably allows code shipped with the model
# repository to run — confirm the model source is trusted.
model = SentenceTransformer(embedding_model, device="cuda", trust_remote_code=True)

In [None]:
# define function to compute embeddings
def compute_embedding_columns(
    df: pd.DataFrame,
    embedding_model: SentenceTransformer,
    name_embedding_column_input: str,
    name_embedding_column_output: str = "EMBEDDING",
    batch_size: int = 128,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an added column of embeddings, computed batch by batch.

    The texts of ``name_embedding_column_input`` are sent to
    ``embedding_model.encode`` in slices of ``batch_size`` rows, with
    ``normalize_embeddings=True`` and ``convert_to_numpy=True``. Every embedding
    is stored as a plain Python list.

    Args:
        df (pd.DataFrame): dataframe containing the data. It is not modified.
        embedding_model (SentenceTransformer): model used to compute the embeddings
        name_embedding_column_input (str): column used as input text
        name_embedding_column_output (str): column to store embeddings
        batch_size (int): batch size for embedding computation, must be >= 1

    Returns:
        pd.DataFrame: new dataframe with the embedding column added

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    # A zero step makes range() fail with an unrelated message and a negative one
    # silently yields no batches, so reject both up front.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    texts = df[name_embedding_column_input].tolist()
    all_embeddings = []

    # Use tqdm to show progress over the batches
    for start_idx in tqdm(range(0, len(texts), batch_size), desc="Computing embeddings"):
        batch_texts = texts[start_idx : start_idx + batch_size]

        batch_embeddings = embedding_model.encode(
            batch_texts,
            batch_size=batch_size,
            show_progress_bar=False,  # tqdm will show progress instead
            normalize_embeddings=True,
            convert_to_numpy=True,
        )

        # Convert embeddings to lists for Pandas/Snowflake
        all_embeddings.extend(emb.tolist() for emb in batch_embeddings)

    # Add the column to a copy so the caller's dataframe is left untouched
    result = df.copy()
    result[name_embedding_column_output] = all_embeddings

    return result

In [None]:
# compute the embeddings of the concatenated text column
cleaned_df_with_embedding = compute_embedding_columns(
    df=cleaned_df,
    embedding_model=model,
    name_embedding_column_input="TEXT_TO_EMBEDD",
    name_embedding_column_output="EMBEDDING",
)

**join with initial_df**

In [None]:
# attach each row's embedding to the full source rows

# id / embedding pairs, kept as their own frame
embedding_df = cleaned_df_with_embedding[[id_column, "EMBEDDING"]].copy()

# The embeddings were computed row by row from a column subset of initial_df, so
# they carry the same index. Aligning on that index attaches exactly one embedding
# per source row; merging on the id column instead would multiply the rows
# whenever an id value occurs more than once.
final_df = initial_df.join(embedding_df[["EMBEDDING"]])


**convert embedding to vector format**

In [None]:
# Convert embeddings to list format for Snowflake VECTOR type
final_df["EMBEDDING"] = final_df["EMBEDDING"].apply(
    lambda x: x.tolist() if hasattr(x, "tolist") else x
)

# The VECTOR type needs a fixed dimension, which is read from the first row;
# fail with a clear message when there is no row to read it from.
if final_df.empty:
    raise ValueError(
        f"No rows to write to {output_table}: the embedding dimension cannot be determined"
    )
embedding_dim = len(final_df["EMBEDDING"].iloc[0])

# Double-quote every pass-through column (escaping embedded quotes) so names that
# are case-sensitive or contain special characters still resolve in the SELECT.
# NOTE(review): assumes the temp table keeps the pandas column names verbatim — confirm.
passthrough_columns = ", ".join(
    '"' + str(col).replace('"', '""') + '"'
    for col in final_df.columns
    if col != "EMBEDDING"
)

# Write to a temporary table first, then create the final table with correct types
temp_table = f"{output_table}_TEMP"
try:
    session.create_dataframe(final_df).write.mode("overwrite").save_as_table(
        temp_table
    )

    session.sql(
        f"""
        CREATE OR REPLACE TABLE {output_table} AS
        SELECT
            {passthrough_columns},
            EMBEDDING::VECTOR(FLOAT, {embedding_dim}) as EMBEDDING
        FROM {temp_table}
        """
    ).collect()
finally:
    # Drop the temp table even when one of the steps above failed; IF EXISTS keeps
    # the cleanup from hiding the original error if the table was never created.
    session.sql(f"DROP TABLE IF EXISTS {temp_table}").collect()