In [1]:
import pandas as pd

In [2]:
# Load the three graph tables (nodes, edges, communities) from the checkpoint
# folder; the tuple order below matches the order of the unpacked names.
node_df, edge_df, community_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'checkpoint/nodes_info.csv',
        'checkpoint/edge_info.csv',
        'checkpoint/community_info.csv',
    )
)

In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import os

def encode_and_store_embeddings(
    df: pd.DataFrame,
    target_column: str = "content",
    save_folder: str = "checkpoint",
    model_name: str = "stella_en_400M_v5",
    prompt_name: str = None,  # Currently unused for documents
    embedding_output_path: str = "faiss_index.index",
    device: str = "cuda"
) -> np.ndarray:
    """
    Encode one text column of a DataFrame with a SentenceTransformer model and
    persist the resulting vectors as a flat L2 FAISS index on disk.

    Vectors are added to the index in DataFrame row order. Missing values in the
    target column are encoded as empty strings (rather than dropped) so that the
    number and order of vectors always matches the rows of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame containing text data. Must have at least one row.
    target_column : str, optional
        The column in df that contains the text to be embedded.
        Default is "content".
    save_folder : str, optional
        Folder path where the FAISS index will be saved; created if missing.
        Default is "checkpoint".
    model_name : str, optional
        Name or local path of the SentenceTransformer model to load.
        Default is "stella_en_400M_v5".
    prompt_name : str, optional
        Accepted for interface compatibility; it is not passed to the encoder,
        so it currently has no effect.
    embedding_output_path : str, optional
        File name for the FAISS index, joined onto ``save_folder``.
        Default is "faiss_index.index".
    device : str, optional
        Device to run the model on ("cuda" or "cpu").
        Default is "cuda".

    Returns
    -------
    np.ndarray
        float32 array of shape (num_documents, embedding_dimension) containing
        the embeddings that were written to the index.

    Raises
    ------
    ValueError
        If ``target_column`` is not a column of ``df``, or if ``df`` has no rows.
    """

    # 1) Validate the input up front, before creating folders or loading the
    #    model, so bad input fails fast and leaves nothing behind on disk.
    if target_column not in df.columns:
        raise ValueError(f"The DataFrame must contain the '{target_column}' column.")
    if len(df) == 0:
        raise ValueError(
            f"The DataFrame has no rows; nothing to encode from column '{target_column}'."
        )

    # 2) Retrieve texts from the target column. CSV round-trips turn empty cells
    #    into float NaN, which the tokenizer cannot handle; map those to "" and
    #    coerce everything else to str while keeping one entry per row.
    docs = ["" if pd.isna(value) else str(value) for value in df[target_column]]

    # 3) Ensure the save folder exists
    os.makedirs(save_folder, exist_ok=True)

    # 4) Initialize the model directly on the requested device instead of
    #    loading it on the library's default device and moving it afterwards.
    print(f"Loading model '{model_name}' onto device '{device}'...")
    model = SentenceTransformer(model_name, device=device, trust_remote_code=True)

    # 5) Encode the texts
    print(f"Encoding {len(docs)} documents from column '{target_column}'...")
    raw_embeddings = model.encode(
        docs,
        show_progress_bar=True,
        device=device,
        convert_to_numpy=True
    )
    # FAISS expects a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(raw_embeddings, dtype=np.float32)

    # 6) Create a FAISS index and add the embeddings
    embedding_dim = embeddings.shape[1]
    print(f"Embeddings shape: {embeddings.shape} (dim={embedding_dim})")

    index = faiss.IndexFlatL2(embedding_dim)  # L2 distance index
    index.add(embeddings)
    print(f"FAISS index now contains {index.ntotal} embeddings.")

    # 7) Save the FAISS index to disk
    index_path = os.path.join(save_folder, embedding_output_path)
    faiss.write_index(index, index_path)
    print(f"Saved FAISS index to '{index_path}'.")

    # 8) Return the embeddings
    return embeddings

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Embed the community table and write its FAISS index under the default
# save folder as "faiss_graph_index.index".
community_embed = encode_and_store_embeddings(
    df=community_df,
    embedding_output_path="faiss_graph_index.index",
)

Loading model 'stella_en_400M_v5' onto device 'cuda'...


A matching Triton is not available, some optimizations will not be enabled
Traceback (most recent call last):
  File "F:\anaconda\envs\CRA_LLM\lib\site-packages\xformers\__init__.py", line 57, in _is_triton_available
    import triton  # noqa
ModuleNotFoundError: No module named 'triton'
Some weights of the model checkpoint at stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Encoding 56 documents from column 'content'...


Batches: 100%|███████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  2.29it/s]

Embeddings shape: (56, 8192) (dim=8192)
FAISS index now contains 56 embeddings.
Saved FAISS index to 'checkpoint\faiss_graph_index.index'.





In [5]:
# Embed the node table and write its FAISS index under the default
# save folder as "faiss_node_index.index".
node_embed = encode_and_store_embeddings(
    df=node_df,
    embedding_output_path="faiss_node_index.index",
)

Some weights of the model checkpoint at stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading model 'stella_en_400M_v5' onto device 'cuda'...
Encoding 324 documents from column 'content'...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 24.73it/s]


Embeddings shape: (324, 8192) (dim=8192)
FAISS index now contains 324 embeddings.
Saved FAISS index to 'checkpoint\faiss_node_index.index'.


In [6]:
# Embed the edge table and write its FAISS index under the default
# save folder as "faiss_edge_index.index".
edge_embed = encode_and_store_embeddings(
    df=edge_df,
    embedding_output_path="faiss_edge_index.index",
)

Some weights of the model checkpoint at stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading model 'stella_en_400M_v5' onto device 'cuda'...
Encoding 917 documents from column 'content'...


Batches: 100%|█████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 34.56it/s]

Embeddings shape: (917, 8192) (dim=8192)
FAISS index now contains 917 embeddings.
Saved FAISS index to 'checkpoint\faiss_edge_index.index'.



