In [1]:
#!pip install faiss-cpu

In [2]:
import pickle
import torch
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load stored traces
def load_traces(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Args:
        file_path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        The object that was stored in the pickle file.
    """
    # pickle can run arbitrary code while loading -- only point this at trusted files.
    with open(file_path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

In [4]:
# Read the pickled {trace: frequency} mapping of the selected dataset and preview five entries
data = "mip"
trace_data = load_traces(f'../semantic_data/{data}/{data}_encoded_trace_frequencies_all.pkl')
print("Loaded traces:", list(trace_data.items())[:5])

Loaded traces: [((0, 7, 5, 3, 24, 0, 8, 5, 0, 7, 5, 0, 25, 9, 7, 8, 10, 5, 5, 12, 0, 17, 5, 12, 27, 7, 5, 10, 0, 4, 5, 5, 35, 8, 1, 6, 13, 14, 5, 0, 4, 7, 3, 24, 26, 5, 33, 5, 11, 10, 5, 12, 13, 14, 13, 5, 14, 15, 16, 0, 19, 5, 20, 21, 5, 0, 30, 26, 4, 5, 27, 12, 5, 34, 24, 2, 3, 0, 26, 17, 18, 9, 7, 10, 19, 20, 5, 31, 19, 5, 20, 5, 5, 21, 23), 0.191516813160229), ((0, 17, 5, 0, 25, 5, 12, 5, 7, 26, 4, 5, 0, 25, 28, 5, 7, 10, 0, 7, 5, 4, 26, 5, 0, 10, 1, 26, 5, 3, 5, 13, 24, 14, 0, 3, 6, 35, 8, 5, 27, 26, 5, 34, 5, 0, 6, 5, 25, 4, 3, 26, 5, 13, 14, 5, 13, 14, 16, 0, 8, 17, 5, 5, 35, 6, 18, 19, 20, 22, 19, 20, 5, 31, 5, 0, 25, 5, 27, 7, 10, 5, 0, 8, 5, 12, 1, 3, 7, 5, 19, 20, 5, 31, 23), 0.191516813160229), ((0, 1, 0, 4, 5, 1, 5, 0, 3, 8, 6, 5, 5, 27, 1, 34, 5, 0, 6, 0, 1, 17, 5, 9, 10, 12, 5, 0, 4, 5, 35, 8, 12, 7, 1, 25, 28, 35, 6, 5, 13, 5, 14, 0, 5, 35, 6, 7, 13, 10, 12, 14, 5, 16, 0, 17, 12, 5, 7, 24, 5, 3, 10, 0, 1, 5, 8, 19, 35, 6, 5, 20, 21, 0, 5, 26, 5, 7, 27, 9, 10, 12, 0, 1, 

In [5]:
def convert_tensor_to_list(encoded_traces):
    """Return a new mapping whose keys are all plain tuples.

    Keys that are already tuples are kept unchanged. Any other key is
    converted with ``tuple(key.tolist())`` so that it is hashable by value.

    Args:
        encoded_traces (dict): {trace: frequency} mapping; every non-tuple
            key must provide a ``tolist()`` method.

    Returns:
        dict: {trace_tuple: frequency}
    """
    return {
        (trace_key if isinstance(trace_key, tuple) else tuple(trace_key.tolist())): freq
        for trace_key, freq in encoded_traces.items()
    }

def convert_trace_to_text(formatted_traces):
    """Turn every trace key into a space-separated string of its elements.

    Args:
        formatted_traces (dict): {trace: frequency} mapping whose keys are
            iterables (e.g. tuples of integers).

    Returns:
        dict: {trace_text: frequency}, e.g. ``(0, 7, 5)`` becomes ``"0 7 5"``.
    """
    return {
        " ".join(str(symbol) for symbol in trace): freq
        for trace, freq in formatted_traces.items()
    }

In [6]:
# Normalise the keys of the loaded trace mapping to plain tuples
formatted_traces = convert_tensor_to_list(trace_data)
#print("Converted Traces:", formatted_traces)

**Convert Encoded Traces to Text Format for BERT**

In [7]:
# Render each trace as a space-separated string so it can be passed to the tokenizer
text_traces = convert_trace_to_text(formatted_traces)
#print("Text Traces:", text_traces)

**Generate BERT Embeddings for Encoded Traces**

In [8]:
# Tokenizer and encoder come from the same pretrained checkpoint.
# truncation_side='left' asks the tokenizer to drop tokens from the start of a
# sequence (keeping its end) whenever truncation is applied.
tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-medium', truncation_side='left')
model = AutoModel.from_pretrained('prajjwal1/bert-medium')

# Run the encoder on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def generate_bert_embeddings(text_traces):
    """
    Convert each trace into an embedding using a pretrained BERT model.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    Traces are encoded one at a time; the last-layer hidden state of the
    first token ([CLS]) is kept as the embedding of the trace.

    Args:
        text_traces (dict): {trace_text: frequency} mapping.

    Returns:
        dict: {trace_text: (embedding vector, frequency)}, where the embedding
        is a 1-D NumPy array.
    """
    model.eval()  # Set BERT to evaluation mode
    trace_embeddings = {}

    # Give the tokenizer an explicit token limit. With truncation=True alone it
    # falls back to tokenizer.model_max_length, which is a huge sentinel value
    # when the checkpoint does not declare a limit; an over-long trace would
    # then stay untruncated and overflow the model's position embeddings.
    max_tokens = min(tokenizer.model_max_length, model.config.max_position_embeddings)

    with torch.no_grad():
        for trace_text, freq in text_traces.items():
            # Tokenize the trace sequence (a single sequence, so padding adds nothing)
            inputs = tokenizer(
                trace_text,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_tokens,
            )
            inputs = {k: v.to(device) for k, v in inputs.items()}  # Move tensors to the model's device

            # Get BERT embeddings
            outputs = model(**inputs)
            # Batch item 0, token 0 ([CLS]) -> 1-D vector of hidden-size length
            cls_embedding = outputs.last_hidden_state[0, 0, :].cpu().numpy()

            # Store embedding with frequency
            trace_embeddings[trace_text] = (cls_embedding, freq)

    return trace_embeddings


In [18]:
# Generate one [CLS] embedding per text trace: {trace_text: (embedding, frequency)}
bert_embeddings = generate_bert_embeddings(text_traces)
#print("Example Embedding:", list(bert_embeddings.items())[:1])  # Check format

**Store Embeddings for Future Use**

In [10]:
def save_embeddings(file_path, embeddings):
    """Pickle ``embeddings`` into the file at ``file_path``.

    Args:
        file_path: Destination path; opened in binary write mode, so an
            existing file is overwritten.
        embeddings: Any picklable object to store.
    """
    with open(file_path, mode='wb') as out_file:
        pickle.dump(embeddings, file=out_file)

In [11]:
# Persist the {trace_text: (embedding, frequency)} mapping next to the dataset
save_embeddings(f'../semantic_data/{data}/trace_embeddings_vd.pkl', bert_embeddings)
print("Embeddings saved successfully!")

Embeddings saved successfully!


In [12]:
###############################################################
############# Trace embedding complete ########################
###############################################################

**Step 2: Storing BERT Embeddings in FAISS for Fast Retrieval**

In [13]:
import pickle

# Load saved embeddings
def load_embeddings(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Args:
        file_path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        The object that was stored in the pickle file.
    """
    # pickle can run arbitrary code while loading -- only point this at trusted files.
    with open(file_path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

# Read the stored embeddings back from disk and report how many there are
bert_embeddings = load_embeddings(f'../semantic_data/{data}/trace_embeddings_vd.pkl')
print("Loaded", len(bert_embeddings), "trace embeddings")

Loaded 1000 trace embeddings


In [14]:
import faiss
import numpy as np

def create_faiss_index(embedding_dim):
    """
    Build an empty flat FAISS index that searches by L2 distance.

    Args:
        embedding_dim (int): Length of the vectors the index will hold
            (512 for the embeddings produced in this notebook).

    Returns:
        faiss.IndexFlatL2: Newly created index.
    """
    return faiss.IndexFlatL2(embedding_dim)

# Initialize FAISS index
# The dimension is read from the first stored entry: each value is an
# (embedding, frequency) pair, so element [0] is the embedding vector.
embedding_dim = len(next(iter(bert_embeddings.values()))[0])  # Get embedding size
faiss_index = create_faiss_index(embedding_dim)
print("FAISS Index Created (Dimension:", embedding_dim, ")")

FAISS Index Created (Dimension: 512 )


In [15]:
def insert_embeddings_into_faiss(embeddings, index):
    """
    Inserts embeddings into the FAISS index.

    All vectors are added in a single call, in the iteration order of
    ``embeddings``, so position ``i`` of the returned list describes the
    ``i``-th vector added by this call.

    Args:
        embeddings (dict): {trace_text: (embedding_vector, frequency)}
        index (faiss.Index): FAISS index to store embeddings.

    Returns:
        list: ``(trace_text, frequency)`` tuples, one per inserted vector.
        Empty when ``embeddings`` is empty, in which case the index is
        left untouched.
    """
    if not embeddings:
        # An empty input would yield a 1-D empty array, which is not a valid
        # (n, dim) batch for index.add -- there is simply nothing to insert.
        return []

    # Keep the original text and frequency for every row handed to the index
    trace_mapping = [(trace_text, freq) for trace_text, (_, freq) in embeddings.items()]

    # Stack the vectors into one float32 matrix of shape (n, dim)
    vectors = np.asarray(
        [embedding for embedding, _ in embeddings.values()], dtype='float32'
    )
    index.add(vectors)  # Insert all embeddings into FAISS

    return trace_mapping

# Insert embeddings into FAISS
# trace_mapping[i] holds the (trace_text, frequency) of the i-th inserted vector
trace_mapping = insert_embeddings_into_faiss(bert_embeddings, faiss_index)
print("Inserted", len(trace_mapping), "traces into FAISS")

Inserted 1000 traces into FAISS


In [16]:
# Persist the position -> (trace_text, frequency) mapping alongside the dataset
save_embeddings(f'../semantic_data/{data}/trace_mapping_vd.pkl', trace_mapping)
print("Mapping saved successfully!")

Mapping saved successfully!


In [17]:
def save_faiss_index(index, file_path):
    """
    Saves a FAISS index to a file.

    Args:
        index (faiss.Index): The FAISS index to save.
        file_path (str | os.PathLike): Path to save the index. Path-like
            objects such as ``pathlib.Path`` are converted to their string
            form, because the FAISS writer expects a plain string; any
            other argument type is passed through unchanged.
    """
    import os  # local import: only needed for the path normalisation below

    if isinstance(file_path, os.PathLike):
        file_path = os.fspath(file_path)
    faiss.write_index(index, file_path)

# Write the populated index next to the dataset files
save_faiss_index(faiss_index, f'../semantic_data/{data}/trace_faiss.index')
print("FAISS index saved successfully!")

FAISS index saved successfully!


**Zero padding encoding**

In [1]:
import pickle
import torch
import faiss
import numpy as np

In [3]:
# Load stored traces
def load_traces(file_path):
    """Return the object unpickled from the file at ``file_path``.

    Args:
        file_path: Location of the pickle file; it is opened in binary read mode.

    Returns:
        The object that was stored in the pickle file.
    """
    # pickle can run arbitrary code while loading -- only point this at trusted files.
    with open(file_path, mode='rb') as pickle_file:
        return pickle.load(pickle_file)

In [5]:
# Load the {trace: frequency} mapping used to build the zero-padded index
data = "mip"
trace_frequencies = load_traces('../semantic_data/' + data + '/' + data + '_encoded_trace_frequencies_all.pkl')
# Preview the mapping loaded by this cell; `trace_data` is never defined in
# this section, so printing it would fail on a fresh kernel.
print("Loaded traces:", list(trace_frequencies.items())[:5])

Loaded traces: [((0, 7, 5, 3, 24, 0, 8, 5, 0, 7, 5, 0, 25, 9, 7, 8, 10, 5, 5, 12, 0, 17, 5, 12, 27, 7, 5, 10, 0, 4, 5, 5, 35, 8, 1, 6, 13, 14, 5, 0, 4, 7, 3, 24, 26, 5, 33, 5, 11, 10, 5, 12, 13, 14, 13, 5, 14, 15, 16, 0, 19, 5, 20, 21, 5, 0, 30, 26, 4, 5, 27, 12, 5, 34, 24, 2, 3, 0, 26, 17, 18, 9, 7, 10, 19, 20, 5, 31, 19, 5, 20, 5, 5, 21, 23), 0.191516813160229), ((0, 17, 5, 0, 25, 5, 12, 5, 7, 26, 4, 5, 0, 25, 28, 5, 7, 10, 0, 7, 5, 4, 26, 5, 0, 10, 1, 26, 5, 3, 5, 13, 24, 14, 0, 3, 6, 35, 8, 5, 27, 26, 5, 34, 5, 0, 6, 5, 25, 4, 3, 26, 5, 13, 14, 5, 13, 14, 16, 0, 8, 17, 5, 5, 35, 6, 18, 19, 20, 22, 19, 20, 5, 31, 5, 0, 25, 5, 27, 7, 10, 5, 0, 8, 5, 12, 1, 3, 7, 5, 19, 20, 5, 31, 23), 0.191516813160229), ((0, 1, 0, 4, 5, 1, 5, 0, 3, 8, 6, 5, 5, 27, 1, 34, 5, 0, 6, 0, 1, 17, 5, 9, 10, 12, 5, 0, 4, 5, 35, 8, 12, 7, 1, 25, 28, 35, 6, 5, 13, 5, 14, 0, 5, 35, 6, 7, 13, 10, 12, 14, 5, 16, 0, 17, 12, 5, 7, 24, 5, 3, 10, 0, 1, 5, 8, 19, 35, 6, 5, 20, 21, 0, 5, 26, 5, 7, 27, 9, 10, 12, 0, 1, 

In [6]:
# Length of the longest trace; every trace is zero-padded to this size
max_length = max(map(len, trace_frequencies))

In [7]:
def encode_trace(trace, max_length):
    """Right-pad ``trace`` with zeros into a float32 vector of length ``max_length``.

    Args:
        trace: Sequence of numeric values (e.g. a tuple of integers).
        max_length (int): Length of the returned vector.

    Returns:
        numpy.ndarray: float32 array whose leading entries are the trace
        values and whose remaining entries are zero.
    """
    padded = np.zeros(max_length, dtype=np.float32)
    trace_length = len(trace)
    # Only the leading slots receive trace values; the tail stays zero
    padded[:trace_length] = trace
    return padded

In [8]:
# Create FAISS index
dimension = max_length
index = faiss.IndexFlatL2(dimension)  # L2 (Euclidean) distance index

# Row i of the index corresponds to trace_list[i] and trace_freq_list[i]
trace_list = list(trace_frequencies.keys())  # Stores original traces
trace_freq_list = list(trace_frequencies.values())  # Stores corresponding frequencies

# Add all zero-padded traces to the index in one batched call instead of one
# call per trace; the row order matches trace_list.
if trace_list:
    index.add(np.stack([encode_trace(trace, max_length) for trace in trace_list]))

# Save FAISS index and metadata
faiss.write_index(index, '../semantic_data/' + data + '/faiss_0pad_index.bin')
with open('../semantic_data/' + data + '/trace_metadata.pkl', "wb") as f:
    pickle.dump({"trace_list": trace_list, "trace_freq_list": trace_freq_list, "max_length": max_length}, f)

print("FAISS index and metadata saved successfully!")

FAISS index and metadata saved successfully!
