# Embedding analysis

In [109]:
# --------------------------------------------------------------------
# 1. Import necessary libraries
# --------------------------------------------------------------------
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
from sentence_transformers import SentenceTransformer, util
from tqdm import tqdm

In [110]:
# --------------------------------------------------------------------
# 2. Load data
# --------------------------------------------------------------------
# Both CSVs are read with the 'utf-8-sig' codec so a leading byte-order
# mark (if present) is not glued onto the first column name.
data_long, prototypes_df = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (
        '../Data/data_long.csv',
        '../Data/prototypes_llama3.3.csv',
    )
)

In [111]:
# --------------------------------------------------------------------
# 3. Compute statistics on transcription_new
# --------------------------------------------------------------------
# Length of each transcription in whitespace-delimited words.
# Missing transcriptions count as 0 words: str(NaN) is the literal "nan",
# which would otherwise be tallied as a one-word answer and skew the stats.
data_long['transcription_length'] = [
    0 if pd.isna(text) else len(str(text).split())
    for text in data_long['transcription_new']
]

length_stats = data_long['transcription_length']

print("Transcription length statistics:")
print(length_stats.describe())
print("\nTranscription length quantiles:")
print(length_stats.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99]))

Transcription length statistics:
count    1020.000000
mean       41.275490
std        37.342953
min         1.000000
25%        18.000000
50%        33.000000
75%        55.000000
max       368.000000
Name: transcription_length, dtype: float64

Transcription length quantiles:
0.25     18.00
0.50     33.00
0.75     55.00
0.90     81.00
0.95    105.00
0.99    185.86
Name: transcription_length, dtype: float64


In [112]:
# --------------------------------------------------------------------
# 4. Load embedding model
# --------------------------------------------------------------------
# Single SentenceTransformer instance reused by every embedding cell below.
model = SentenceTransformer(
    "nomic-ai/nomic-embed-text-v1.5",
    # NOTE(review): this flag lets code shipped in the model repository run
    # locally — confirm the source is trusted (and consider pinning a revision).
    trust_remote_code=True,
)

You try to use a model that was created with version 2.4.0.dev0, however, your version is 2.4.0. This might cause unexpected behavior or errors. In that case, try to update to the latest version.



<All keys matched successfully>


In [113]:
# --------------------------------------------------------------------
# 5. Compute Prototype Embeddings (No Merging)
# --------------------------------------------------------------------
# Builds one unit-length prototype vector per (question, function) pair:
# the numbered prototype sentences of the pair are embedded individually,
# averaged, and L2-normalised. Pairs with no usable prototype text are
# skipped (and reported) instead of storing a NaN vector; the similarity
# cell looks prototypes up with .get(), so a missing key is tolerated.
prototype_embeddings = {}

# sort=False keeps the pairs in order of first appearance in the CSV.
for (question_id, func), group in prototypes_df.groupby(['question', 'function'], sort=False):
    # Only the first row of a pair is used, as before.
    raw_prototypes = group['prototypes'].iloc[0]
    if pd.isna(raw_prototypes):
        print(f"Skipping ({question_id}, {func}): prototype text is missing.")
        continue

    # Each line is expected to look like "1. <sentence>"; keep the text after
    # the first ". " and drop lines that do not contain that separator.
    sentences = [
        line.split(". ", 1)[1]
        for line in str(raw_prototypes).split("\n")
        if ". " in line
    ]
    if not sentences:
        # Averaging an empty batch would yield a NaN prototype that silently
        # turns every downstream similarity into NaN.
        print(f"Skipping ({question_id}, {func}): no numbered prototype sentences found.")
        continue

    # Same "clustering: " task prefix that is applied to the transcription windows.
    sentences = ["clustering: " + s for s in sentences]

    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Store mean embedding directly without merging
    prototype_embeddings[(question_id, func)] = F.normalize(embeddings.mean(dim=0), p=2, dim=0)

print("Prototype embeddings computed and normalized (no merging).")

Prototype embeddings computed and normalized (no merging).


In [114]:
# --------------------------------------------------------------------
# Sliding Window Embedding of Transcription_new
# --------------------------------------------------------------------
# Every transcription is cut into overlapping word windows. Each window is
# embedded and compared (cosine similarity) with the prototype embedding of
# each function for the row's question. One record is produced per
# (row, window, function); it carries all original row columns plus
# start_word, end_word (exclusive), function and similarity.
window_size = 5  # number of words
stride = 3       # words between consecutive window starts
function_labels = ['control', 'generation', 'justification', 'regulation']

results = []

for idx, row in tqdm(data_long.iterrows(), total=len(data_long), desc="Embedding sliding windows"):
    raw_text = row['transcription_new']
    # A missing transcription would stringify to "nan" and be embedded as if
    # it were a real word; treat it (and blank text) as having no windows.
    words = [] if pd.isna(raw_text) else str(raw_text).split()
    if not words:
        continue

    # Prototypes that exist for this row's question, in fixed function order.
    available = [
        (func, prototype_embeddings[(row['question'], func)])
        for func in function_labels
        if prototype_embeddings.get((row['question'], func)) is not None
    ]
    if not available:
        # Nothing to compare against, so skip the (expensive) encoding.
        continue

    n_words = len(words)
    if n_words <= window_size:
        starts = [0]
    else:
        # NOTE: when (n_words - window_size) is not a multiple of stride, the
        # last few words are not covered by any window.
        starts = list(range(0, n_words - window_size + 1, stride))
    windows = [" ".join(words[start:start + window_size]) for start in starts]

    # Encode all windows of this transcription in one batch instead of one
    # model call per window. Values should match per-window encoding up to
    # floating-point rounding — TODO confirm if exact reproducibility matters.
    window_embs = model.encode(
        ["clustering: " + window_text for window_text in windows],
        convert_to_tensor=True,
    )
    proto_matrix = torch.stack([proto for _func, proto in available]).to(window_embs.device)

    # Shape (n_windows, n_available_functions), converted to plain floats.
    similarity_rows = util.cos_sim(window_embs, proto_matrix).tolist()

    # The row's columns are identical for every record; convert once.
    base_record = row.to_dict()

    for start, window_sims in zip(starts, similarity_rows):
        # Clamp so a transcription shorter than the window does not report an
        # end position beyond its last word.
        end = min(start + window_size, n_words)

        for (func, _proto), similarity in zip(available, window_sims):
            results.append({
                **base_record,
                'start_word': start,
                'end_word': end,
                'function': func,
                'similarity': similarity,
            })

print("Sliding window embedding analysis complete.")

# Convert to DataFrame
results_df = pd.DataFrame(results)

Embedding sliding windows: 100%|██████████| 1020/1020 [04:25<00:00,  3.85it/s]

Sliding window embedding analysis complete.





In [115]:
# --------------------------------------------------------------------
# Save Results
# --------------------------------------------------------------------
# Materialise the per-window similarity records and write them to disk.
# The row index is omitted; 'utf-8-sig' matches the encoding used when the
# input CSVs were read.
similarity_df = pd.DataFrame(results)
similarity_df.to_csv(
    '../Data/embedding_flow_similarity.csv',
    encoding='utf-8-sig',
    index=False,
)
print("Results saved to ../Data/embedding_flow_similarity.csv")

Results saved to ../Data/embedding_flow_similarity.csv
