In [1]:
import sentence_transformers
from sentence_transformers import SentenceTransformer


In [2]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def compute_sentence_similarity(sentence1, sentence2):
    """
    Encodes two sentences using a pre-trained SBERT model and calculates 
    their cosine similarity.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: The cosine similarity score between the two sentences.
    """
    # 1. Load the pre-trained SentenceTransformer model
    # 'all-MiniLM-L6-v2' is a fast and efficient model
    try:
        model = SentenceTransformer('all-MiniLM-L6-v2')
    except Exception as e:
        print(f"Error loading model: {e}")
        return None

    # 2. Encode the sentences to get their embeddings (vector representations)
    embeddings = model.encode([sentence1, sentence2])
    #print(f"emdeddings:{embeddings}\n")
    
    embedding_1 = embeddings[0].reshape(1, -1) # Reshape for sklearn
    embedding_2 = embeddings[1].reshape(1, -1)


    #print(f"embedding_1:{embedding_1}\n")
    #print(f"embedding_2:{embedding_2}") 
    
    # 3. Compute the cosine similarity
    # The result is an array, we take the first (and only) element
    similarity_score = cosine_similarity(embedding_1, embedding_2)[0][0]

    return similarity_score



In [16]:
# --- Example Usage ---
sentence_a = "The cat sat on the mat."
sentence_b = "A feline rested on the rug."
sentence_c = "I like to eat ice cream."

similarity_ab = compute_sentence_similarity(sentence_a, sentence_b)
similarity_ac = compute_sentence_similarity(sentence_a, sentence_c)
similarity_bc = compute_sentence_similarity(sentence_b, sentence_c)

print(f"Sentence A: '{sentence_a}'")
print(f"Sentence B: '{sentence_b}'")
print(f"Sentence C: '{sentence_c}'")
print("-" * 30)

if similarity_ab is not None:
    # Scores closer to 1 indicate higher similarity
    print(f"Similarity (A vs. B): {similarity_ab:.4f}")
    print(f"Similarity (A vs. C): {similarity_ac:.4f}")
    print(f"Similarity (B vs. C): {similarity_ac:.4f}")

    if similarity_ab > similarity_ac:
        print("\nAs expected, sentences A and B are more similar in meaning.")
    else:
        print("\nSomething unexpected happened.")


Sentence A: 'The cat sat on the mat.'
Sentence B: 'A feline rested on the rug.'
Sentence C: 'I like to eat ice cream.'
------------------------------
Similarity (A vs. B): 0.5560
Similarity (A vs. C): 0.0422
Similarity (B vs. C): 0.0422

As expected, sentences A and B are more similar in meaning.


In [56]:
# Write a function that creates embeddings for each sentence in a text file and saves them as a CSV. How can I use the functionality in the app?

#Text / Transcript
#   ↓
#Sentence Segmentation (regex)
#   ↓
#Batch Embeddings (OpenAI)
#   ↓
#CSV / Vector DB
#   ↓
#Coaching • Search • Analytics • XP
 
import re
import pandas as pd
from typing import List

import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv(override=True, dotenv_path="../.env.local")
my_api_key = os.getenv("OPEN_AI_API_KEY")
my_openai_client = OpenAI(api_key=my_api_key)

# High-performance sentence splitter

def split_sentences(text: str) -> List[str]:
    """
    Fast, production-grade sentence splitter using regex.
    Handles most spoken transcript cases.
    """
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s.strip() for s in sentences if len(s.strip()) > 3]



In [58]:

# Batched embedding generator
def generate_embeddings(
    sentences: List[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 64
):
    """
    Generates embeddings in efficient batches.
    """
    embeddings = []

    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]

        response = my_openai_client.embeddings.create(
            model=model,
            input=batch
        )

        embeddings.extend([d.embedding for d in response.data])

    return embeddings

def process_file_to_csv(
    input_file: str,
    output_csv: str = "sentence_embeddings.csv"
):
    with open(input_file, "r", encoding="utf-8") as f:
        text = f.read()

    sentences = split_sentences(text)
    embeddings = generate_embeddings(sentences)

    df = pd.DataFrame({
        "sentence_id": range(1, len(sentences) + 1),
        "sentence": sentences,
        "embedding": embeddings
    })

    df.to_csv(output_csv, index=False)
    print(f"Saved {len(sentences)} sentence embeddings → {output_csv}")
    return df


In [59]:
process_file_to_csv("./What_SpeakEQ_Can_Learn.txt")

Saved 33 sentence embeddings → sentence_embeddings.csv


Unnamed: 0,sentence_id,sentence,embedding
0,1,What SpeakEQ Can Learn (and Improve On)\n-----...,"[0.022891094908118248, -0.011460373178124428, ..."
1,2,"Go Beyond Just Delivery, Focus on Emotional Al...","[0.03171725943684578, -0.015057769604027271, -..."
2,3,Let’s realign that.”\nThat’s a huge psychologi...,"[-0.005456444341689348, 0.03932730481028557, 0..."
3,4,Anxiety Detection as a Core Feature\nOrai assu...,"[-0.012402944266796112, 0.017508376389741898, ..."
4,5,"We identify it live, voice tremors, sudden pac...","[-0.016547979786992073, 0.054520245641469955, ..."
5,6,This will be life-changing for users with ADHD...,"[0.006036007311195135, -0.006269796285778284, ..."
6,7,Grammar Feedback in Spoken Context\nThis compe...,"[-0.0019025554647669196, 0.005172717850655317,..."
7,8,"We can analyze clarity, repetition, filler ove...","[0.0255295317620039, 0.03645041957497597, -0.0..."
8,9,Real-Time Emotional Coaching (Not Just Metrics...,"[-0.01809908077120781, 0.009448678232729435, -..."
9,10,Let’s slow the pace.,"[0.046186890453100204, 0.026558153331279755, -..."
