In [1]:
import pickle, faiss, numpy as np
from sentence_transformers import SentenceTransformer

# Sentence encoder; its `encode` method is what turns query text into vectors.
model = SentenceTransformer("all-mpnet-base-v2")

# Read the cached corpus from disk: the raw texts and their embeddings
# (the embeddings are cast to float32).
with open("outputs/ROC-spring-embeddings-all-mpnet-base-v2-size-100000.pkl", "rb") as cache_file:
    data = pickle.load(cache_file)
corpus_sentences, corpus_embeddings = (
    data["sentences"],
    data["embeddings"].astype("float32"),
)

# FAISS index read from disk.
# NOTE(review): presumably built over the same corpus, in the same order — confirm.
index = faiss.read_index("outputs/blog_index.faiss")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def search(query, top_k=5):
    """Print the best matches for ``query`` found in the FAISS index.

    The query is encoded with the module-level ``model``, scaled to unit
    length and looked up in the module-level ``index``. One line is printed
    per hit: the score returned by the index, followed by the first 200
    characters of the matching entry in ``corpus_sentences``.

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to request from the index.

    Returns:
        None. Results are written to stdout.
    """
    query_vec = model.encode(query).astype("float32")

    # Scale to unit length, but leave an all-zero vector untouched: dividing
    # by a zero norm would fill the query with NaNs.
    norm = np.linalg.norm(query_vec)
    if norm > 0:
        query_vec = query_vec / norm

    # The index is queried with a 2-D (n_queries, dim) array.
    query_vec = np.expand_dims(query_vec, axis=0)

    distances, corpus_ids = index.search(query_vec, top_k)
    for score, idx in zip(distances[0], corpus_ids[0]):
        # FAISS pads the result with id -1 when it has fewer than top_k hits;
        # used as a list index that would silently print the last corpus entry.
        if idx < 0:
            continue
        print(f"{score:.3f} :: {corpus_sentences[idx][:200]}...")


In [3]:
# Exercise the search helper on two sample queries, asking for three hits each.
for sample_query in (
    "a family sues after a medical mistake",
    "blog about cooking and humility",
):
    search(sample_query, top_k=3)


0.629 :: Bringing a baby into this world is supposed to be one of the happiest days in parents' lives. However, when a child is injured due to medical negligence during delivery, the once joyous day can sudden...
0.620 :: A. Recently, an analysis of 2012 medical malpractice payouts was made public. This analysis showed that there were 12,142 medical malpractice payouts last year, with five states making up the majority...
0.617 :: One. In some routine medical and surgical procedures, the risks are low enough for most patients that it can be a shock when something goes seriously wrong. Tonsillectomy, for instance, is a very comm...
0.690 :: I’m Amanda. I live and work as a Graphic Designer in Alberta, Canada. I’ve decided to write this blog to catalog my experiences and because I like to write and pretend people are listening. I spend mo...
0.687 :: I absolutely LOVE cooking and love eating! My attitude toward food and healthy living is the “everything in moderation” approach. Eating he

In [11]:
import pickle
import faiss
import numpy as np
import csv

# Mine up to `max_pairs` disjoint blog pairs whose index score lies inside
# [similarity_min, similarity_max], then export them to CSV.
# NOTE(review): this cell is nearly identical to the other pair-mining cell in
# this notebook (only the score band and the output file differ); consider
# factoring both into a single parameterised function.

# Load model data and FAISS index
embedding_cache_path = "outputs/ROC-spring-embeddings-all-mpnet-base-v2-size-100000.pkl"
index_path = "outputs/blog_index.faiss"

# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only open cache files that were produced by a trusted run.
with open(embedding_cache_path, "rb") as f:
    data = pickle.load(f)

corpus_sentences = data["sentences"]
corpus_embeddings = data["embeddings"].astype("float32")
# Scale every row to unit length. The norm is clamped away from zero so an
# all-zero embedding stays zero instead of turning into NaN.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings /= np.maximum(row_norms, np.finfo(np.float32).tiny)

index = faiss.read_index(index_path)

# Parameters
top_k = 10             # neighbours requested from the index per document
similarity_min = 0.70  # inclusive lower bound of the accepted score band
similarity_max = 0.80  # inclusive upper bound of the accepted score band
max_pairs = 25         # stop as soon as this many pairs have been collected
pairs = []
used_ids = set()       # ids already placed in a pair; each id is used at most once

for i in range(len(corpus_sentences)):
    if len(pairs) >= max_pairs:
        break
    if i in used_ids:
        continue

    query_vec = np.expand_dims(corpus_embeddings[i], axis=0)
    distances, corpus_ids = index.search(query_vec, top_k)

    # NOTE(review): assumes the index returns similarity scores (higher = closer)
    # and that its ids are positions in corpus_sentences — TODO confirm.
    for score, j in zip(distances[0], corpus_ids[0]):
        j = int(j)
        # Skip FAISS's -1 padding (returned when fewer than top_k hits exist),
        # the query document itself, and anything already paired. Self is
        # filtered by id rather than by rank, so a real neighbour is not
        # dropped when self is not the first result.
        if j < 0 or j == i or j in used_ids:
            continue
        if similarity_min <= score <= similarity_max:
            pairs.append({
                "blog_1_id": i,
                "blog_1_text": corpus_sentences[i][:500],
                "blog_2_id": j,
                "blog_2_text": corpus_sentences[j][:500],
                "similarity": round(float(score), 3)
            })
            used_ids.update((i, j))
            break  # at most one partner per document

print(f"Collected {len(pairs)} pairs.")

# Save to CSV
# Columns are listed explicitly so the header is still written (and the cell
# does not crash on pairs[0]) when no pair was found.
csv_columns = ["blog_1_id", "blog_1_text", "blog_2_id", "blog_2_text", "similarity"]
output_file = "outputs/similar_blog_pairs.csv"
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(pairs)

print(f"Saved {len(pairs)} pairs to {output_file}")


Collected 25 pairs.
Saved 25 pairs to outputs/similar_blog_pairs.csv


In [13]:
import pickle
import faiss
import numpy as np
import csv

# Mine up to `max_pairs` disjoint blog pairs whose index score lies inside the
# low band [similarity_min, similarity_max], then export them to CSV.
# NOTE(review): this cell is nearly identical to the other pair-mining cell in
# this notebook (only the score band and the output file differ); consider
# factoring both into a single parameterised function.

# Load model data and FAISS index
embedding_cache_path = "outputs/ROC-spring-embeddings-all-mpnet-base-v2-size-100000.pkl"
index_path = "outputs/blog_index.faiss"

# SECURITY: pickle.load can execute arbitrary code while deserialising;
# only open cache files that were produced by a trusted run.
with open(embedding_cache_path, "rb") as f:
    data = pickle.load(f)

corpus_sentences = data["sentences"]
corpus_embeddings = data["embeddings"].astype("float32")
# Scale every row to unit length. The norm is clamped away from zero so an
# all-zero embedding stays zero instead of turning into NaN.
row_norms = np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)
corpus_embeddings /= np.maximum(row_norms, np.finfo(np.float32).tiny)

index = faiss.read_index(index_path)

# Parameters
top_k = 10             # neighbours requested from the index per document
similarity_min = 0.30  # inclusive lower bound of the accepted score band
similarity_max = 0.40  # inclusive upper bound of the accepted score band
max_pairs = 25         # stop as soon as this many pairs have been collected
pairs = []
used_ids = set()       # ids already placed in a pair; each id is used at most once

for i in range(len(corpus_sentences)):
    if len(pairs) >= max_pairs:
        break
    if i in used_ids:
        continue

    query_vec = np.expand_dims(corpus_embeddings[i], axis=0)
    distances, corpus_ids = index.search(query_vec, top_k)

    # NOTE(review): assumes the index returns similarity scores (higher = closer)
    # and that its ids are positions in corpus_sentences — TODO confirm.
    # NOTE(review): only the top_k results of each search are examined, so a
    # pair is produced only for documents whose returned neighbours already
    # score inside the low band; this is not a sample of arbitrary
    # low-similarity pairs — confirm that is the intent.
    for score, j in zip(distances[0], corpus_ids[0]):
        j = int(j)
        # Skip FAISS's -1 padding (returned when fewer than top_k hits exist),
        # the query document itself, and anything already paired. Self is
        # filtered by id rather than by rank, so a real neighbour is not
        # dropped when self is not the first result.
        if j < 0 or j == i or j in used_ids:
            continue
        if similarity_min <= score <= similarity_max:
            pairs.append({
                "blog_1_id": i,
                "blog_1_text": corpus_sentences[i][:500],
                "blog_2_id": j,
                "blog_2_text": corpus_sentences[j][:500],
                "similarity": round(float(score), 3)
            })
            used_ids.update((i, j))
            break  # at most one partner per document

print(f"Collected {len(pairs)} pairs.")

# Save to CSV
# Columns are listed explicitly so the header is still written (and the cell
# does not crash on pairs[0]) when no pair was found.
csv_columns = ["blog_1_id", "blog_1_text", "blog_2_id", "blog_2_text", "similarity"]
output_file = "outputs/dissimilar_blog_pairs.csv"
with open(output_file, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(pairs)

print(f"Saved {len(pairs)} pairs to {output_file}")


Collected 25 pairs.
Saved 25 pairs to outputs/dissimilar_blog_pairs.csv
