In [None]:
import pandas as pd
import random
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sentence_transformers.quantization import semantic_search_faiss
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline
from tqdm import tqdm
import jsonlines

In [None]:
# Toy corpus used to prototype the pair-mining logic: six short documents by three authors.
df = pd.DataFrame(
    {
        'auth': ['a1', 'a2', 'a2', 'a1', 'a1', 'a5'],
        'docID': [f"d{doc_num:02d}" for doc_num in range(6)],
        'docText': ['haha', 'hehe', 'a a a', 'brr', 'ssss', 'kimono aa lola'],
    }
)

In [None]:
# Model identifiers: one model produces document embeddings, the other predicts a genre label.
embed_model_name: str = "gabrielloiseau/LUAR-MUD-sentence-transformers"
genre_model_name: str = "classla/xlm-roberta-base-multilingual-text-genre-classifier"

# Both models are instantiated on the "cuda" device.
embed_model = SentenceTransformer(
    embed_model_name,
    device="cuda",
)
genre_classifier = pipeline(
    "text-classification",
    model=genre_model_name,
    device="cuda",
)

In [None]:
# Map each author to the list of that author's document texts.
data_by_author = (
    df.groupby('auth')['docText']
    .agg(list)
    .to_dict()
)
data_by_author

In [None]:
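## Plan for the DataFrame-based version of the hard-positive mining:
## 1. Compute an embedding for every document in df.
## 2. Classify the genre of every document in df.
## 3. Join df with itself on the author column, so each row is an (anchor, positive) pair by one author.
## 4. Discard the rows where the anchor and the positive are the same document.
## 5. Calculate the similarity between the anchor (left) and positive (right) embeddings.
## 6. Discard the pairs whose similarity is above the threshold.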


In [None]:
# Pairs whose cosine similarity reaches this ceiling are not treated as "hard" positives.
hard_pair_ceiling = 0.9
# Dedicated seeded generator so that re-running the cell selects the same positives
# (the DataFrame-based sampling in this notebook uses a fixed seed of 42 as well).
pair_rng = random.Random(42)

hard_pairs = []
# Progress bar for mining hard positives over authors.
for author, docs in tqdm(data_by_author.items(), desc="Mining hard positives"):
    # An author with a single document cannot form an anchor/positive pair.
    if len(docs) <= 1:
        continue
    print(f"docs: {docs}")
    embeddings = embed_model.encode(docs, convert_to_tensor=False)
    print(len(embeddings), len(embeddings[0]))
    # Square matrix of pairwise cosine similarities between this author's documents.
    sim_matrix = cosine_similarity(embeddings)
    print(f"sim_matrix: {sim_matrix}")
    n = len(docs)

    # For each anchor i, gather the later documents j (j > i) whose similarity is below the
    # ceiling. Only the upper triangle is visited, so each unordered pair is considered once
    # and the lower-indexed document is always the anchor.
    hard_candidates = {}
    for i in range(n):
        for j in range(i + 1, n):
            if sim_matrix[i, j] < hard_pair_ceiling:
                hard_candidates.setdefault(i, []).append(j)

    print(f"hard_candidates: {hard_candidates}")

    # Compute the genre for each document with a single classifier call over the whole list;
    # the pipeline returns one {"label", "score"} dict per input text.
    doc_genres = [
        prediction["label"]
        for prediction in genre_classifier(docs, truncation=True)
    ]

    print(f"doc_genres: {doc_genres}")

    # For each anchor with candidate positives, choose one candidate with a different genre.
    for anchor_idx, candidates in hard_candidates.items():
        print(f"anchor_idx: {anchor_idx}, candidates: {candidates}")
        anchor_genre = doc_genres[anchor_idx]
        valid_candidates = [cand for cand in candidates if doc_genres[cand] != anchor_genre]
        if valid_candidates:
            chosen_positive_idx = pair_rng.choice(valid_candidates)
            hard_pairs.append({
                "author": author,
                "anchor": docs[anchor_idx],
                "positive": docs[chosen_positive_idx]
            })
    print(f"hard_pairs: {hard_pairs}")
    print()


In [None]:
# Display the anchor/positive records collected in `hard_pairs`.
hard_pairs

In [None]:
# Work on an independent copy so the columns added later do not touch `df`.
dfc = df.copy(deep=True)
dfc

In [None]:
# Attach, per document, its embedding vector and the top genre label from the classifier.
dfc = dfc.assign(
    embeddings=dfc['docText'].apply(lambda text: embed_model.encode(text)),
    genre=dfc['docText'].apply(
        lambda text: genre_classifier(text, truncation=True)[0]["label"]
    ),
)
dfc

In [None]:
# Join the frame with itself on the author column: each row pairs two documents by the same
# author (a document paired with itself is included here and filtered out later).
dfcm = pd.merge(
    dfc,
    dfc,
    how='outer',
    on='auth',
    suffixes=["_anchor", "_positive"],
)
dfcm

In [None]:
# Keep only pairs made of two different documents that also carry different genre labels.
is_distinct_doc = dfcm['docID_anchor'].ne(dfcm['docID_positive'])
is_cross_genre = dfcm['genre_anchor'].ne(dfcm['genre_positive'])
dfcm = dfcm[is_distinct_doc & is_cross_genre]
dfcm

In [None]:
# Order each docID pair element-wise (smaller ID first) so that (x, y) and (y, x)
# produce the same key.
doc_min = np.minimum(dfcm['docID_anchor'], dfcm['docID_positive'])
doc_max = np.maximum(dfcm['docID_anchor'], dfcm['docID_positive'])

print(doc_min)
print(doc_max)
# Build a unique key per (auth, unordered docID pair) as a standalone Series. Writing it
# into `dfcm` as a column would assign onto a boolean-filtered slice and raise pandas'
# SettingWithCopyWarning; casting to str keeps the concatenation valid for non-string IDs.
pair_key = dfcm['auth'].astype(str) + '__' + doc_min.astype(str) + '__' + doc_max.astype(str)
# Keep the first row of every key; .copy() detaches the result so later column
# assignments operate on an independent frame.
dfcm = dfcm.loc[~pair_key.duplicated()].copy()

In [None]:
# Display the current contents of `dfcm`.
dfcm

In [None]:
def pair_cosine(row):
    """Return the cosine similarity between the anchor and positive embeddings of one row."""
    # cosine_similarity expects 2-D inputs, so each vector is reshaped to a single-row matrix.
    anchor_vec = row['embeddings_anchor'].reshape(1, -1)
    positive_vec = row['embeddings_positive'].reshape(1, -1)
    return cosine_similarity(anchor_vec, positive_vec)[0][0]


dfcm['similarity_score'] = dfcm.apply(pair_cosine, axis=1)
dfcm

In [None]:
# Retain only the pairs whose similarity score is strictly below 0.9.
below_ceiling = dfcm['similarity_score'].lt(0.9)
dfcm = dfcm[below_ceiling]
dfcm

In [None]:
# Draw one row per (author, anchor document) group; the fixed random_state makes the draw repeatable.
pairs_by_anchor = dfcm.groupby(['auth', 'docID_anchor'])
pairs_by_anchor.sample(n=1, random_state=42)

In [None]:
## Combined Code
# End-to-end version of the DataFrame pipeline: load models and data, embed and genre-label
# every document, pair documents by author, filter, de-duplicate, score and sample.
embed_model_name: str = "gabrielloiseau/LUAR-MUD-sentence-transformers"
genre_model_name: str = "classla/xlm-roberta-base-multilingual-text-genre-classifier"
embed_model = SentenceTransformer(embed_model_name, device="cuda")
genre_classifier = pipeline("text-classification",model=genre_model_name,device="cuda")

# NOTE(review): this cell addresses the columns 'docText' / 'auth' / 'docID' (the toy-frame
# schema), while the batched cell further down reads the same file and uses 'fullText' /
# 'authorID' / 'documentID'. Confirm which column names the JSONL file actually has.
df = pd.read_json("/data/araghavan/HIATUS/datadreamer-ta2/data/ta2_jan_2025_trian_data/train_sadiri_processed_with_embeddings_wo_ao3_filtered.jsonl", lines=True)
# Toy input with the schema this cell expects:
# df = pd.DataFrame({'auth':['a1','a2','a2','a1','a1','a5'], 'docID':[f"d{x:02d}" for x in range(6)], 'docText':['haha', 'hehe', 'a a a', 'brr', 'ssss', 'kimono aa lola']})
dfc = df.copy()

# Per-document embedding vector and top genre label.
dfc['embeddings'] = dfc['docText'].apply(lambda x: embed_model.encode(x))
dfc['genre'] = dfc['docText'].apply(lambda x: genre_classifier(x, truncation=True)[0]["label"])

# Pair every document with every document by the same author.
dfcm = dfc.merge(dfc, how='outer', on='auth', suffixes=["_anchor", "_positive"])

# Drop self-pairs and pairs whose two documents share a genre label.
dfcm = dfcm[(dfcm['docID_anchor'] != dfcm['docID_positive']) & (dfcm['genre_anchor'] != dfcm['genre_positive'])]

# Order each docID pair element-wise (smaller ID first) so (x, y) and (y, x) share a key.
doc_min = np.minimum(dfcm['docID_anchor'], dfcm['docID_positive'])
doc_max = np.maximum(dfcm['docID_anchor'], dfcm['docID_positive'])

# Unique key per (auth, unordered docID pair), kept as a standalone Series: assigning it as
# a column of the filtered slice would raise pandas' SettingWithCopyWarning. The str casts
# keep the concatenation valid when IDs are not strings.
pair_key = dfcm['auth'].astype(str) + '__' + doc_min.astype(str) + '__' + doc_max.astype(str)
# Keep the first row per key; .copy() detaches the result before the column assignment below.
dfcm = dfcm.loc[~pair_key.duplicated()].copy()

# Cosine similarity per pair (each embedding is reshaped to a one-row matrix for sklearn).
dfcm['similarity_score'] = dfcm.apply(lambda x: cosine_similarity(x['embeddings_anchor'].reshape(1, -1), x['embeddings_positive'].reshape(1, -1))[0][0], axis=1)
# Keep pairs strictly below the 0.9 similarity ceiling, then draw one positive per anchor.
dfcm = dfcm[dfcm['similarity_score'] < 0.9]
fin_ans = dfcm.groupby(['auth', 'docID_anchor']).sample(n=1, random_state=42)
fin_ans

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline
from sklearn.metrics.pairwise import cosine_similarity
import os
# -----------------------------
# Configuration
# -----------------------------
embed_model_name = "gabrielloiseau/LUAR-MUD-sentence-transformers"
doc_genre_model_name = "classla/xlm-roberta-base-multilingual-text-genre-classifier"
data_path = "/data/araghavan/HIATUS/datadreamer-ta2/data/ta2_jan_2025_trian_data/train_sadiri_processed_with_embeddings_wo_ao3_filtered.jsonl"
# Used both as the chunk size and as the per-call model batch size in the batched inference.
batch_size = 2048
# Pairs are kept only when their cosine similarity is strictly below this value.
ceiling_threshold = 0.4
# Report whether the input file exists (prints True or False; execution continues either way).
data_file_found = os.path.isfile(data_path)
print(data_file_found)
# -----------------------------
# Models (both placed on the "cuda" device)
# -----------------------------
embed_model = SentenceTransformer(
    embed_model_name,
    device="cuda",
)
doc_genre_classifier = pipeline(
    "text-classification",
    model=doc_genre_model_name,
    device="cuda",
)

# -----------------------------
# Batched Processing Function
# -----------------------------
def batched_process(texts, embed_model, doc_genre_classifier, batch_size=64):
    """Embed and genre-label `texts` chunk by chunk.

    Args:
        texts: Sliceable sequence of document strings.
        embed_model: Object exposing `encode(batch, convert_to_numpy=..., batch_size=...)`
            that returns one vector per input text.
        doc_genre_classifier: Callable invoked as
            `classifier(batch, truncation=True, batch_size=...)` that returns one mapping
            with a "label" key per input text.
        batch_size: Number of texts per chunk; the same value is forwarded to both models
            as their own batch size.

    Returns:
        A `(embeddings, genres)` tuple: `embeddings` is `np.array` built from the per-text
        vectors in input order, `genres` is the list of predicted labels in input order.
    """
    embedding_rows = []
    genre_labels = []

    chunk_starts = range(0, len(texts), batch_size)
    for start in chunk_starts:
        chunk = texts[start:start + batch_size]

        # Embed the chunk, then append its vectors one row at a time.
        chunk_vectors = embed_model.encode(chunk, convert_to_numpy=True, batch_size=batch_size)
        embedding_rows += list(chunk_vectors)

        # Classify the same chunk and keep only the label of each prediction.
        predictions = doc_genre_classifier(chunk, truncation=True, batch_size=batch_size)
        for prediction in predictions:
            genre_labels.append(prediction["label"])

    return np.array(embedding_rows), genre_labels

# -----------------------------
# Read the JSON-lines input
# -----------------------------
df = pd.read_json(data_path, lines=True)
print(f"Read JSON: {data_path} with shape: {df.shape}")

# -----------------------------
# Embed and genre-label every document (batched)
# -----------------------------
texts = df["fullText"].to_list()
embeddings, genres = batched_process(
    texts,
    embed_model,
    doc_genre_classifier,
    batch_size=batch_size,
)
print(f"Generated embeddings and genre for all docs with shape: {embeddings.shape}")

# list(...) splits the array along its first axis so each row's cell holds one embedding.
df["doc_embedding"] = list(embeddings)
df["doc_genre"] = genres

# -----------------------------
# Build anchor/positive candidates by pairing documents of the same author
# -----------------------------
dfcm = pd.merge(
    df,
    df,
    how="outer",
    on="authorID",
    suffixes=["_anchor", "_positive"],
)
print(f"Merged Dataframe to generate author anchor-positive pairs")
print(f"Now with shape: {dfcm.shape}")

# Keep a pair only when it joins two different documents with different genre labels.
is_distinct_doc = dfcm["documentID_anchor"].ne(dfcm["documentID_positive"])
is_cross_genre = dfcm["doc_genre_anchor"].ne(dfcm["doc_genre_positive"])
dfcm = dfcm.loc[is_distinct_doc & is_cross_genre]
print(f"Filtered same anchor-positive pair documents, same genre anchor-positive pair documents")
print(f"Now with shape: {dfcm.shape}")

# -----------------------------
# Deduplicate Unordered Pairs per Author
# -----------------------------
# Order each document-ID pair element-wise (smaller ID first) so that (x, y) and (y, x)
# map to the same key.
doc_min = np.minimum(dfcm["documentID_anchor"], dfcm["documentID_positive"])
doc_max = np.maximum(dfcm["documentID_anchor"], dfcm["documentID_positive"])
# The key lives in its own Series instead of a temporary `pair_key` column: `dfcm` is a
# boolean-filtered slice at this point, and assigning a column to it (even through .loc)
# raises pandas' SettingWithCopyWarning.
pair_key = (
    dfcm["authorID"].astype(str) + "__" +
    doc_min.astype(str) + "__" +
    doc_max.astype(str)
)
# Keep the first row of every key; .copy() detaches the result so later column
# assignments operate on an independent frame.
dfcm = dfcm.loc[~pair_key.duplicated()].copy()
print(f"Dedup anchor-positive pairs, now with shape: {dfcm.shape}")

# -----------------------------
# Compute Cosine Similarity (Vectorized, row-wise)
# -----------------------------
# Each pair needs only the similarity between its own anchor and positive. Computing the
# full anchor-by-positive similarity matrix and reading its diagonal would allocate an
# N x N array for N pairs, so the score is computed row by row instead.
if len(dfcm) == 0:
    # np.stack cannot stack an empty sequence; no pairs means no scores.
    similarity_scores = np.empty(0, dtype=float)
else:
    anchor_embeddings = np.stack(dfcm["doc_embedding_anchor"].values)
    positive_embeddings = np.stack(dfcm["doc_embedding_positive"].values)

    anchor_norms = np.linalg.norm(anchor_embeddings, axis=1)
    positive_norms = np.linalg.norm(positive_embeddings, axis=1)
    # Guard against division by zero: an all-zero vector gets norm 1, giving similarity 0
    # (this mirrors how scikit-learn's cosine_similarity normalises zero rows).
    anchor_norms[anchor_norms == 0] = 1
    positive_norms[positive_norms == 0] = 1

    # Row-wise dot product of matching anchor/positive vectors.
    pair_dots = np.einsum("ij,ij->i", anchor_embeddings, positive_embeddings)
    similarity_scores = pair_dots / (anchor_norms * positive_norms)

dfcm["similarity_score"] = similarity_scores
print(f"Generated pairwise similarity of anchor-positive pairs from their respective embeddings")
print(f"Now with shape: {dfcm.shape}")

# -----------------------------
# Keep only pairs below the similarity ceiling
# -----------------------------
initial_count = len(dfcm)
is_below_ceiling = dfcm["similarity_score"] < ceiling_threshold
# Renumber the surviving rows from 0 after filtering.
dfcm = dfcm.loc[is_below_ceiling].reset_index(drop=True)
print(f"Hard positive filtering of considering anchor-positive pairs below threshold of: {ceiling_threshold}")
print(f"Dropped {initial_count - dfcm.shape[0]} pairs above similarity threshold")
print(f"Now with shape: {dfcm.shape}")

# -----------------------------
# Draw one positive for each (author, anchor document)
# -----------------------------
anchor_groups = dfcm.groupby(["authorID", "documentID_anchor"], group_keys=False)
# Fixed random_state so the draw is repeatable.
fin_ans = anchor_groups.sample(n=1, random_state=42)
print(f"Grouped author, anchors and sampled candidates to generate auth-anchor-positive pairs")

# -----------------------------
# Result
# -----------------------------
print("Final sampled result shape:", fin_ans.shape)
# Optional export, left disabled:
# fin_ans.to_json("final_pairs.jsonl", orient="records", lines=True)

In [None]:

    # NOTE(review): this cell looks like a fragment pasted from a method body -- it is
    # indented, reads `self.ceiling_threshold` / `self.genre_classifier`, and uses `n`,
    # `sim_matrix`, `docs`, `author` and `hard_pairs` without defining them here. It mirrors
    # the loop-based mining cell earlier in this notebook; confirm whether it should be
    # kept, since it presumably cannot run as a standalone cell.

    # For each anchor, gather candidate indices with similarity below threshold.
    # Only j > i is visited, so each unordered pair is checked once and the
    # lower-indexed document is the anchor.
    hard_candidates = {}
    for i in range(n):
        for j in range(i + 1, n):
            if sim_matrix[i, j] < self.ceiling_threshold:
                hard_candidates.setdefault(i, []).append(j)

    # Compute the genre for each document.
    # One classifier call per document; the label of the first returned entry is kept.
    doc_genres = []
    for doc in docs:
        classification = self.genre_classifier(doc, truncation=True)
        genre = classification[0]["label"]
        doc_genres.append(genre)

    # For each anchor with candidate positives, choose one candidate with a different genre.
    # Anchors without any different-genre candidate contribute no pair.
    for anchor_idx, candidates in hard_candidates.items():
        anchor_genre = doc_genres[anchor_idx]
        valid_candidates = [cand for cand in candidates if doc_genres[cand] != anchor_genre]
        if valid_candidates:
            chosen_positive_idx = random.choice(valid_candidates)
            hard_pairs.append({
                "author": author,
                "anchor": docs[anchor_idx],
                "positive": docs[chosen_positive_idx]
            })
