In [1]:
# %pip install sentence_transformers==3.0.1
# %pip install xformers

In [1]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm
import re
import os


# Environment switches for the tokenizer and the PyTorch CUDA allocator.
# They are set before the explicit `import torch` below.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import torch
torch.cuda.empty_cache()
print(torch.cuda.is_available())

# Pick the device once instead of calling `.cuda()` unconditionally, so the
# notebook still loads the model on a machine without a visible GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


# Load the pre-trained Sentence Transformer model
# mdl_name = 'stella_en_1.5B_v5' # oom
# mdl_name = 'stella_en_400M_v5'
mdl_name = 'all-mpnet-base-v2'
# mdl_name = 'all-MiniLM-L6-v2'

# For the stella checkpoints above, load from the "dunzhang/" namespace instead:
# model = SentenceTransformer("dunzhang/"+mdl_name, trust_remote_code=True, device=device)
model = SentenceTransformer(mdl_name, trust_remote_code=True, device=device)

def get_embeddings(text):
    """Embed ``text`` with the notebook-level ``model``.

    Args:
        text: Value forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode(text)`` returns.
    """
    return model.encode(text)


  from tqdm.autonotebook import tqdm, trange


True


In [2]:
# Sub-directory (relative to ../../) that holds the results being scored.
folder = "qwen_data"

# Per-post results; missing cells become empty strings so they can be
# treated as "no phrases" later on.
res_df = pd.read_csv(f"../../{folder}/results.csv").fillna("")

# Feature table with one `description` per topic (`fea`).
fea_df = pd.read_csv("../../data/fea_df.csv")

# Remove the "Must related to ... ." clause from every description and trim
# the surrounding whitespace.
fea_df['description'] = fea_df['description'].map(
    lambda description: re.sub(
        r"Must related to [^(]*\(?.*?\)?\.", "", description
    ).strip()
)

In [3]:
# Worked example: score the phrases of one post (sm_id) for one topic.
sm_id = "104a474"
topic = "bodyhate"

res_str = ''

# Reference text for the topic and the ";"-separated phrases returned for the post.
refer_str = fea_df.loc[fea_df.fea == topic, 'description'].tolist()[0]
returned_str = res_df.loc[res_df.sm_id == sm_id, f"{topic}_phrases"].tolist()[0]
returned_str_ls = returned_str.split(";")

# Embed the reference once and all phrases together.
query_embeddings = model.encode([refer_str])
doc_embeddings = model.encode(returned_str_ls)

# Cosine similarity of the reference against each phrase.
cos_scores = cosine_similarity(query_embeddings, doc_embeddings)[0]
# Dot-product similarity, computed for comparison; not included in the printout.
dot_scores = np.dot(query_embeddings, doc_embeddings.T)[0]

# Render as "phrase (score); phrase (score);".
res_str = "; ".join(
    f"{phrase} ({score})" for phrase, score in zip(returned_str_ls, cos_scores)
) + ";"

print(res_str)

wearing baggy clothes (0.3213268220424652);  feeling scared to talk (0.17658159136772156);


In [5]:
def process_batch(res_df_batch, fea_df, folder, mdl_name, model):
    """Append a cosine-similarity score to every phrase in the ``*_phrases`` columns.

    For each topic in ``fea_df.fea`` the first matching ``description`` is
    embedded once with ``model.encode``. Each row's ``"<topic>_phrases"`` cell is
    then split on ``";"``, the phrases are embedded, and the cell is rewritten
    as ``"<phrase> (<score>); <phrase> (<score>);"`` where the score is the
    cosine similarity between the phrase and the topic description.

    Rows are scored independently, so rows that share an ``sm_id`` each keep
    their own phrases. Cells that are empty, not strings, or contain only
    blank fragments are left untouched; blank fragments (e.g. produced by a
    trailing ``";"``) are skipped rather than scored.

    Args:
        res_df_batch: DataFrame with one ``"<topic>_phrases"`` column per topic
            in ``fea_df``. It is not modified.
        fea_df: DataFrame with ``fea`` (topic name) and ``description`` columns.
        folder: Unused; kept so existing callers keep working.
        mdl_name: Unused; kept so existing callers keep working.
        model: Object exposing ``encode(list_of_str)`` that returns a 2-D array
            of embeddings.

    Returns:
        A copy of ``res_df_batch`` whose phrase columns carry the scores.

    Raises:
        KeyError: If a ``"<topic>_phrases"`` column is missing from
            ``res_df_batch``.
    """
    res_df_with_cos_batch = res_df_batch.copy()

    # The reference embedding depends only on the topic, so compute it once per
    # topic instead of once per (row, topic) pair.
    query_by_col = {}
    for topic in fea_df.fea.unique():
        refer_str = fea_df.loc[fea_df.fea == topic, 'description'].tolist()[0]
        query_by_col[f"{topic}_phrases"] = model.encode([refer_str])

    # Work on plain Python lists and write each column back in one assignment;
    # this avoids a full-frame boolean mask per row and is independent of the
    # frame's index labels.
    scored_cols = {col: res_df_batch[col].tolist() for col in query_by_col}

    for row_pos in tqdm(range(len(res_df_batch)), desc="Processing sm_id"):
        for col, query_embeddings in query_by_col.items():
            returned_str = scored_cols[col][row_pos]
            if not isinstance(returned_str, str) or not returned_str:
                continue

            # Keep the phrase text as-is, but drop fragments that are blank.
            returned_str_ls = [
                phrase for phrase in returned_str.split(";") if phrase.strip()
            ]
            if not returned_str_ls:
                continue

            doc_embeddings = model.encode(returned_str_ls)
            cos_scores = cosine_similarity(query_embeddings, doc_embeddings)[0]
            scored_cols[col][row_pos] = "; ".join(
                f"{phrase} ({score})"
                for phrase, score in zip(returned_str_ls, cos_scores)
            ) + ";"

    for col, values in scored_cols.items():
        res_df_with_cos_batch[col] = values

    return res_df_with_cos_batch


In [None]:

# Single-process alternative:
# process_batch(res_df, fea_df, folder, mdl_name, model)


# Number of batches, and of joblib workers (one batch per worker).
N_JOBS = 4

# Split res_df into N_JOBS contiguous row batches. The split is done on row
# positions and applied with .iloc, rather than passing the DataFrame itself to
# np.array_split, which emits a deprecation warning on recent pandas versions.
row_chunks = np.array_split(np.arange(len(res_df)), N_JOBS)
res_df_batches = [res_df.iloc[chunk] for chunk in row_chunks]

# Run each batch in parallel using joblib.
# NOTE(review): `model` is handed to every worker; with joblib's default
# process-based backend this presumably means one model copy per worker.
# Confirm there is enough GPU memory for N_JOBS copies.
results = Parallel(n_jobs=N_JOBS)(
    delayed(process_batch)(batch, fea_df, folder, mdl_name, model)
    for batch in res_df_batches
)

# Combine the per-batch results back into a single DataFrame
res_df_with_cos = pd.concat(results)


Processing sm_id:  73%|███████▎  | 14077/19294 [16:18<05:49, 14.94it/s]

In [None]:
# Persist the scored results next to the input, tagged with the model name.
res_df_with_cos.to_csv(
    f"../../{folder}/results_with_cos_{mdl_name}.csv",
    index=False,
)