In [1]:
import json
import pandas as pd
from FlagEmbedding import BGEM3FlagModel
import os
import json
import numpy as np
import pickle
from openai import OpenAI
import os
import multiprocess as mp
from tqdm import tqdm

In [2]:
# Read the cached Zhihu pickle: its first element is bound to zh_sentences and
# its second to zh_emb.
# NOTE(review): pickle.load can execute arbitrary code from the file, so only
# load pickles produced by a trusted run of this pipeline.
with open('../data/emb/zhihu_emb.pkl', 'rb') as f:
    raw = pickle.load(f)
zh_sentences, zh_emb = raw[0], raw[1]

In [3]:
# Instantiate the BAAI/bge-m3 encoder; it is used below to embed the Reddit posts.
emb_model = BGEM3FlagModel('BAAI/bge-m3',  
                       use_fp16=True) # Setting use_fp16 to True speeds up computation with a slight performance degradation


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
# Load the Reddit posts, keeping only the id, title and body columns and
# exposing the latter two as "question" and "detail".
reddit = (
    pd.read_csv('../data/reddit_post.csv')[["message_id", "title", "message"]]
    .rename(columns={"title": "question", "message": "detail"})
)
reddit_questions = reddit['question'].tolist()
reddit_details = reddit['detail'].tolist()

# One string per post: question and detail joined by a single space.
# NOTE(review): str() is applied to both parts, so a missing cell (NaN) ends up
# in the embedded text as the literal "nan" -- confirm this is intended.
reddit_sentences = [
    " ".join((str(question), str(detail)))
    for question, detail in zip(reddit_questions, reddit_details)
]

# Only the dense vectors of the encoder output are kept.
reddit_emb = emb_model.encode(reddit_sentences)['dense_vecs']

# Cache the sentences next to their embeddings as a two-element list, the same
# [sentences, embeddings] layout the Zhihu pickle is read with.
with open('../data/emb/reddit_emb.pkl', 'wb') as f:
    pickle.dump([reddit_sentences, reddit_emb], f)

# Score every Reddit embedding against every Zhihu embedding: row i holds the
# dot products of Reddit post i with all Zhihu sentences.
similarity = reddit_emb @ zh_emb.T

# For each Reddit post collect its top_k best-scoring Zhihu sentences as
# (score, sentence) pairs, best first.
top_k = 5
list_top_k = []
for i in range(similarity.shape[0]):
    # argsort is ascending, so reverse it before taking the first top_k indices.
    top_k_idx = np.argsort(similarity[i])[::-1][:top_k]
    top_k_sim = similarity[i][top_k_idx]
    list_top_k.append([(sim, zh_sentences[j]) for sim, j in zip(top_k_sim, top_k_idx)])

# Add the top_<n> / top_<n>_sim columns in rank order. Deriving them from
# top_k keeps the columns in step with the search above instead of relying on
# a fixed set of five hand-written assignments.
for rank in range(top_k):
    reddit[f'top_{rank + 1}'] = [matches[rank][1] for matches in list_top_k]
    reddit[f'top_{rank + 1}_sim'] = [matches[rank][0] for matches in list_top_k]

reddit.to_csv('../data/matched.csv', index=False)

pre tokenize: 100%|██████████| 113/113 [00:03<00:00, 35.27it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 113/113 [15:11<00:00,  8.06s/it]


In [2]:
def ir_top5(question, candidates):
    """Ask gpt-4o-mini which of five candidates is most similar to ``question``.

    A new OpenAI client is created on every call, using the API key from the
    ``OPENAI_API_KEY`` environment variable.

    Args:
        question: Text placed under the "# Question:" heading of the prompt.
        candidates: Indexable with at least five entries; items 0-4 are listed
            in the prompt as options A-E.

    Returns:
        The ``"response"`` field of the JSON reply. The requested schema
        restricts it to one of "A", "B", "C", "D" or "E".

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])

    # Assemble the user prompt: the question, then one "<letter>. <candidate>"
    # line per option, then a closing blank line.
    prompt_head = f"# Question:\n{question}\n\n # Candidates:\n"
    option_lines = "".join(
        f"{letter}. {candidates[position]}\n"
        for position, letter in enumerate("ABCDE")
    )
    user_prompt = prompt_head + option_lines + "\n"

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Given an English question, pick the most similar one from the list of 5 Chinese questions.",
            }
        ],
    }
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}],
    }

    # JSON schema for the reply: an object with a single required "response"
    # field holding one of the five option letters.
    answer_schema = {
        "type": "object",
        "properties": {
            "response": {
                "type": "string",
                "description": "The letter corresponding to the most similar question.",
                "enum": ["A", "B", "C", "D", "E"],
            }
        },
        "required": ["response"],
        "additionalProperties": False,
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "similar_question_response",
                "strict": True,
                "schema": answer_schema,
            },
        },
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    reply = json.loads(response.choices[0].message.content)
    return reply["response"]

In [4]:
# Serial version of the pick step, kept for reference only; the pool-based
# functions below compute the same two columns.
# reddit["gpt_pick"] = reddit.apply(lambda x: ir_top5(str(x['question'])+"\n"+str(x['detail']), [x['top_1'], x['top_2'], x['top_3'], x['top_4'], x['top_5']]), axis=1)
# reddit["gpt_pick_question"] = reddit.apply(lambda x: x[f"top_{'_ABCDE'.index(x['gpt_pick'])}"], axis=1)

# Reload the embedding matches that the earlier step wrote to matched.csv.
reddit = pd.read_csv('../data/matched.csv')

def process_row(row):
    """Pick the best of a row's five candidates via ``ir_top5``.

    The row's ``question`` and ``detail`` are stringified, joined with a
    newline and sent to ``ir_top5`` together with the ``top_1`` .. ``top_5``
    values.

    Returns:
        A ``(letter, text)`` tuple: the letter returned by ``ir_top5`` and the
        ``top_N`` value it maps to (A -> top_1, ..., E -> top_5).
    """
    prompt_text = "\n".join((str(row['question']), str(row['detail'])))
    candidates = [row[f"top_{rank}"] for rank in range(1, 6)]
    letter = ir_top5(prompt_text, candidates)
    # The leading "_" pads the lookup string so that "A" sits at position 1.
    chosen_rank = '_ABCDE'.index(letter)
    return letter, row[f"top_{chosen_rank}"]

def parallel_process(df):
    """Run ``process_row`` over every row of ``df`` in a process pool.

    One worker is started per CPU reported by ``mp.cpu_count()`` and progress
    is shown with tqdm. ``imap`` yields results in submission order, so they
    line up with the rows of ``df``.

    Args:
        df: DataFrame whose rows are accepted by ``process_row``. It is
            modified in place.

    Returns:
        The same ``df`` object with ``gpt_pick`` and ``gpt_pick_question``
        columns added (both empty when ``df`` has no rows).
    """
    with mp.Pool(mp.cpu_count()) as pool:
        results = list(tqdm(pool.imap(process_row, (row for _, row in df.iterrows())),
                            total=len(df), desc="Processing"))

    # Build each column explicitly: unpacking zip(*results) into two targets
    # raises ValueError when there are no results at all.
    df["gpt_pick"] = [pick for pick, _ in results]
    df["gpt_pick_question"] = [picked_question for _, picked_question in results]
    return df

# Run the pick step over the whole frame; this adds the gpt_pick and
# gpt_pick_question columns.
reddit = parallel_process(reddit)

# Persist the picks so the later steps can resume from this file.
reddit.to_csv('../data/matched_gpt_4o_mini.csv', index=False)

Processing: 100%|██████████| 28809/28809 [29:54<00:00, 16.05it/s]  


In [5]:
def sim_check(eng_q, chi_q):
    """Ask gpt-4o-mini whether two questions ask the same thing.

    A new OpenAI client is created on every call, using the API key from the
    ``OPENAI_API_KEY`` environment variable.

    Args:
        eng_q: Text placed under the "# English Question:" heading.
        chi_q: Text placed under the "# Chinese Question:" heading.

    Returns:
        The ``"response"`` field of the JSON reply, which the requested schema
        declares as a boolean.

    Raises:
        KeyError: If ``OPENAI_API_KEY`` is not set in the environment.
    """
    client = OpenAI(api_key = os.environ['OPENAI_API_KEY'])

    # The prompt is the two labelled sections back to back.
    sections = (
        f"# English Question:\n{eng_q}\n\n",
        f" # Chinese Question:\n{chi_q}\n\n",
    )
    user_prompt = "".join(sections)

    instructions = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Given an English question and a Chinese question, determine whether they are asking the same question.",
            }
        ],
    }
    query = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}],
    }

    # JSON schema for the reply: an object with a single required boolean
    # "response" field.
    verdict_schema = {
        "type": "object",
        "properties": {
            "response": {
                "type": "boolean",
                "description": "Whether the English question and Chinese question are asking the same question.",
            }
        },
        "required": ["response"],
        "additionalProperties": False,
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[instructions, query],
        response_format={
            "type": "json_schema",
            "json_schema": {
                "name": "similar_question_response",
                "strict": True,
                "schema": verdict_schema,
            },
        },
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    verdict = json.loads(response.choices[0].message.content)
    return verdict["response"]

In [6]:
# reddit["gpt_sim"] = reddit.apply(lambda x: sim_check(str(x['question'])+"\n"+str(x['detail']), x['gpt_pick_question']), axis=1)

def sim_check_wrapper(args):
    """Adapt a ``(row, checker)`` pair to the single-argument form ``imap`` needs.

    Args:
        args: Two-item sequence of a row and the callable to apply to it.

    Returns:
        Whatever the callable returns when given the row's question and detail
        (stringified and joined with a newline) and its ``gpt_pick_question``.
    """
    record, checker = args
    english_text = "\n".join((str(record['question']), str(record['detail'])))
    chinese_text = record['gpt_pick_question']
    return checker(english_text, chinese_text)

def parallel_apply(df, func, num_workers=4):
    """Apply ``func`` to every row of ``df`` through ``sim_check_wrapper`` in a pool.

    Args:
        df: DataFrame whose rows are handed to ``sim_check_wrapper``.
        func: Callable paired with each row and invoked by the wrapper.
        num_workers: Size of the process pool.

    Returns:
        List with one result per row, in row order.
    """
    with mp.Pool(num_workers) as pool:
        # Pair every row with func up front, since imap passes one argument per task.
        tasks = [(row, func) for _, row in df.iterrows()]
        outcomes = pool.imap(sim_check_wrapper, tasks)
        results = list(tqdm(outcomes, total=len(df)))
    return results

# Check every (post, picked question) pair with sim_check, one worker per CPU,
# and store the results in the gpt_sim column.
reddit["gpt_sim"] = parallel_apply(reddit, sim_check, num_workers=mp.cpu_count())

100%|██████████| 28809/28809 [28:14<00:00, 17.00it/s]  


In [7]:
# Overwrite the CSV saved after the pick step so it also carries the gpt_sim column.
reddit.to_csv('../data/matched_gpt_4o_mini.csv', index=False)

In [8]:
# Share of rows judged to be the same question, computed over the rows whose
# gpt_sim compares equal to True or to False.
is_same = reddit['gpt_sim'] == True
is_different = reddit['gpt_sim'] == False
count_false = len(reddit[is_different])
count_true = len(reddit[is_same])
true_ratio = count_true / (count_false + count_true)
print(f"True ratio: {true_ratio}")

True ratio: 0.2040681731403381


In [13]:
# Display (rows, columns) of the subset where gpt_sim is True.
reddit[reddit['gpt_sim'] == True].shape

(5879, 16)

In [12]:
# Save only the rows where gpt_sim is True as the filtered dataset.
reddit[reddit['gpt_sim'] == True].to_csv('../data/filtered_gpt_4o_mini.csv', index=False)