In [1]:
import json
import pandas as pd
from FlagEmbedding import BGEM3FlagModel
import os
import json
import numpy as np
import pickle
from openai import OpenAI
import os
import multiprocess as mp
from tqdm import tqdm

In [2]:
# Load the cached Zhihu data; element 0 of the pickled object is bound to
# zh_sentences and element 1 to zh_emb.
# NOTE: pickle.load can execute arbitrary code - only open files you produced yourself.
with open('../data/emb/zhihu_emb.pkl', 'rb') as f:
    raw = pickle.load(f)
zh_sentences, zh_emb = raw[0], raw[1]

In [3]:
# Instantiate the BGE-M3 embedding model used by the cells below.
# Setting use_fp16 to True speeds up computation with a slight performance degradation.
emb_model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

In [4]:
# Embed every Reddit post and attach its top-k most similar Zhihu sentences.

# Select and rename in one chain so `reddit` is a fresh frame; adding columns to a
# column-subset selection further down can otherwise trigger SettingWithCopyWarning.
reddit = (
    pd.read_csv('../data/reddit_post.csv')[["message_id", "title", "message"]]
    .rename(columns={"title": "question", "detail": "detail", "message": "detail"})
)

# Empty CSV fields are read as NaN, and str(NaN) is the literal text "nan".
# Blank them out so "nan" is not embedded as if it were part of the post.
reddit_questions = reddit['question'].fillna('').astype(str).tolist()
reddit_details = reddit['detail'].fillna('').astype(str).tolist()
reddit_sentences = [
    " ".join(part for part in (question, detail) if part)
    for question, detail in zip(reddit_questions, reddit_details)
]

reddit_emb = emb_model.encode(reddit_sentences)['dense_vecs']

# Cache sentences and embeddings together as [sentences, embeddings].
with open('../data/emb/reddit_emb.pkl', 'wb') as f:
    pickle.dump([reddit_sentences, reddit_emb], f)

# similarity[i, j] is the dot product of Reddit embedding i and Zhihu embedding j.
similarity = reddit_emb @ zh_emb.T

# Later cells read the columns top_1 .. top_5, so keep top_k >= 5.
top_k = 5
list_top_k = []
for i in range(similarity.shape[0]):
    # Ascending argsort reversed -> indices of the highest scores first.
    top_k_idx = np.argsort(similarity[i])[::-1][:top_k]
    top_k_sim = similarity[i][top_k_idx]
    list_top_k.append([(sim, zh_sentences[j]) for sim, j in zip(top_k_sim, top_k_idx)])

# One (sentence, score) column pair per rank, driven by top_k instead of
# copy-pasted assignments; column order is top_1, top_1_sim, top_2, ...
for rank in range(top_k):
    reddit[f'top_{rank + 1}'] = [matches[rank][1] for matches in list_top_k]
    reddit[f'top_{rank + 1}_sim'] = [matches[rank][0] for matches in list_top_k]

reddit.to_csv('../data/matched.csv', index=False)

pre tokenize: 100%|██████████| 113/113 [00:03<00:00, 35.27it/s]
You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Inference Embeddings: 100%|██████████| 113/113 [15:11<00:00,  8.06s/it]


In [25]:
def ir_top5(question, candidates):
    """Ask gpt-4o-mini which of five candidate questions best matches ``question``.

    Args:
        question: Text of the English question placed in the prompt.
        candidates: Indexable collection; items 0-4 are presented to the model
            as options A-E.

    Returns:
        The "response" field of the model's JSON reply. The requested schema
        restricts it to one of "A", "B", "C", "D", "E".

    Raises:
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    user_prompt = (
        f"# Question:\n{question}\n\n"
        " # Candidates:\n"
        f"A. {candidates[0]}\n"
        f"B. {candidates[1]}\n"
        f"C. {candidates[2]}\n"
        f"D. {candidates[3]}\n"
        f"E. {candidates[4]}\n\n"
    )

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Given an English question, pick the most similar one from the list of 5 Chinese questions.",
            }
        ],
    }
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}],
    }

    # Strict JSON schema: the reply must be an object with a single
    # "response" key holding one of the five option letters.
    answer_schema = {
        "name": "similar_question_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "response": {
                    "type": "string",
                    "description": "The letter corresponding to the most similar question.",
                    "enum": ["A", "B", "C", "D", "E"],
                }
            },
            "required": ["response"],
            "additionalProperties": False,
        },
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        response_format={"type": "json_schema", "json_schema": answer_schema},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    reply = json.loads(response.choices[0].message.content)
    return reply["response"]

In [None]:
# reddit["gpt_pick"] = reddit.apply(lambda x: ir_top5(str(x['question'])+"\n"+str(x['detail']), [x['top_1'], x['top_2'], x['top_3'], x['top_4'], x['top_5']]), axis=1)
# reddit["gpt_pick_question"] = reddit.apply(lambda x: x[f"top_{'_ABCDE'.index(x['gpt_pick'])}"], axis=1)

def ir_top5_wrapper(args):
    """Unpack a ``(row, ir_top5_func)`` task and run the picker on that row.

    The query is the row's question and detail joined by a newline; the
    candidates are the row's ``top_1`` .. ``top_5`` values, in that order.
    Returns whatever ``ir_top5_func(query, candidates)`` returns.
    """
    row, ir_top5_func = args
    query = str(row['question']) + "\n" + str(row['detail'])
    candidates = [row[f'top_{rank}'] for rank in range(1, 6)]
    return ir_top5_func(query, candidates)

def get_gpt_pick_question(row):
    """Return the candidate text that the letter in ``row['gpt_pick']`` refers to.

    The letter's position in '_ABCDE' gives the rank ("A" -> 1 ... "E" -> 5),
    and the value stored under ``top_<rank>`` is returned. ``row`` must support
    string-key lookup.
    """
    rank = '_ABCDE'.index(row['gpt_pick'])
    return row[f"top_{rank}"]

def parallel_apply(df, func, num_workers=4):
    """Run ``func`` over every row of ``df`` in a process pool, with a progress bar.

    Each task is a ``(row, func)`` tuple handed to ``ir_top5_wrapper``, which
    unpacks it and calls ``func(query, candidates)``. Passing the tuple straight
    to ``func`` (as the previous version did) fails, because ``func`` takes two
    positional arguments rather than one tuple.

    Args:
        df: DataFrame whose rows provide 'question', 'detail' and 'top_1'..'top_5'.
        func: Two-argument callable such as ``ir_top5``.
        num_workers: Number of worker processes in the pool.

    Returns:
        List with one result per row; ``imap`` yields results in input order.
    """
    tasks = [(row, func) for _, row in df.iterrows()]
    with mp.Pool(num_workers) as pool:
        results = list(tqdm(pool.imap(ir_top5_wrapper, tasks), total=len(df)))
    return results

# Ask the model to pick the best candidate for every post (one API call per row).
reddit["gpt_pick"] = parallel_apply(reddit, ir_top5, num_workers=mp.cpu_count())

# Map each picked letter back to its candidate text.
# get_gpt_pick_question indexes the row by column name, so it needs the Series
# rows from iterrows(); the namedtuples produced by itertuples() raise TypeError
# on string keys. The lookup is trivial, so no process pool is used here.
reddit["gpt_pick_question"] = [
    get_gpt_pick_question(row) for _, row in reddit.iterrows()
]

reddit.to_csv('../data/matched_gpt_4o_mini.csv', index=False)

In [4]:
def sim_check(eng_q, chi_q):
    """Ask gpt-4o-mini whether an English and a Chinese question ask the same thing.

    Args:
        eng_q: Text of the English question placed in the prompt.
        chi_q: Text of the Chinese question placed in the prompt.

    Returns:
        The "response" field of the model's JSON reply; the requested schema
        declares it as a boolean.

    Raises:
        KeyError: If the OPENAI_API_KEY environment variable is not set.
    """
    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    user_prompt = (
        f"# English Question:\n{eng_q}\n\n"
        f" # Chinese Question:\n{chi_q}\n\n"
    )

    system_message = {
        "role": "system",
        "content": [
            {
                "type": "text",
                "text": "Given an English question and a Chinese question, determine whether they are asking the same question.",
            }
        ],
    }
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_prompt}],
    }

    # Strict JSON schema: the reply must be an object with a single boolean
    # "response" key.
    verdict_schema = {
        "name": "similar_question_response",
        "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "response": {
                    "type": "boolean",
                    "description": "Whether the English question and Chinese question are asking the same question.",
                }
            },
            "required": ["response"],
            "additionalProperties": False,
        },
    }

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message],
        response_format={"type": "json_schema", "json_schema": verdict_schema},
        temperature=1,
        max_completion_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    reply = json.loads(response.choices[0].message.content)
    return reply["response"]

In [None]:
# reddit["gpt_sim"] = reddit.apply(lambda x: sim_check(str(x['question'])+"\n"+str(x['detail']), x['gpt_pick_question']), axis=1)

def sim_check_wrapper(args):
    """Unpack a ``(row, sim_check_func)`` task and run the check on that row.

    The English side is the row's question and detail joined by a newline;
    the Chinese side is the row's 'gpt_pick_question' value. Returns whatever
    ``sim_check_func(english, chinese)`` returns.
    """
    row, sim_check_func = args
    english = str(row['question']) + "\n" + str(row['detail'])
    chinese = row['gpt_pick_question']
    return sim_check_func(english, chinese)

def parallel_apply(df, func, num_workers=4):
    """Run ``func`` over every row of ``df`` in a process pool, with a progress bar.

    Each row is paired with ``func`` and dispatched through ``sim_check_wrapper``.
    NOTE: this definition shares its name with the ``parallel_apply`` in the
    gpt_pick cell; whichever cell ran last is the one in effect.

    Args:
        df: DataFrame whose rows provide 'question', 'detail' and 'gpt_pick_question'.
        func: Two-argument callable such as ``sim_check``.
        num_workers: Number of worker processes in the pool.

    Returns:
        List with one result per row of ``df``.
    """
    with mp.Pool(num_workers) as pool:
        tasks = []
        for _, row in df.iterrows():
            tasks.append((row, func))
        progress = tqdm(pool.imap(sim_check_wrapper, tasks), total=len(df))
        results = list(progress)
    return results

# Run the same-question check (sim_check) on every row of the sample CSV, one call per row.
# NOTE(review): this rebinds `reddit` to the contents of matched_sample_gpt_4o_mini.csv,
# a file that no visible cell writes - confirm where it comes from.
reddit = pd.read_csv('../data/matched_sample_gpt_4o_mini.csv')
reddit["gpt_sim"] = parallel_apply(reddit, sim_check, num_workers=mp.cpu_count())

100%|██████████| 30/30 [00:01<00:00, 15.58it/s]


In [6]:
# Write the current `reddit` frame to disk (no index column).
# NOTE(review): this is the same path the gpt_pick cell writes to; if `reddit` was
# reloaded from the sample CSV in between, this replaces the full results - confirm.
reddit.to_csv('../data/matched_gpt_4o_mini.csv', index=False)

In [23]:
# Share of rows judged "same question" among rows with a True/False verdict.
count_true = int((reddit['gpt_sim'] == True).sum())
count_false = int((reddit['gpt_sim'] == False).sum())
true_ratio = count_true / (count_true + count_false)
print(f"True ratio: {true_ratio}")

True ratio: 0.3


In [26]:
# Display only the rows whose gpt_sim verdict is True.
reddit.loc[reddit['gpt_sim'] == True]

Unnamed: 0,question,detail,answer,topic,answer_length,top_1,top_1_sim,top_2,top_2_sim,top_3,top_3_sim,top_4,top_4_sim,top_5,top_5_sim,gpt_pick,gpt_pick_question,gpt_sim,gpt_pick_alt
3,I don‚Äôt know how to help my friend with an u...,"My friend, let‚Äôs call him Saul, has been smo...",You can't help an addict. It's their life and ...,Addiction,285,同学吸毒，约见面该不该去？,0.6025,面对我朋友的情况该怎么办？,0.5986,感觉班上有个同学像是得了精神病，我该怎么办？,0.5776,相信一个人真的可以为你戒烟吗？,0.5728,儿子沉迷游戏，与他讨论，他却说抽烟有害健康你还抽，该如何教育孩子？,0.5674,B,面对我朋友的情况该怎么办？,True,B
8,My brother has been super rude lately‚Ä¶,So my brother has been really rude and annoyin...,"Well if he‚Äôs like 16 and up, just be up fron...",Family,57,弟弟刚中考完 16 岁，中考结束每天游戏度日，因为爸爸说了他几句就敢跟爸爸顶嘴动手，我该怎么办？,0.6025,我表弟喜欢吃屎该怎么办?,0.5796,我兄弟问我 游戏比我重要 是什么意思?,0.5776,马上领证了，发现男朋友离不了游戏，让他少打游戏他会非常生气，正常吗？,0.5757,为什么现在很多论坛的游戏讨论区戾气很大？,0.573,A,弟弟刚中考完 16 岁，中考结束每天游戏度日，因为爸爸说了他几句就敢跟爸爸顶嘴动手，我该怎么办？,True,D
9,How to become more calm when gaming?,Recently I‚Äôve really been losing my cool pla...,It's definitely a frustrating feeling to have ...,Mental Health,61,如果做到玩游戏被骂还能心平气和？,0.649,我做了很多年游戏了，现在感觉越来越羞愧，该怎么调节心态呢？,0.64,打游戏会影响一个人的性格吗？,0.62,游戏本身不是为了娱乐为了放松吗？为什么有的人玩游戏会越玩越恼火心态越爆炸，可是他却越要继续玩...,0.6187,我打不赢别人，怎么办?,0.618,B,我做了很多年游戏了，现在感觉越来越羞愧，该怎么调节心态呢？,True,B
13,I read that skipping supper could lead to weig...,The thought behind it is that because eating a...,I'd recommend looking into [/r/intermittentfas...,Weight Loss,31,不吃晚饭真的会瘦很多吗？,0.671,晚上不吃饭真的能减肥吗？,0.6567,健身加不吃晚饭会瘦吗？,0.6353,晚上不吃饭坚持一周能瘦么？,0.6123,减肥期间该怎样吃宵夜？,0.6045,B,晚上不吃饭真的能减肥吗？,True,B
17,I actually hate myself. How do I build confide...,I am 21F and I have zero self confidence. I fi...,"You last sentence, however dire sounding, hold...",Mental Health,725,女生总是觉得自己长得丑，没有自信怎么办？,0.6294,一个人从自信变为彻底的自卑有多恐怖？,0.606,对自己高考不满意可以变得有多魔鬼？,0.603,小时候缺爱导致的性格缺陷让我逐步毁掉自己的人生该怎么办？,0.6025,你是通过什么事情才知道自己丑的？,0.6,A,女生总是觉得自己长得丑，没有自信怎么办？,True,A
19,What are some tips for your friend whenever th...,,I always say 1) be comfortable(to a degree) an...,Fashion,0,怎样快速学会穿搭和找到自己的风格？,0.672,如何从时尚杂志中学到东西？,0.6587,大家有什么好的建议吗？,0.657,男生如何找准自己的穿衣风格，提升衣着品味？,0.656,请教一句有关穿衣服态度的话？,0.6533,A,怎样快速学会穿搭和找到自己的风格？,True,A
21,What do I say during a job interview?,I am 14 and just got accepted for an interview...,Don't ask about pay or vaccation. Ask when you...,Job,39,招聘通常会被问什么问题？,0.656,HR 通常提的一些面试问题是什么？应该怎么回答？,0.653,面试文员一般会问哪些问题 该怎么回答呢？,0.638,面试者如何回应面试官问的「你有哪些要问我的？」？,0.6377,如果你是一个 Java 面试官，你会问哪些问题？,0.6147,B,HR 通常提的一些面试问题是什么？应该怎么回答？,True,B
23,I have a 2008 Corrolla. Not sure if I should h...,So it‚Äôs a 2008 with 125k miles and it‚Äôs pr...,Depends on whether or not you can comfortably ...,Car,40,汽车玻璃受了点小伤，不知道该修还是该换？,0.618,该不该买辆好车？,0.6,8～10万二手车购买二手车然后性能改装方案推荐？,0.596,麻烦懂的大伙给看看，走全损还是维修？提个建议?,0.5957,如何让自己的车子在售卖时更加保值？使用时需要注意哪些方面？,0.5815,D,麻烦懂的大伙给看看，走全损还是维修？提个建议?,True,D
24,"Getting ""back into life"" and now feeling lonel...","Hello people, i've been really lonely for the ...","I think that now that you have something, your...",Loneliness,266,出来实习了，工作第一天，发现怎么这么也不会，也不敢主动交际，怎么办？,0.6387,为什么毕业工作快一个月了，却会突然很悲伤?,0.635,刚刚步入社会，不会交流，很内向。是真的要改变性格？感觉很不讨喜，很痛苦。,0.627,去年从银行零售岗转到对公客户经理岗，无资源无经验，怀着美好的憧憬一头扎进来发现……我该怎么办？,0.616,毕业第二年，现在做办公室文员，可是明明很简单的工作，自己却做不好。害怕别人的否定，内心自卑，...,0.6113,A,出来实习了，工作第一天，发现怎么这么也不会，也不敢主动交际，怎么办？,True,F


In [27]:
# Reload the saved results from disk and display them to inspect what was written.
temp = pd.read_csv('../data/matched_gpt_4o_mini.csv')
temp

Unnamed: 0,message_id,question,detail,top_1,top_1_sim,top_2,top_2_sim,top_3,top_3_sim,top_4,top_4_sim,top_5,top_5_sim,gpt_pick,gpt_pick_question,gpt_sim
0,blo6w8,Overly angry mom,Let me set this straight to not concern anyone...,为什么《仙剑奇侠传三》中紫萱和徐长卿不可以在一起？,0.6520,如何评价君岛达己这 3 年的任天堂社长生涯？,0.6465,哪些演员的长相让你看着就觉得他智商不足？,0.6006,自然界有哪些很美好的现象？,0.5990,不爱科研，只为当大学老师而读博，这种做法对吗？,0.5890,E,不爱科研，只为当大学老师而读博，这种做法对吗？,False
1,bqx5pl,Spending Addiction?,"Hello Everyone,\n\nI recently got into a hobby...",严重外伤做止血时必须将纱布塞进伤口里吗？,0.6294,澳洲的创业环境如何？,0.6055,中高端的前卫琴（电吉他）推荐？,0.5933,女方如何让男方在房产证上加名字？,0.5737,基因测序在美国和中国都有哪些商业模式？,0.5720,C,中高端的前卫琴（电吉他）推荐？,False
2,bt57df,Unsure of how to proceed from here,"To sum up my situation, my ex and I dated for ...",家用NAS方案应当如何选择呢？,0.6360,为什么中国古典园林没有供人晒太阳的大草坪？,0.6157,2020年，年轻人买的第一辆车你有什么建议?,0.5990,如果对前途迷茫你该做什么？,0.5970,南京的高校是否衰落得挺厉害？,0.5880,D,如果对前途迷茫你该做什么？,False
3,blvjlt,How to make this relationship work,Hi r/advice\n\nLong time lurker first time pos...,中国产品在海外口碑这么差吗？,0.6445,35岁造价工程师老板让我转管理，我想太多了吗？,0.6167,拍拍贷律师函不理他可以吧。?,0.6147,如果对前途迷茫你该做什么？,0.6094,月经前非常怕冷的原因是什么？需要去医院看么？,0.6060,D,如果对前途迷茫你该做什么？,False
4,btj7m8,"My best friend was shot and killed yesterday, ...",,笛子、二胡、古筝、箫、鼓、古琴、琵琶哪个容易学？,0.5654,储能水电站的效率据说能达到80%，即抽水用电1度，用水的势能发电能有0.8度，真有这么高的效率吗？,0.5650,如何看待上海曹杨路地铁站人身伤亡事件？事故原因可能是什么？,0.5625,IF、红点、G-Mark 和 IDEA 四个全球设计大奖，哪个更权威？,0.5615,水星自转速度很慢，一天比一年还要长，这个单位是以什么为标准呢？,0.5566,C,如何看待上海曹杨路地铁站人身伤亡事件？事故原因可能是什么？,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28804,11bl6d4,Quite simply over attached. How do I ‘rectify’...,This title sounds like a really lonely life &a...,我觉得“希斯莱杰之后 再无小丑”这句话并不正确 有多少人和我有同感？,0.6240,阴道炎让你们花了多少钱?,0.6020,如果对前途迷茫你该做什么？,0.6016,月经前非常怕冷的原因是什么？需要去医院看么？,0.5810,如何评价三国杀十周年微信公众号公然辱骂原神玩家?,0.5780,C,如果对前途迷茫你该做什么？,False
28805,11bmk7n,How should I respond?,23M\n\nI've been low to no-contact with my par...,有哪些可以夸人长得漂亮的词？,0.6130,中国人寿回应「网传新冠保险阳了却不理赔」，称「尚未接到该客户提交的理赔资料」，如何看待这一回应？,0.5996,如何在 20 天内学完高中化学？,0.5970,拍拍贷律师函不理他可以吧。?,0.5938,如何评价 Netflix 纪录片《乖乖听话：邪教中的祈祷与服从》？,0.5938,D,拍拍贷律师函不理他可以吧。?,False
28806,11egtvx,Dumb question about job interviews,I have 4 interviews this week all for generall...,《三体》中，罗辑威胁三体文明时如果过了三十秒三体人还没有回应，罗辑会扣动扳机吗？,0.6367,中国足球未来怎么样才能持续进步？,0.6323,关于这篇武侠小说，作者还欠缺了多少功力？,0.6294,怎样挑选幼儿园，能避开「虐童老师」这类不合格教育者？,0.6216,卫青霍去病真的是庸才吗？吕思勉老先生在《中国通史》中批评很厉害啊，到底应该信吕先生的，还是影视剧？,0.6143,D,怎样挑选幼儿园，能避开「虐童老师」这类不合格教育者？,False
28807,11eiw57,I just can’t get him off,I recently acquired a bf (whom I care for grea...,五角大楼的新报告警告称「若再不采取紧急行动，中国就将在太空领域超越美国」，如何评价此番言论？,0.5870,学服装设计是怎样的体验？,0.5590,为什么国产摩托车比不过雅马哈、川崎、铃木和本田？,0.5586,Android 平台有哪些优秀的支持 Feedly 的 RSS 阅读器？,0.5576,想转健身教练，打算先买北体的教科书，背理论知识，求帮助？,0.5547,E,想转健身教练，打算先买北体的教科书，背理论知识，求帮助？,False
