In [98]:
import pandas as pd
import os
import random
import cohere
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
import openai
from openai import OpenAI
from tqdm.notebook import tqdm

os.environ["TOKENIZERS_PARALLELISM"] = "false"

# initialize openai
os.environ['OPENAI_API_KEY']= ""
openai.api_key = os.environ["OPENAI_API_KEY"]

# initialize cohere
os.environ["CO_API_KEY"] = ""
co = cohere.Client()

import warnings
warnings.filterwarnings('ignore')


CohereError: No API key provided. Provide the API key in the client initialization or the CO_API_KEY environment variable.

In [72]:
# prepare dataset

from datasets import load_dataset
import pandas as pd


dataset = load_dataset("quora")
raw_df = dataset["train"].to_pandas() 
raw_df = raw_df.loc[raw_df['is_duplicate'] == True].reset_index(drop=True)

# 중복되는 id를 개별 컬럼으로 배치
raw_df["q1"] = raw_df["questions"].apply(lambda q: q["text"][0])
raw_df["q2"] = raw_df["questions"].apply(lambda q: q["text"][1])
raw_df["id1"] = raw_df["questions"].apply(lambda q: q["id"][0])
raw_df["id2"] = raw_df["questions"].apply(lambda q: q["id"][1])

q1_to_q2 = raw_df.copy().rename(columns={"q1": "text", "id1": "id", "id2": "dq_id"}).drop(columns=["questions", "q2"])
q2_to_q1 = raw_df.copy().rename(columns={"q2": "text", "id2": "id", "id1": "dq_id"}).drop(columns=["questions", "q1"])
flat_df = pd.concat([q1_to_q2, q2_to_q1])

flat_df = flat_df.sort_values(by=['id']).reset_index(drop=True)
flat_df.loc[flat_df['id'] == 568] 
flat_df = flat_df.loc[((flat_df['id'] <= 15000) & (flat_df['dq_id'] <= 15000))]

# 각 질문 하나당 중복되는 질문 id를 list 형태로 저장
df = flat_df.drop_duplicates("id")
df.loc[:, "duplicated_questions"] = df["id"].apply(lambda qid: flat_df[flat_df["id"] == qid]["dq_id"].tolist())
df = df.drop(columns=["dq_id", "is_duplicate"])
df.loc[:, 'length'] = [len(x) for x in df['duplicated_questions']]

df.loc[[len(i) > 2 for i in df.duplicated_questions]]
df.to_csv("./data/quora_dataset.csv", index=False)

In [73]:
df = pd.read_csv("./data/quora_dataset.csv")
df.head()

Unnamed: 0,text,id,duplicated_questions,length
0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1
1,"I'm a triple Capricorn (Sun, Moon and ascendan...",12,[11],1
2,How can I be a good geologist?,15,[16],1
3,What should I do to be a great geologist?,16,[15],1
4,How do I read and find my YouTube comments?,23,[24],1


In [74]:
text1 = df.loc[2,'text']
print(text1)

How can I be a good geologist?


In [75]:
text2 = df.loc[3,'text']
print(text2)

What should I do to be a great geologist?


In [76]:
def create_embeddings(txt_list, provider='openai'):
    if provider=='openai':
        client = OpenAI()

        response = client.embeddings.create(
        input=txt_list,
        model="text-embedding-3-small")
        responses = [r.embedding for r in response.data]

        return responses
    
    elif provider=='cohere':
        doc_embeds = co.embed(
        txt_list,
        input_type="search_document",
        model="embed-english-v3.0")
        return doc_embeds.embeddings
    else:
        assert False, "Double check provider name"

In [77]:
emb1 = create_embeddings(df.loc[2, 'text'])
emb2 = create_embeddings(df.loc[3, 'text'])

In [78]:
from numpy.linalg import norm

def cosine_similarity(vector_a, vector_b):
    """Calculate the cosine similarity between two vectors."""
    dot_product = np.dot(vector_a, vector_b)
    norm_a = norm(vector_a)
    norm_b = norm(vector_b)
    similarity = dot_product / (norm_a * norm_b)
    return similarity


In [79]:
print("Cosine 유사도 : {}.\n사용된 문장 : \n{}\n{}".format(cosine_similarity(emb1[0], emb2[0]), text1, text2))

Cosine 유사도 : 0.9153125306391902.
사용된 문장 : 
How can I be a good geologist?
What should I do to be a great geologist?


In [80]:
text3 = df.loc[4, 'text']

emb3 = create_embeddings(text3)
print("Cosine 유사도 : {}.\n사용된 문장 : \n{}\n{}".format(cosine_similarity(emb1[0], emb3[0]), text1, text3))

Cosine 유사도 : 0.1817481836952418.
사용된 문장 : 
How can I be a good geologist?
How do I read and find my YouTube comments?


In [81]:
text4 = df.loc[6, 'text']

emb3 = create_embeddings(text4)
print("Cosine 유사도 : {}.\n사용된 문장 : \n{}\n{}".format(cosine_similarity(emb1[0], emb3[0]), text1, text4))

Cosine 유사도 : 0.2795677393942889.
사용된 문장 : 
How can I be a good geologist?
What can make Physics easy to learn?


### Embedding vector Dataset 생성

In [82]:
# openai_emb = create_embeddings(df.text.tolist(), provider='openai')
# cohere_emb = create_embeddings(df.text.tolist(), 'cohere')

# e5 embeddings
# load gpu if possible
device = "cuda" if torch.cuda.is_available() else "cpu"

model_id = "intfloat/e5-base-v2"

# init tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()


def create_e5_emb(docs, model):
    """
    e5 embedding 모델을 활용하여 임베딩 벡터 생성
    """
    docs = [f"query: {d}" for d in docs]
    # tokenize
    tokens = tokenizer(
        docs, padding=True, max_length=512, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        out = model(**tokens)
        last_hidden = out.last_hidden_state.masked_fill(  # from last hidden state
            ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # average out embeddings per token (non-padding)
        doc_embeds = last_hidden.sum(dim=1) / tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy()

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/650 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [83]:
# data = df.text.tolist()
# batch_size = 128

# for i in tqdm(range(0, len(data), batch_size)):
#     i_end = min(len(data), i+batch_size)
#     data_batch = data[i:i_end]
#     # embed current batch
#     embed_batch = create_e5_emb(data_batch)
#     if i == 0:
#         emb3 = embed_batch.copy()
#     else:
#         emb3 = np.concatenate([emb3, embed_batch.copy()])
# emb3 = [list(e) for e in emb3]
# df['e5_emb'] = emb3
# df.to_csv("quora_dataset_emb.csv", index=False)

In [84]:
df = pd.read_csv("./data/quora_dataset_emb.csv")
# str -> list 형태로 변환
import json
df['openai_emb'] = df['openai_emb'].apply(json.loads)
df['cohere_emb'] = df['cohere_emb'].apply(json.loads)
df['e5_emb'] = df['e5_emb'].apply(json.loads)
df['duplicated_questions'] = df['duplicated_questions'].apply(json.loads)

In [85]:
df.head()

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1,"[-0.005765771958976984, -0.018585262820124626,...","[-0.05834961, -0.010795593, -0.04522705, 0.035...","[0.059878636, -0.15769655, -0.14131568, -0.546..."
1,"I'm a triple Capricorn (Sun, Moon and ascendan...",12,[11],1,"[0.026014558970928192, -0.014319832436740398, ...","[-0.022338867, -0.0063285828, -0.057128906, 0....","[0.08937627, -0.2954505, -0.33455396, -0.32940..."
2,How can I be a good geologist?,15,[16],1,"[0.005276682320982218, 0.004194203298538923, 0...","[-0.012535095, 0.005092621, -0.033233643, -0.0...","[0.0825816, -0.09264662, -0.78053623, -0.32416..."
3,What should I do to be a great geologist?,16,[15],1,"[0.015116829425096512, 0.0010464431252330542, ...","[-0.013465881, 0.0018148422, -0.052612305, 0.0...","[-0.1653303, 0.19044468, -0.8906647, -0.364357..."
4,How do I read and find my YouTube comments?,23,[24],1,"[0.03505030274391174, -0.0010134828044101596, ...","[-0.0047836304, 0.028137207, -0.037231445, -0....","[0.50644577, -0.62657785, -0.2523397, -0.17112..."


### Test set 선별

In [86]:
# now choose random 1000 rows of answers
test_query = random.choices(df.id, k=1000)

In [87]:
test_query[:5]

[2408, 8192, 2323, 12111, 9335]

In [88]:
test = df.loc[df.id.isin(test_query)]
test

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
0,Astrology: I am a Capricorn Sun Cap moon and c...,11,[12],1,"[-0.005765771958976984, -0.018585262820124626,...","[-0.05834961, -0.010795593, -0.04522705, 0.035...","[0.059878636, -0.15769655, -0.14131568, -0.546..."
5,How can I see all my Youtube comments?,24,[23],1,"[0.03636268153786659, -0.012933776713907719, -...","[0.03540039, 0.00944519, -0.018493652, -0.0660...","[0.51798314, -0.856419, -0.02671636, -0.262854..."
10,What would a Trump presidency mean for current...,31,"[6937, 12544, 11435, 32, 1101]",5,"[-0.013222329318523407, 0.02965708076953888, 0...","[0.012786865, 0.007156372, -0.008010864, -0.00...","[-0.19193286, 0.0355199, -0.3380041, 0.4697569..."
14,Why are so many Quora users posting questions ...,37,"[12639, 1358, 4951, 1357, 6551, 38]",6,"[0.01824173703789711, -0.004906218498945236, -...","[0.019226074, 0.01802063, -0.015594482, 0.0082...","[0.2927003, -0.54217947, -0.23348098, 0.230665..."
15,Why do people ask Quora questions which can be...,38,"[4950, 4407, 4408, 6552, 6551, 12638, 5041, 12...",14,"[0.02489926852285862, 0.0011929606553167105, 0...","[-0.002243042, 0.011894226, -0.027938843, 0.00...","[0.3183511, -0.48502415, -0.42100692, 0.181978..."
...,...,...,...,...,...,...,...
5507,How do I repair a cracked Apple iPad screen?,14932,[14933],1,"[-0.02857847511768341, 0.022044092416763306, -...","[-0.025787354, 0.006336212, -0.026367188, 0.00...","[-0.78678465, -0.084891155, -0.7357442, 0.1926..."
5513,What are the best 10 books on human psychology?,14956,[14957],1,"[-0.014451244845986366, 0.04510343819856644, 0...","[-0.002462387, 0.016586304, -0.0053596497, 0.0...","[-0.01930344, -0.3888001, -1.0665482, -0.23144..."
5520,What are the consequences for a dog that ate a...,14964,[14965],1,"[0.011731699109077454, -0.0027778330259025097,...","[0.0046577454, 0.0027656555, -0.023910522, 0.0...","[-0.060623508, -0.44962543, -1.0839481, 0.2325..."
5526,Why were the polls so inaccurate in the 2016 e...,14977,"[10435, 14976]",2,"[-0.00614636717364192, 0.025594908744096756, 0...","[0.045928955, 0.007270813, 0.03881836, 0.04837...","[-0.229896, -0.63861954, -0.67509526, 0.541378..."


In [89]:
from sklearn.metrics.pairwise import cosine_similarity

def search_top_k(search_df, search_df_column, id, topk):
    """
    search_df : search를 할 대상 dataframe
    search_df_column : search를 위해 사용될 embedding column name
    id : test query id
    topk : 유사도 기반으로 top-k개 선별
    """
    query = search_df.loc[search_df['id']==id, search_df_column].values[0]
    query_reshaped = np.array(query).reshape(1, -1)
    
    search_df = search_df.loc[search_df['id']!=id]
    # cosine similarity in batch
    similarities = cosine_similarity(query_reshaped, np.vstack(search_df[search_df_column].values)).flatten()
    
    search_df['similarity'] = similarities
    
    # Get top-k indices
    # hence we sort the topk indices again to ensure they are truly the top-k
    topk_indices = np.argpartition(similarities, -topk)[-topk:]
    topk_indices_sorted = topk_indices[np.argsort(-similarities[topk_indices])]
    
    # Retrieve the top-k results
    search_result = search_df.iloc[topk_indices_sorted]
    
    return search_result


- 각 테스트 질문당 데이터 전체를 대상으로 cosine_similarity를 계산하고
- openai embedding, cohere embedding에 대해 각각 질문 k 개씩 진행
- search_result format :
```json
{
    'question id' : cosine_sim 기준 유사한 질문 top-k개를 담은 pd.DataFrame,
    'question id' : ...
}
```

In [90]:
# 각 질문들 중, test 질문과 동일한 질문이 가장 유사하게 도출될 것이기 때문에
# test 질문을 제외한 top-5
query_results_openai = { k:search_top_k(df, 'openai_emb', k, 5) for k in test.id }
query_results_cohere = { k:search_top_k(df, 'cohere_emb', k, 5) for k in test.id }
query_results_e5 = { k:search_top_k(df, 'e5_emb', k, 5) for k in test.id }

In [91]:
test.loc[test.length==3].tail()

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb
4920,How do you disable a Yahoo account?,13133,"[5914, 4139, 3297]",3,"[0.05142635107040405, 0.011298739351332188, 0....","[0.059295654, 0.03527832, 0.015151978, -0.0056...","[-0.4747715, -0.028856754, -0.41105205, -0.582..."
4961,Who won the 2nd U.S. Presidential Debate?,13252,"[10965, 6676, 6675]",3,"[0.019203325733542442, -0.03866427019238472, -...","[0.017166138, -0.0041885376, -0.005996704, 0.0...","[-0.64972717, -0.74344456, -0.7384698, 0.34086..."
5100,Does semen taste good?,13681,"[4395, 4396, 13680]",3,"[-0.004688047803938389, 0.0003611133142840117,...","[0.022918701, 0.032470703, -0.008583069, -0.00...","[0.15672438, -0.20730066, -0.6771037, -0.06450..."
5274,What same food should I eat every day to prote...,14182,"[6194, 854, 14183]",3,"[-0.004403913859277964, -0.04025837033987045, ...","[0.023086548, 0.03036499, -0.08929443, -0.0823...","[0.18870209, -0.3007043, -0.9383021, -0.120102..."
5463,Why MS Dhoni left captaincy though he was a su...,14806,"[12516, 14807, 12515]",3,"[0.05268150195479393, 0.015848418697714806, 0....","[-0.015716553, 0.005004883, -0.036834717, 0.00...","[-0.7862142, -0.78122103, -0.85379386, 0.05855..."


In [92]:
query_results_openai[13133]

Unnamed: 0,text,id,duplicated_questions,length,openai_emb,cohere_emb,e5_emb,similarity
2257,How can you delete your Yahoo mail account?,5914,"[4140, 4139, 3297, 3298, 13133]",5,"[0.04474090412259102, 0.010254010558128357, 0....","[-0.011230469, 0.036224365, -0.023910522, 0.02...","[-0.42633027, -0.02495384, -0.13672635, -0.310...",0.814143
1562,How can can I delete my yahoo email account?,4139,"[3298, 4140, 3297, 13133, 5914]",5,"[0.040250178426504135, 0.00830832589417696, 0....","[-0.0085372925, 0.040893555, -0.024917603, 0.0...","[-0.32373163, -0.17512423, -0.11643425, -0.490...",0.806792
1232,What are some ways to delete my Yahoo Mail acc...,3297,"[4140, 13133, 3298, 4139, 5914]",5,"[0.04302453249692917, -0.01035858690738678, 0....","[-0.0045928955, 0.030288696, -0.0135269165, 0....","[-0.46334347, 0.06597824, -0.14693257, -0.0914...",0.793739
1233,What are some ways to delete my Yahoo Mail acc...,3298,"[4139, 3297, 4140, 5914]",4,"[0.042433131486177444, -0.0011997126275673509,...","[0.00422287, 0.04473877, -0.021118164, 0.01844...","[-0.43668953, -0.0137787815, -0.19602334, -0.0...",0.770388
1563,How can you delete your Yahoo Mail ID?,4140,"[3298, 5914, 4139, 3297]",4,"[0.0572517067193985, -0.01654321514070034, 0.0...","[-0.015411377, 0.035339355, -0.027999878, 0.02...","[-0.5276045, -0.021487916, -0.39395866, -0.062...",0.737111


In [93]:
def score_accuracy(full_df, tmp_df, test_id):
    """
    각 테스트 질문과 유사하다고 판단된 질문들 중, 실제 duplicated_questions에 들어있는 질문들을 count
    """
    duplicated_questions = full_df.loc[full_df['id'] == test_id, 'duplicated_questions'].values[0]

    # 본인 ID는 제외
    filtered_df = tmp_df[tmp_df['id'] != test_id]
    # 현재 retrieve 해온 ID들이, 테스트 질문 내에 들어있는 아이디들인지 count
    match_count = filtered_df['id'].isin(duplicated_questions).sum()

    # Calculate the accuracy in terms of percentage
    if filtered_df.shape[0]<len(duplicated_questions):
        percentage = (match_count / filtered_df.shape[0])
    else:
        percentage = (match_count / len(duplicated_questions))
    return percentage

In [94]:
accuracy_openai = [score_accuracy(df, query_results_openai[i], i) for i in query_results_openai.keys()]
accuracy_cohere = [score_accuracy(df, query_results_cohere[i], i) for i in query_results_cohere.keys()]
accuracy_e5 = [score_accuracy(df, query_results_e5[i], i) for i in query_results_e5.keys()]

In [95]:
np.mean(accuracy_openai)

0.9560979971387696

In [96]:
np.mean(accuracy_cohere)

0.9540951359084406

In [97]:
np.mean(accuracy_e5)

0.9488018597997139