In [None]:
from dotenv import load_dotenv
# Load environment variables from a .env file, if one is found
# (presumably the OpenAI API key used by the embedding cell -- confirm).
load_dotenv()

# Questions

# NOTE(review): none of the *_questions names listed below are defined in the
# visible cells, so this cell presumably raises NameError on a fresh kernel.
# The later cells read `diseases_questions` (loaded from YAML) rather than this
# `disease_questions` list -- confirm whether this list is still needed.
disease_questions=[
    obesity_questions, 
    cataract_questions, 
    dementia_questions, 
    diabetes_questions,
    rhinitis_questions,
    gastritis_questions,
    hair_loss_questions,
    hemorrhoid_questions,
    hypertension_questions,
    hyperlipidemia_questions,
    periodontal_disease_questions
    ]

In [6]:
# Load the question lists from the per-disease YAML files and merge them
# into one flat list of queries.
import yaml
# NOTE(review): absolute, machine-specific path; consider making it configurable.
path = 'c:/Users/USER/Desktop/GAS5_final_HSHCrew/AI/code/reranker/dataset/yaml/'
diseases = [
    'cataract',
    'obesity',
    'dementia',
    'diabetes',
    'rhinitis',
    'gastritis',
    'hair_loss',
    'hemorrhoid',
    'hypertension',
    'periodontal_disease',
    'hyperlipidemia',
    ]
diseases_questions = []
for disease in diseases:
    # NOTE(review): no explicit encoding is passed, so the platform default is
    # used -- confirm it matches how the YAML files were written.
    with open(path+f'{disease}_questions.yaml', 'r') as file:
        # NOTE: FullLoader can build more than plain data; yaml.safe_load is the
        # safer choice if these files could ever come from an untrusted source.
        loaded_questions = yaml.load(file, Loader=yaml.FullLoader)
    # An empty YAML document parses to None; treat it as "no questions"
    # instead of failing with a TypeError while extending the list.
    diseases_questions.extend(loaded_questions or [])
len(diseases_questions)


2200

# vectorDB 불러오기

In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Embedding client used to embed queries against the stored index.
embeddings = OpenAIEmbeddings()

# Reuse the client created above instead of constructing a second one.
# NOTE: allow_dangerous_deserialization=True lets the loader unpickle the saved
# docstore; only load an index that you created yourself or otherwise trust.
# NOTE(review): absolute, machine-specific path; consider making it configurable.
vector_store = FAISS.load_local(
    'c:/Users/USER/Desktop/GAS5_final_HSHCrew/AI/code/reranker/dataset/vectorDB',
    embeddings,
    allow_dangerous_deserialization=True,
)

# dataset 생성

In [None]:
import math
import itertools
import random
from rank_bm25 import BM25Okapi
# from sentence_transformers import SentenceTransformer, util
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
# 0. Device selection: run on CUDA when it is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# 1. Model slots
# The reranker models and tokenizers live in module-level variables so that
# load_models() only has to fetch each of them once; None means "not loaded yet".
bge_model = bge_tokenizer = None
cross_encoder_model = cross_tokenizer = None

def load_models(ranker_models):
    """
    Load the neural rerankers requested in ``ranker_models`` onto ``device``.

    Only the names 'BGE-Reranker' and 'Cross-Encoder' trigger a download/load;
    any other entry is ignored here. A model is loaded only while its
    module-level slot is still ``None``, so calling this again is a no-op for
    models that are already in memory. Results are stored in the module-level
    variables; nothing is returned.
    """
    global bge_model, bge_tokenizer, cross_encoder_model, cross_tokenizer

    bge_checkpoint = 'BAAI/bge-reranker-v2-m3'
    cross_checkpoint = 'cross-encoder/ms-marco-MiniLM-L-6-v2'

    # BGE reranker: tokenizer first, then the classification head model.
    if 'BGE-Reranker' in ranker_models and bge_model is None:
        bge_tokenizer = AutoTokenizer.from_pretrained(bge_checkpoint)
        bge_model = AutoModelForSequenceClassification.from_pretrained(bge_checkpoint)
        bge_model.to(device)

    # MS MARCO cross-encoder, loaded the same way.
    if 'Cross-Encoder' in ranker_models and cross_encoder_model is None:
        cross_tokenizer = AutoTokenizer.from_pretrained(cross_checkpoint)
        cross_encoder_model = AutoModelForSequenceClassification.from_pretrained(cross_checkpoint)
        cross_encoder_model.to(device)

# 2. 리랭커 모델로부터 점수 획득
def _score_with_sequence_classifier(tokenizer, model, query, doc_texts):
    """Score every (query, document) pair with a sequence-classification reranker.

    Returns one Python float per entry of ``doc_texts``, in the same order.
    """
    # Pair the same query with every document and move the batch to the device.
    inputs = tokenizer(
        [query] * len(doc_texts),
        doc_texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
    ).to(device)
    with torch.no_grad():
        logits = model(**inputs).logits
    # Drop the trailing label dimension and convert to plain floats in one go.
    return logits.squeeze(-1).tolist()


def get_ranker_scores(ranker_models, query, documents):
    """
    Score ``documents`` against ``query`` with each requested ranker.

    Args:
        ranker_models: iterable of ranker names. 'BM25', 'BGE-Reranker' and
            'Cross-Encoder' are scored; any other name keeps an empty entry.
        query: the query string.
        documents: mapping of document id -> document text.

    Returns:
        ``{ranker_name: {doc_id: float_score}}``. When ``documents`` is empty
        every ranker maps to an empty dict.

    The neural rerankers read the module-level tokenizers/models, so
    ``load_models`` must have been called for them beforehand.
    """
    ranker_scores = {model_name: {} for model_name in ranker_models}

    doc_ids = list(documents.keys())
    # Nothing to score: BM25 and the tokenizers cannot handle an empty corpus.
    if not doc_ids:
        return ranker_scores
    doc_texts = [documents[doc_id] for doc_id in doc_ids]

    # 1. BM25 (lexical)
    if 'BM25' in ranker_models:
        # Whitespace tokenisation for both corpus and query.
        # NOTE(review): for Korean text a morphological tokenizer would likely
        # give better lexical matches -- confirm whether this is acceptable.
        bm25 = BM25Okapi([text.split() for text in doc_texts])
        bm25_scores = bm25.get_scores(query.split())
        # Cast to float so all rankers return the same plain score type.
        ranker_scores['BM25'] = {
            doc_id: float(score) for doc_id, score in zip(doc_ids, bm25_scores)
        }

    # 2. BGE reranker
    if 'BGE-Reranker' in ranker_models:
        bge_scores = _score_with_sequence_classifier(bge_tokenizer, bge_model, query, doc_texts)
        ranker_scores['BGE-Reranker'] = dict(zip(doc_ids, bge_scores))

    # 3. Cross-encoder
    if 'Cross-Encoder' in ranker_models:
        cross_scores = _score_with_sequence_classifier(
            cross_tokenizer, cross_encoder_model, query, doc_texts
        )
        ranker_scores['Cross-Encoder'] = dict(zip(doc_ids, cross_scores))

    return ranker_scores

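# The steps below turn the ranker scores into pairwise preferences, ELO ratings, adaptive margins and training triples.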

# 3. 각 리랭커 모델의 우열관계 도출
def get_pairwise_preferences(ranker_scores):
    """
    Turn each ranker's scores into a list of (winner, loser) document pairs.

    For every unordered pair of documents scored by a ranker, the document
    with the strictly higher score is listed first. Pairs with equal scores
    produce no entry. Returns ``{ranker_name: [(winner_id, loser_id), ...]}``.
    """
    preferences = {}
    for ranker_name, doc_scores in ranker_scores.items():
        outcomes = []
        for first, second in itertools.combinations(list(doc_scores), 2):
            first_score, second_score = doc_scores[first], doc_scores[second]
            if first_score > second_score:
                outcomes.append((first, second))  # first document wins
            elif first_score < second_score:
                outcomes.append((second, first))  # second document wins
            # equal scores: no preference recorded
        preferences[ranker_name] = outcomes
    return preferences

# 4. ELO Rating 계산
def calculate_elo_ratings(documents, pairwise_preferences, k_factor=32, s=400, initial_rating=1500):
    """
    Compute one ELO rating per document from every ranker's pairwise outcomes.

    Args:
        documents: iterable of document ids (e.g. the documents dict).
        pairwise_preferences: ``{ranker_name: [(winner_id, loser_id), ...]}``.
        k_factor: maximum rating change per comparison.
        s: scale of the expected-score curve (a gap of ``s`` points corresponds
            to 10:1 expected odds).
        initial_rating: rating every document starts from.

    Returns:
        ``{doc_id: rating}``. Documents that never appear in a comparison keep
        ``initial_rating`` unchanged.

    Ratings are updated sequentially, ranker by ranker and pair by pair, so
    the result depends on the order of the preferences.
    """
    elo_ratings = {doc_id: initial_rating for doc_id in documents}

    for prefs in pairwise_preferences.values():
        for winner, loser in prefs:
            winner_rating = elo_ratings[winner]
            loser_rating = elo_ratings[loser]
            # Probability that the winner was expected to win before this game.
            expected_win = 1 / (1 + 10 ** ((loser_rating - winner_rating) / s))
            # The winner scored 1, so it gains exactly what the loser gives up.
            shift = k_factor * (1 - expected_win)
            elo_ratings[winner] = winner_rating + shift
            elo_ratings[loser] = loser_rating - shift

    return elo_ratings

# 5. Adaptive Margin 계산
def calculate_adaptive_margin(elo_ratings, s=400):
    """
    Derive a margin for every unordered pair of rated documents.

    The margin is ``1 - sigmoid(|Ra - Rb| / s)``: 0.5 for equal ratings and
    approaching 0 as the rating gap widens. Keys are ``(doc_a, doc_b)`` tuples
    in the iteration order of ``elo_ratings``; the reversed tuple is not stored.

    NOTE(review): the margin shrinks as the gap grows -- confirm this is the
    intended direction for the downstream loss.
    """
    def _margin_for_gap(gap):
        # Logistic win probability for the given rating gap, then its complement.
        win_probability = 1 / (1 + math.exp(-gap / s))
        return 1 - win_probability

    return {
        (first, second): _margin_for_gap(abs(elo_ratings[first] - elo_ratings[second]))
        for first, second in itertools.combinations(list(elo_ratings), 2)
    }

# 6. 트리플 생성
def generate_training_triples(query, documents, elo_ratings, adaptive_margins):
    """
    Build (query, positive, negative, margin) training records.

    For every ordered pair of documents where the first has the strictly
    higher ELO rating, one record is emitted with the higher-rated text as
    ``positive_document`` and the lower-rated text as ``negative_document``.
    Pairs with equal ratings produce nothing.

    Args:
        query: the query string stored in each record.
        documents: mapping of document id -> document text.
        elo_ratings: mapping of document id -> rating.
        adaptive_margins: mapping of ``(doc_id, doc_id)`` -> margin. A pair is
            looked up in either order, because the margin is stored once per
            unordered pair; 0.0 is used only when the pair is missing entirely.

    Returns:
        List of dicts with keys 'query', 'positive_document',
        'negative_document' and 'margin'.
    """
    triples = []
    doc_ids = list(documents.keys())

    for doc_a, doc_b in itertools.permutations(doc_ids, 2):
        if elo_ratings[doc_a] > elo_ratings[doc_b]:
            # The winner may be the second element of the stored key, so fall
            # back to the reversed key before defaulting to 0.0.
            margin = adaptive_margins.get((doc_a, doc_b))
            if margin is None:
                margin = adaptive_margins.get((doc_b, doc_a), 0.0)
            triples.append({
                'query': query,
                'positive_document': documents[doc_a],
                'negative_document': documents[doc_b],
                'margin': margin
            })
    return triples

# 7. 데이터셋 저장
def save_dataset_for_llama(triples, filename='llama_ranker_dataset.jsonl'):
    """
    Write ``triples`` to ``filename`` as JSON Lines (one JSON object per line).

    The file is opened in write mode with UTF-8 encoding, so existing content
    is replaced, and non-ASCII characters are written as-is rather than as
    escape sequences.
    """
    import json
    with open(filename, 'w', encoding='utf-8') as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + '\n' for record in triples
        )

# 예시 실행
if __name__ == "__main__":
    # Every question loaded from the per-disease YAML files is used as a query.
    queries = diseases_questions

    ranker_models = ["BM25", "BGE-Reranker", "Cross-Encoder"]

    # Number of candidate documents retrieved and ranked per query.
    TOP_K = 3

    # Load the neural rerankers once (no-op for models already in memory).
    load_models(ranker_models)

    # The retriever does not depend on the query, so build it once up front.
    retriever = vector_store.as_retriever(
        search_kwargs={
            'k': TOP_K,
        }
    )

    # Triples collected across all queries.
    all_triples = []

    for query in tqdm(queries):
        docs = retriever.invoke(query)

        # Skip queries for which fewer than TOP_K documents were retrieved.
        if len(docs) < TOP_K:
            print(f"쿼리 '{query}'에 대한 문서가 {TOP_K}개 미만입니다. 건너뜁니다.")
            continue

        # Ids are doc1..docK in retrieval order.
        documents = {
            f"doc{rank}": doc.page_content
            for rank, doc in enumerate(docs[:TOP_K], start=1)
        }

        # 1. Score the documents with every ranker.
        ranker_scores = get_ranker_scores(ranker_models, query, documents)

        # 2. Derive each ranker's pairwise preferences.
        pairwise_preferences = get_pairwise_preferences(ranker_scores)

        # 3. Aggregate all preferences into ELO ratings.
        elo_ratings = calculate_elo_ratings(documents, pairwise_preferences)
        print("\nELO Ratings:")
        for doc_id, rating in elo_ratings.items():
            print(f"{doc_id}: {rating:.2f}")

        # 4. Compute the adaptive margin for each document pair.
        adaptive_margins = calculate_adaptive_margin(elo_ratings)
        print("\nAdaptive Margins:")
        for (doc_a, doc_b), margin in adaptive_margins.items():
            print(f"Margin({doc_a}, {doc_b}) = {margin:.4f}")

        # 5. Build the training triples for this query.
        triples = generate_training_triples(query, documents, elo_ratings, adaptive_margins)
        print("\n트레이닝 트리플:")
        for triple in triples:
            print(triple)
        all_triples.extend(triples)

    # 6. Save the full dataset.
    save_dataset_for_llama(all_triples)

    print("\n전체 데이터셋이 'llama_ranker_dataset.jsonl' 파일로 저장되었습니다.")
