In [None]:
from parser import DocxParser
from chunker import TextChunker
from retriever import DenseRetrieverConfig, DenseRetriever
from embedding import HuggingFaceEmbedding
from llm import QwenChat
from tqdm import tqdm
import glob
import pandas as pd
import re
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

llm_model = '/home/chenbohua/lz/models/Qwen3-4B'
vector_model = '/home/chenbohua/lz/models/bge-base-zh-v1.5'

In [None]:
DOCX_PATH = '/home/chenbohua/lz/初赛A榜/赛题制度文档'
dp = DocxParser()
tc = TextChunker()
all_paragraphs = []
all_chunks = []
for filepath in tqdm(glob.glob(f'{DOCX_PATH}/*/*.docx')):
    paragraphs = dp.parse(filepath)
    all_paragraphs.append(paragraphs)
for paragraphs in tqdm(all_paragraphs):
    chunks = tc.get_chunks(paragraphs, 256)
    all_chunks.extend(chunks)

In [None]:
dense_config = DenseRetrieverConfig(
    model_name_or_path=vector_model,
    dim=768,
    index_path='./index_store'
)
embedding_generator = HuggingFaceEmbedding(model_name=vector_model, device='cuda')
retriever = DenseRetriever(dense_config, embedding_generator)
retriever.build_from_texts(all_chunks)
retriever.save_index('./index_store')

In [None]:
model = QwenChat(path=llm_model, device='cuda')

In [None]:
COMP_DIR = '/home/chenbohua/lz/初赛A榜/数据集A'
# 构建用户提问模板
def get_query(row):
    category = row['category']
    question = row['question']
    if category == '选择题':
        content = row['content']
        return f'{category}: {question}\n{content}'
    else:
        return f'{category}: {question}'

train = pd.read_json(f'{COMP_DIR}/train.json', lines=True)
test = pd.read_json(f'{COMP_DIR}/testA.json', lines=True)
train['query'] = train.apply(lambda row: get_query(row), axis=1)
test['query'] = test.apply(lambda row: get_query(row), axis=1)
train_ans = pd.read_json(f'{COMP_DIR}/train_answer.json', lines=True)

In [None]:
results = []
for _, row in tqdm(test.iterrows(), total=len(test)):
    query = row['query']
    contents = retriever.retrieve(query=query, top_k=4)
    content = '\n'.join(['- ' + content['text'] for content in contents])
    result, _ = model.chat(query, [], content)
    results.append(result)

In [None]:
test['answer'] = results

In [None]:
def postprocessing(text):
    result = re.sub(r'\<think\>[\s\S]*\</think\>', '', text)
    return result.strip()
test['answer_p'] = test['answer'].apply(postprocessing)

In [None]:
res = []
i0 = 0
i1 = 0
i2 = 0 
for i, row in tqdm(test.iterrows()):
    # 正常有答案
    if (not re.search(r'\<think\>', row['answer_p'])) and (not re.search(r'\</think\>', row['answer_p'])):
        if row['category'] == '选择题':
            res.append(row['answer_p'].split(','))
        else:
            res.append(row['answer_p'])
        i0 += 1
    # 只有</think>
    elif re.search(r'\</think\>', row['answer_p']):
        new_ans = row['answer_p'].replace('</think>', '').strip()
        if row['category'] == '选择题':
            res.append(new_ans.split(','))
        else:
            res.append(new_ans)
        i1 += 1
    else:
        # 太长了, 没有思考完, 只有 <think>
        if row['category'] == '选择题':
            res.append(['C']) # 都选 C
        else:
            res.append(row['answer_p'][-50:]) # 最后五十个字
        i2 += 1
print(i0, i1, i2)

In [None]:
test['answer'] = res
test[['id', 'answer']].to_json('result.json', lines=True, orient='records')

In [61]:
import json
import pandas as pd
data = pd.read_json('result_qw3_14b_bak.json', lines=True)

In [62]:
null_count = 0

def process(item):
    global null_count
    if isinstance(item, str):
        if '\n\n\n' in item:
            item = item.split('\n\n\n')[1].strip()
        return item
    else:
        new_item = []
        for i in item:
            if '\n\n\n' in i:
                i = i.split('\n\n\n')[1].strip()
            if i in 'ABCD':
                new_item.append(i)
        if not len(new_item):
            new_item.append('C')
            null_count += 1
        return new_item

data['answer'] = data['answer'].apply(process)
null_count

523

In [63]:
# data.to_json('result_qw3_14b.json', lines=True, orient='records', force_ascii=False)
data['answer'][6]

['C']