In [1]:
from sklearn.feature_extraction.text import CountVectorizer

def space_tokenizer(text):
    return text.split(" ")

vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word', tokenizer=space_tokenizer)

query_text = '本 契約 內容 的 變更 應 經由 誰 同意 並 批註 ？'
query_ngrams = vectorizer.fit_transform([query_text])
print(vectorizer.get_feature_names_out())

['並 批註' '內容 的' '同意 並' '契約 內容' '應 經由' '批註 ？' '本 契約' '的 變更' '經由 誰' '誰 同意'
 '變更 應']




In [2]:
import json

with open('../source/競賽資料集/dataset/preliminary/questions_v2_cleaned_ckip.json') as fp:
    questions = json.load(fp)

with open('../source/競賽資料集/reference_text/corpus_v2_cleaned_ckip.json') as fp:
    corpus = json.load(fp)

qid = 4
for question in questions:
    if question['qid'] == qid:
        source = question['source']
        query_ws = question['query_ws']
        print(source)
        print(query_ws)
        break

vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer='word', tokenizer=space_tokenizer)
query_ngrams = vectorizer.fit_transform([' '.join(query_ws)])

print(vectorizer.get_feature_names_out())

for source_id in source:
    print(f'source_id: {source_id}')
    source_text = corpus['insurance'][str(source_id)]
    doc_ngrams = vectorizer.transform([' '.join(source_text)])
    match_count = query_ngrams.dot(doc_ngrams.T).toarray()[0, 0]
    # print(match_count)
    query_ngram_count = query_ngrams.sum()
    ngram_score = match_count / query_ngram_count
    print(ngram_score)

[186, 627, 536, 179, 174, 178]
['本', '契約', '內容', '的', '變', '更', '應', '經由', '誰', '同意', '並', '批註']
['並 批註' '內容 的' '同意 並' '契約 內容' '應 經由' '本 契約' '的 變' '經由 誰' '誰 同意' '變 更'
 '更 應']
source_id: 186
1.0
source_id: 627
4.090909090909091
source_id: 536
0.45454545454545453
source_id: 179
0.9090909090909091
source_id: 174
0.45454545454545453
source_id: 178
0.36363636363636365


In [11]:
def dp_word_lcs(query, document):
    # 將查詢和文檔分割為單詞列表
    if isinstance(query, list):
        query_words = query
    else:
        query_words = query.split()
        
    if isinstance(document, list):
        document_words = document
    else:
        document_words = document.split()
    
    # 初始化動態規劃表格，大小為 (len(query_words) + 1) x (len(document_words) + 1)
    dp = [[0] * (len(document_words) + 1) for _ in range(len(query_words) + 1)]
    
    # 填充 DP 表格
    for i in range(1, len(query_words) + 1):
        for j in range(1, len(document_words) + 1):
            if query_words[i - 1] == document_words[j - 1]:  # 單詞匹配
                dp[i][j] = dp[i - 1][j - 1] + 1
            else:
                dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
    
    # LCS 長度位於表格的右下角
    return dp[len(query_words)][len(document_words)]

query = "example query"
document = "this is an example document containing query words"
lcs_length = dp_word_lcs(query, document)
print("LCS Length (in words):", lcs_length)

LCS Length (in words): 2


In [12]:
import json

with open('../source/競賽資料集/dataset/preliminary/questions_v2_cleaned_ckip.json') as fp:
    questions = json.load(fp)

with open('../source/競賽資料集/reference_text/corpus_v2_cleaned_ckip.json') as fp:
    corpus = json.load(fp)

qid = 4
for question in questions:
    if question['qid'] == qid:
        source = question['source']
        query_ws = question['query_ws']
        print(source)
        print(query_ws)
        break
    
for source_id in source:
    print(f'source_id: {source_id}')
    source_text = corpus['insurance'][str(source_id)]
    lcs_length = dp_word_lcs(query_ws, source_text)
    print(lcs_length)

[186, 627, 536, 179, 174, 178]
['本', '契約', '內容', '的', '變', '更', '應', '經由', '誰', '同意', '並', '批註']
source_id: 186
8
source_id: 627
8
source_id: 536
5
source_id: 179
8
source_id: 174
5
source_id: 178
5
