# 测试通过嵌入检索的成功率 - 测试纯英文情况

## 准备

In [3]:
%%time

# Directory the document-summary index is persisted to and reloaded from.
INDEX_PATH="retrieve-en-index"
# Directory the downloaded articles are written to (one JSON file each).
DATA_PATH="retrieve-en-data"

# Proxy exported as HTTP_PROXY / HTTPS_PROXY while the articles are fetched.
# NOTE(review): this is a private LAN address — adjust it for your own network.
PROXY="http://192.168.0.134:7890"

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [11]:
%%time

# Evaluation set: each entry pairs an article URL with one question whose
# answer is expected to be found in that article. A retrieval counts as a hit
# when a returned node carries the same URL in its metadata.
test_data=[
    {
        "url": "https://www.bbc.com/news/articles/cn0099v8ywpo",
        "question": "How does the upcoming debate between Donald Trump and Joe Biden differ from past presidential debates in the United States?"
    },{
        "url": "https://www.bbc.com/news/articles/cx77l5ej2yyo",
        "question": "How did the BBC team ultimately identify the smuggler known as Jabal in Luxembourg?"
    },{
        "url": "https://www.bbc.com/news/articles/cl5y3d0dzk4o",
        "question": "What prompted Hong Kong to intensify its focus on \"patriotic\" education starting from 2020?"
    },{
        "url": "https://www.bbc.com/news/articles/c4nglpj3dllo",
        "question": "What is Webtoon Entertainment's market value ahead of its US listing, and what is its expected initial share price on Nasdaq?"
    },{
        "url": "https://www.bbc.com/news/articles/ceqd10qej32o",
        "question": "How did Cate Campbell reflect on her career and achievements after missing out on qualifying for the 2024 Paris Olympics?"
    }
]

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 4.53 µs


In [13]:
import os

# Route outgoing HTTP and HTTPS requests through the configured proxy
os.environ.update({'HTTP_PROXY': PROXY, 'HTTPS_PROXY': PROXY})

In [12]:
%%time

import requests
from gne import GeneralNewsExtractor

def get_news_data(url, timeout=30):
    """Download a news page and extract its structured article data.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get`` so a stalled connection cannot
            block the notebook forever.

    Returns:
        The dict produced by ``GeneralNewsExtractor.extract`` for the page,
        with an additional ``'url'`` key holding the requested URL.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On timeouts and connection failures.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error pages instead of extracting an "article" from them.
    response.raise_for_status()

    extractor = GeneralNewsExtractor()
    data = extractor.extract(
        response.text,
        # Comment lists are not part of the article body.
        noise_node_list=['//div[@class="comment-list"]'],
    )
    data['url'] = url
    return data

CPU times: user 76.1 ms, sys: 78.3 ms, total: 154 ms
Wall time: 54.8 ms


In [14]:
%%time

import json
import os

# Create the output directory from Python so the cell does not need a POSIX shell.
os.makedirs(DATA_PATH, exist_ok=True)

for news in test_data:
    data=get_news_data(news['url'])
    data['url']=news['url']

    # The title becomes the file name, so neutralise path separators: a title
    # such as "A/B" would otherwise point into a sub-directory that does not exist.
    file_stem = data.get("title") or ""
    for separator in {"/", os.sep}:
        file_stem = file_stem.replace(separator, "_")
    # An empty title would give ".json", and every such article would overwrite
    # the same file; fall back to the last path segment of the URL instead.
    if not file_stem.strip():
        file_stem = news['url'].rstrip('/').rsplit('/', 1)[-1]

    file_path = f'./{DATA_PATH}/{file_stem}.json'
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(data, json_file, indent=4)

CPU times: user 898 ms, sys: 5.75 ms, total: 904 ms
Wall time: 2.83 s


In [15]:
# Drop the proxy settings again; variables that are not set are simply skipped
for proxy_var in ('HTTP_PROXY', 'HTTPS_PROXY'):
    os.environ.pop(proxy_var, None)

In [16]:
%%time

from llama_index.core import(
    Document
)

def data2doc(news_data):
    """Wrap one extracted news record in a ``Document``.

    The article body (``news_data['content']``) becomes the document text,
    the descriptive fields are copied into the metadata in a fixed order,
    and the title is reused as the document id.

    Args:
        news_data: Mapping with the keys ``content``, ``title``,
            ``publish_time``, ``author``, ``url`` and ``images``.

    Returns:
        The populated ``Document``.
    """
    body = news_data['content']
    metadata_keys = ('title', 'publish_time', 'author', 'url', 'images')
    metadata = {key: news_data[key] for key in metadata_keys}

    document = Document(text=body, metadata=metadata)
    document.doc_id = document.metadata['title']
    return document

CPU times: user 2.59 s, sys: 203 ms, total: 2.79 s
Wall time: 2.6 s


In [17]:
%%time

from llama_index.core import SimpleDirectoryReader

import json

# Read every saved article file; the text of each loaded document is the raw
# JSON that was written by the download cell.
raw_documents=SimpleDirectoryReader(input_dir=f"./{DATA_PATH}").load_data(num_workers=4)

# Parse the JSON and rebuild a Document that carries the article metadata.
# (No doc_id is set on the raw documents: they are discarded right after this.)
docs=[data2doc(json.loads(raw_document.text)) for raw_document in raw_documents]

# `documents` is the name the index-building cell consumes.
documents=docs

len(docs)

CPU times: user 10.2 ms, sys: 0 ns, total: 10.2 ms
Wall time: 2.98 s


5

In [18]:
%%time

import nest_asyncio
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably required because the notebook kernel already runs a
# loop and the index is built with use_async=True — confirm.
nest_asyncio.apply()

CPU times: user 0 ns, sys: 1.55 ms, total: 1.55 ms
Wall time: 1.22 ms


In [19]:
%%time

# Load the LLM and the embedding model
# NOTE(review): get_llm / get_embedding are not defined in this notebook;
# presumably they are provided by ../utils2.py, executed below — confirm.
%run ../utils2.py

from llama_index.core import Settings

# Assign both models on the llama-index Settings object
Settings.llm=get_llm()
Settings.embed_model=get_embedding()

CPU times: user 545 ms, sys: 39.7 ms, total: 585 ms
Wall time: 584 ms


In [20]:
%%time

from llama_index.core import get_response_synthesizer
from llama_index.core import DocumentSummaryIndex
from llama_index.core.node_parser import SentenceSplitter

# Splitter applied to every document before indexing (chunk_size=1024).
splitter = SentenceSplitter(chunk_size=1024)

# Synthesizer handed to the index; uses the "tree_summarize" response mode
# with async calls enabled.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", 
    use_async=True
)

# Build the document-summary index over the converted articles.
# This is the slow step of the notebook: the recorded run took about 32 s
# for 5 documents (parsing, summarizing, embedding).
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents,
    transformations=[splitter],
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/5 [00:00<?, ?it/s]

Summarizing documents:   0%|          | 0/5 [00:00<?, ?it/s]

current doc id: Australia's
current doc id: behind Channel crossing which killed Sara, 7
current doc id: chool children sang anthem too softly
current doc id: market debut
current doc id: presidential debate?


Generating embeddings:   0%|          | 0/5 [00:00<?, ?it/s]

CPU times: user 394 ms, sys: 17.2 ms, total: 411 ms
Wall time: 32.4 s


In [21]:
%%time

# Write the index's storage context to INDEX_PATH so it can be reloaded later.
doc_summary_index.storage_context.persist(INDEX_PATH)

CPU times: user 18.1 ms, sys: 0 ns, total: 18.1 ms
Wall time: 17.9 ms


In [22]:
%%time

from llama_index.core import load_index_from_storage
from llama_index.core import StorageContext

# Rebuild the storage context from the persisted files in INDEX_PATH and load
# the index from it; this replaces the in-memory `doc_summary_index`.
storage_context = StorageContext.from_defaults(persist_dir=INDEX_PATH)
doc_summary_index = load_index_from_storage(storage_context)

CPU times: user 31.6 ms, sys: 0 ns, total: 31.6 ms
Wall time: 31.2 ms


## 检索

### k=1

In [29]:
%%time

from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

# Embedding-based retriever over the summary index, limited to the top-1
# match (similarity_top_k=1).
retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    similarity_top_k=1,
)

# One boolean per test question: True when at least one retrieved node comes
# from the article the question was written for (matched by URL).
results=[]

for news in test_data:
    retrieved_nodes = retriever.retrieve(news["question"])
    # .get(): a node without a 'url' entry counts as a miss instead of
    # aborting the whole evaluation with a KeyError.
    results.append(
        any(node.metadata.get('url')==news["url"] for node in retrieved_nodes)
    )

results

CPU times: user 19.3 ms, sys: 0 ns, total: 19.3 ms
Wall time: 482 ms


[True, True, True, True, True]

In [30]:
# Hit rate: share of questions whose source article was retrieved.
# Guarded so that an empty result list yields 0.0 instead of ZeroDivisionError.
results.count(True) / len(results) if results else 0.0

1.0

In [31]:
# Run the Hong Kong question from the test set on its own so the returned
# nodes can be inspected in the following cells.
hk_question = "What prompted Hong Kong to intensify its focus on \"patriotic\" education starting from 2020?"
retrieved_nodes = retriever.retrieve(hk_question)

len(retrieved_nodes)

1

In [33]:
# Text of the first retrieved node.
retrieved_nodes[0].text

'Many former opposition lawmakers and democracy campaigners have been jailed since 2020 under a controversial national security law that criminalised all forms of dissent. Depsite international condemnation, Beijing defends the law as essential for stability.\nIn the years since, patriotism has become a byword for China\'s growing control of the city. It\noverhauled Hong Kong\'s electoral system with a "patriots\' law"\nthat barred those seen as "unpatriotic" from political office. More recently, it\nbanned what has effectively been the city\'s unofficial anthem\n, a protest song called Glory to Hong Kong, because of its "seditious" possibilities.\nThe city has also set up a government committee to help "the new generation to really appreciate our Chinese culture, our Chinese history," Hong Kong\'s chief executive John Lee said.\nThe education bureau\'s latest efforts are seen as part of the same campaign.\nIn November last year, the bureau introduced a new subject which would require 

In [34]:
# Metadata of the first retrieved node; its 'url' is what the evaluation compares.
retrieved_nodes[0].metadata

{'title': 'chool children sang anthem too softly',
 'publish_time': '2024-06-27',
 'author': '',
 'url': 'https://www.bbc.com/news/articles/cl5y3d0dzk4o',
 'images': []}

### k=5

In [26]:
%%time

from llama_index.core.indices.document_summary import (
    DocumentSummaryIndexEmbeddingRetriever,
)

# Embedding-based retriever over the summary index, keeping the top-5
# matches (similarity_top_k=5).
retriever = DocumentSummaryIndexEmbeddingRetriever(
    doc_summary_index,
    similarity_top_k=5,
)

# One boolean per test question: True when at least one retrieved node comes
# from the article the question was written for (matched by URL).
results=[]

for news in test_data:
    retrieved_nodes = retriever.retrieve(news["question"])
    # .get(): a node without a 'url' entry counts as a miss instead of
    # aborting the whole evaluation with a KeyError.
    results.append(
        any(node.metadata.get('url')==news["url"] for node in retrieved_nodes)
    )

# Hit rate; guarded so an empty test set yields 0.0 instead of ZeroDivisionError.
results.count(True) / len(results) if results else 0.0

CPU times: user 20.9 ms, sys: 0 ns, total: 20.9 ms
Wall time: 483 ms


1.0

In [28]:
# Run the Hong Kong question from the test set on its own and count how many
# nodes the current retriever returns for it.
hk_question = "What prompted Hong Kong to intensify its focus on \"patriotic\" education starting from 2020?"
retrieved_nodes = retriever.retrieve(hk_question)

len(retrieved_nodes)

5