這份 Notebook 示範如何使用 Chroma 向量資料庫來實作 RAG

In [1]:
# Fetch the value stored under 'openai_api_key' through google.colab's userdata helper;
# it is sent as the Bearer token in the OpenAI requests made further down.
from google.colab import userdata
openai_api_key = userdata.get('openai_api_key')

### 可使用 LangChain 來載入文件和 chunking

In [2]:
!pip install langchain

Collecting langchain
  Downloading langchain-0.1.20-py3-none-any.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.6-py3-none-any.whl (28 kB)
Collecting langchain-community<0.1,>=0.0.38 (from langchain)
  Downloading langchain_community-0.0.38-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-core<0.2.0,>=0.1.52 (from langchain)
  Downloading langchain_core-0.1.52-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.9/302.9 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langcha

In [3]:
import requests
import json
from pprint import pp

In [4]:
def get_completion(messages, model="gpt-3.5-turbo", temperature=0, max_tokens=1000, format_type=None, timeout=120):
  """Send a chat-completions request to the OpenAI HTTP API.

  Args:
    messages: list of chat messages, e.g. [{"role": "user", "content": "..."}].
    model: model name sent in the request.
    temperature: sampling temperature sent in the request.
    max_tokens: upper bound on generated tokens sent in the request.
    format_type: when truthy, sent as response_format={"type": format_type}.
    timeout: seconds to wait for the HTTP call before requests raises a timeout error.

  Returns:
    The content string of the first choice when the API answers with HTTP 200.
    Otherwise an error object is returned instead of raising: the API's "error"
    payload when present, or a small dict describing an unparsable response.
  """
  payload = { "model": model, "temperature": temperature, "messages": messages, "max_tokens": max_tokens }
  if format_type:
    payload["response_format"] = { "type": format_type }

  headers = { "Authorization": f'Bearer {openai_api_key}', "Content-Type": "application/json" }
  # Without a timeout a stalled connection would block the notebook cell indefinitely.
  response = requests.post(
    'https://api.openai.com/v1/chat/completions',
    headers = headers,
    data = json.dumps(payload),
    timeout = timeout
  )

  try:
    obj = json.loads(response.text)
  except json.JSONDecodeError:
    # The body is not JSON (for example an HTML error page); report it in the same
    # "error object" style as API errors instead of crashing on the parse.
    return { "type": "invalid_response", "message": f"HTTP {response.status_code}: {response.text[:200]}" }

  if response.status_code == 200:
    return obj["choices"][0]["message"]["content"]
  # Fall back to the whole body when the error response has no "error" key.
  return obj.get("error", obj)

In [5]:
def get_embeddings(input, dimensions = 1536, model="text-embedding-3-small", timeout=60):
  """Request an embedding vector for `input` from the OpenAI embeddings HTTP API.

  Args:
    input: text to embed (sent as the request's "input" field).
    dimensions: requested vector size, sent as the "dimensions" field.
    model: embedding model name.
    timeout: seconds to wait for the HTTP call before requests raises a timeout error.

  Returns:
    The "embedding" value of the first item in the response's "data" list.
    Note that only that first item is returned, whatever was passed as `input`.

  Raises:
    RuntimeError: if the API does not answer with HTTP 200 or the body is not JSON.
      Previously the error payload was returned in place of a vector, which then
      flowed into the vector store and failed there with an unrelated message.
  """
  payload = { "input": input, "model": model, "dimensions": dimensions }
  headers = { "Authorization": f'Bearer {openai_api_key}', "Content-Type": "application/json" }
  # Without a timeout a stalled connection would block the notebook cell indefinitely.
  response = requests.post(
    'https://api.openai.com/v1/embeddings',
    headers = headers,
    data = json.dumps(payload),
    timeout = timeout
  )

  try:
    obj = json.loads(response.text)
  except json.JSONDecodeError as exc:
    raise RuntimeError(f"Embeddings API returned a non-JSON response (HTTP {response.status_code})") from exc

  if response.status_code != 200:
    raise RuntimeError(f"Embeddings API request failed (HTTP {response.status_code}): {obj.get('error', obj)}")
  return obj["data"][0]["embedding"]

## 用 LangChain 的 Document Loader

https://python.langchain.com/docs/modules/data_connection/document_loaders/

In [6]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-4.2.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.4/290.4 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdf
Successfully installed pypdf-4.2.0


資料集用財經週報 PDF 文件:

* https://www.megabank.com.tw/personal/investment/bulletin/weekly-journal?searchyear=&searchmon=10


In [7]:
!wget https://www.megabank.com.tw/-/media/mega/files/bank/personal/fund/bulletin/weekly-journal/market-analysis/113/1130226.pdf

--2024-05-11 09:36:40--  https://www.megabank.com.tw/-/media/mega/files/bank/personal/fund/bulletin/weekly-journal/market-analysis/113/1130226.pdf
Resolving www.megabank.com.tw (www.megabank.com.tw)... 23.10.214.196
Connecting to www.megabank.com.tw (www.megabank.com.tw)|23.10.214.196|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1862439 (1.8M) [application/pdf]
Saving to: ‘1130226.pdf’


2024-05-11 09:36:41 (10.0 MB/s) - ‘1130226.pdf’ saved [1862439/1862439]



In [8]:
from langchain.document_loaders import PyPDFLoader

# Load the downloaded PDF into `text_docs`, a list of Document objects
# (each carries `page_content` plus `source` / `page` metadata).
loader = PyPDFLoader("1130226.pdf")
text_docs = loader.load()

# Alternative: load a web page instead of a PDF
#from langchain.document_loaders import WebBaseLoader
#loader = WebBaseLoader("https://eugeneyan.com/writing/llm-patterns/")
#text_docs = loader.load()


In [9]:
# How many Document objects the loader produced.
len(text_docs)

16

In [10]:
# Inspect the first Document to see its text and metadata layout.
text_docs[0]

Document(page_content='本資料純屬參考性質，兆豐商銀不作任何保證與承諾。上述資料，任何人因信賴此資料而做出或改變決策，本身須承擔一切風險，報告資料並無做出\n買賣任何內文所涉及之證券建議、誘導及鼓勵相關交易。\n本資料純屬參考性質，兆豐商銀不作任何保證與承諾。上述資料，任何人因信賴此資料而做出或改變決策，本身須承擔一切風險，報告資料並無做出\n買賣任何內文所涉及之證券建議、誘導及鼓勵相關交易。1\n財富管理處 投顧小組\n113年2月26日投資研究週報', metadata={'source': '1130226.pdf', 'page': 0})

## 安裝 Chroma

https://www.trychroma.com/

In [11]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-0.5.0-py3-none-any.whl (526 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.8/526.8 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting chroma-hnswlib==0.7.3 (from chromadb)
  Downloading chroma_hnswlib-0.7.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m20.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.111.0-py3-none-any.whl (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn[standard]>=0.18.3 (from chromadb)
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.5.0-py2.p

In [12]:
import chromadb
chroma_client = chromadb.Client()

# get_or_create_collection keeps this cell re-runnable: create_collection raises once
# "collection8" already exists in the running client, which breaks a second execution.
collection = chroma_client.get_or_create_collection(name="collection8")

## 用 LangChain 來拆 chunks

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configured with chunk_size=500 and chunk_overlap=100; it is applied to each
# page's text when the chunks are added to the collection.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

## 把 chunks 加到 chroma

In [14]:
# Index every page: split its text into chunks, embed each chunk, and store
# text + vector + metadata in the collection under a deterministic id.
for page in text_docs:
  chunks = text_splitter.split_text(page.page_content)
  if not chunks:
    # A page without extractable text yields no chunks, and Chroma rejects an
    # add() call whose lists are empty, so skip the page instead of failing.
    continue

  page_number = page.metadata['page']
  collection.add(
    documents = chunks,
    # If embeddings are omitted, Chroma falls back to its built-in default embedding
    # function (OpenAI embeddings are still recommended; results are much better).
    embeddings = [ get_embeddings(chunk) for chunk in chunks ],
    # Metadata stored here can be used later as a filter condition at retrieval time.
    metadatas = [ { "page": page_number, "date": "2024年2月26日" } for _ in chunks ],
    ids = [ f"doc-1-page-{page_number}-chunk-{x}" for x in range( len(chunks) ) ]
  )

In [15]:
# Sample user question used for the retrieval and prompt-building cells that follow.
question = "AI 產業的趨勢如何?"

### 可以找最相似 chunks 了，使用 chroma 提供的 API:

Chroma API 文件: https://docs.trychroma.com/usage-guide

In [16]:
# Embed the question and ask the collection for its 3 closest stored chunks.
results = collection.query(
    query_embeddings = get_embeddings(question),
    # A `where` argument can be added to filter on the stored metadatas, e.g. by date or page number
    n_results=3
)

In [17]:
# Show the raw query result (ids, distances, metadatas, documents).
results

{'ids': [['doc-1-page-13-chunk-0',
   'doc-1-page-12-chunk-1',
   'doc-1-page-13-chunk-1']],
 'distances': [[0.7384488582611084, 0.7645643949508667, 0.7736967206001282]],
 'metadatas': [[{'date': '2024年2月26日', 'page': 13},
   {'date': '2024年2月26日', 'page': 12},
   {'date': '2024年2月26日', 'page': 13}]],
 'embeddings': None,
 'documents': [['本資料純屬參考性質，兆豐商銀不作任何保證與承諾。上述資料，任何人因信賴此資料而做出或改變決策，本身須承擔一切風險，報告資料並無做出\n買賣任何內文所涉及之證券建議、誘導及鼓勵相關交易。讓AI產業成為您投資的神隊友\n•讓AI產業成為您投資的神隊友： Sora敲響「文生影片 」模型的戰役 ，為了不在這場競賽中脫隊 ，科技公司積極開發相\n關技術，因此對下一代 GPU和CPU的需求有增無減 ，投研機構 Market .us預期 2024年半導體市場規模將達到 6,731\n億美元，年成長 8%。佈局生產 AI晶片的股票成為參與 AI大趨勢的好方式 。\n•透過定期定額 ，把握 AI產業的長線趨勢行情： AIGC不僅是一種技術 ，更是一種改變未來的力量 ，甚至顛覆傳統遊戲\n開發，不僅能有效降低開發成本 ，更能加強遊戲內容豐富度與互動感 ，提升用戶付費頻率進而擴大市佔與獲利 。彭博\n預估，生成式 AI佔全球企業的支出比重將在 2032年由目前不到 3%提升至 12%，2032年AIGC的相關營收規模預估可',
   '定位為配角 NPC（Non -playable characters ，非玩家角色 ）隨著 AIGC的發展，AI有可能轉變成主導遊戲走向的要角 ，\n透過運算生成各種情境給玩家帶來全新體驗 。投研機構 Market Research 預期遊戲業 AI產值將從 2022年的 9億美元，\n在2032年成長到 71億美元，年複合成長率達 23%。市\n場\n回\n顧\n市\n場\n焦\n點\

### 組出 context

In [18]:
# results['documents'] holds one list per query; take the list for our single
# question and join the chunks into one string, each prefixed with "* ".
documents = results['documents'][0]
context = '\n'.join('* ' + doc for doc in documents)

In [19]:
# Print the assembled context exactly as it will be inserted into the prompt.
print(context)

* 本資料純屬參考性質，兆豐商銀不作任何保證與承諾。上述資料，任何人因信賴此資料而做出或改變決策，本身須承擔一切風險，報告資料並無做出
買賣任何內文所涉及之證券建議、誘導及鼓勵相關交易。讓AI產業成為您投資的神隊友
•讓AI產業成為您投資的神隊友： Sora敲響「文生影片 」模型的戰役 ，為了不在這場競賽中脫隊 ，科技公司積極開發相
關技術，因此對下一代 GPU和CPU的需求有增無減 ，投研機構 Market .us預期 2024年半導體市場規模將達到 6,731
億美元，年成長 8%。佈局生產 AI晶片的股票成為參與 AI大趨勢的好方式 。
•透過定期定額 ，把握 AI產業的長線趨勢行情： AIGC不僅是一種技術 ，更是一種改變未來的力量 ，甚至顛覆傳統遊戲
開發，不僅能有效降低開發成本 ，更能加強遊戲內容豐富度與互動感 ，提升用戶付費頻率進而擴大市佔與獲利 。彭博
預估，生成式 AI佔全球企業的支出比重將在 2032年由目前不到 3%提升至 12%，2032年AIGC的相關營收規模預估可
* 定位為配角 NPC（Non -playable characters ，非玩家角色 ）隨著 AIGC的發展，AI有可能轉變成主導遊戲走向的要角 ，
透過運算生成各種情境給玩家帶來全新體驗 。投研機構 Market Research 預期遊戲業 AI產值將從 2022年的 9億美元，
在2032年成長到 71億美元，年複合成長率達 23%。市
場
回
顧
市
場
焦
點
聚
焦
議
題
資
產
觀
點
132022~2032 年AIGC 遊戲產值 全球資訊長規劃 AI應用生產投入時程
（單位：百萬美元）
資料來源： Morgan Stanley 資訊長調查， 2023.10 資料來源 ：MarketResearch.biz ，2023.12
* 預估，生成式 AI佔全球企業的支出比重將在 2032年由目前不到 3%提升至 12%，2032年AIGC的相關營收規模預估可
達1.3兆美元，年複合成長率達 42%。新一輪新的 AI生產力革命已至 ，不妨透過定期定額紀律投資 ，把握 AI產業的長
線趨勢行情 ，掌握承接優質 AI產業的投資機會 。市
場
回
顧
市
場
焦
點
聚
焦
議
題
資
產
觀
點
14全球半導體市場規模
（單位：百萬美元）
資料來源： Market.us 

## 組出 prompt 來問 LLM

In [20]:
# Prompt template: the retrieved chunks go inside <document>, followed by the question.
# The model is told to list short supporting quotes first, then give an answer that cites
# them by bracketed number, and to reply in Taiwanese Traditional Chinese (last line).
prompt = f"""
I'm going to give you a document. Then I'm going to ask you a question about it. I'd like you to first write down exact quotes of parts of the document that would help answer the question, and then I'd like you to answer the question using facts from the quoted content. Here is the document:

<document>
{context}
</document>

Here is the first question:  {question}

First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order. Quotes should be relatively short.

If there are no relevant quotes, write "No relevant quotes" instead.

Then, answer the question, starting with "Answer:".  Do not include or reference quoted content verbatim in the answer. Don't say "According to Quote [1]" when answering. Instead make references to quotes relevant to each section of the answer solely by adding their bracketed numbers at the end of relevant sentences.

Thus, the format of your overall response should look like what's shown between the <example></example> tags.  Make sure to follow the formatting and spacing exactly.

<example>

Relevant quotes:
[1] "Company X reported revenue of $12 million in 2021."
[2] "Almost 90% of revenue came from widget sales, with gadget sales making up the remaining 10%."

Answer:
Company X earned $12 million. [1]  Almost 90% of it was from widget sales. [2]

</example>

If the question cannot be answered by the document, say so.

Answer the question immediately without preamble.
請用台灣繁體中文回答.
"""

In [21]:
# Send the prompt as a single user message; get_completion returns the answer text,
# or the API's error object if the request did not succeed.
result = get_completion([ {"role": "user", "content": prompt }], model="gpt-4-turbo-preview")
print(result)

Relevant quotes:
[1] "科技公司積極開發相關技術，因此對下一代 GPU和CPU的需求有增無減 ，投研機構 Market .us預期 2024年半導體市場規模將達到 6,731億美元，年成長 8%。"
[2] "彭博預估，生成式 AI佔全球企業的支出比重將在 2032年由目前不到 3%提升至 12%，2032年AIGC的相關營收規模預估可達1.3兆美元，年複合成長率達 42%。"
[3] "投研機構 Market Research 預期遊戲業 AI產值將從 2022年的 9億美元，在2032年成長到 71億美元，年複合成長率達 23%。"

Answer:
AI產業的趨勢顯示出強勁的成長潛力。科技公司對於下一代GPU和CPU的需求持續增加，預示著半導體市場將在未來幾年內持續擴大[1]。此外，生成式AI在全球企業支出中的比重預計將從不到3%增加到12%，且相關營收規模預估將達到1.3兆美元，顯示出極高的年複合成長率[2]。在遊戲業方面，AI的產值也預期將從9億美元增長到71億美元，這進一步證明了AI產業的快速成長和潛力[3]。


## 最後做個 demo 的 UI

In [22]:
!pip install gradio

Collecting gradio
  Downloading gradio-4.31.0-py3-none-any.whl (12.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.3/12.3 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl (15 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.3.2.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio-client==0.16.2 (from gradio)
  Downloading gradio_client-0.16.2-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.4.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.7/8.7 MB[0m [31m42.2 MB/s[0m eta [36m0:00:00[0m
[?25hCol

In [None]:
import gradio as gr
gr.close_all()

def _build_prompt(context, query):
  """Return the quote-then-answer prompt with the retrieved context and the question filled in."""
  return f"""
I'm going to give you a document. Then I'm going to ask you a question about it. I'd like you to first write down exact quotes of parts of the document that would help answer the question, and then I'd like you to answer the question using facts from the quoted content. Here is the document:

<document>
{context}
</document>

Here is the first question:  {query}

First, find the quotes from the document that are most relevant to answering the question, and then print them in numbered order. Quotes should be relatively short.

If there are no relevant quotes, write "No relevant quotes" instead.

Then, answer the question, starting with "Answer:".  Do not include or reference quoted content verbatim in the answer. Don't say "According to Quote [1]" when answering. Instead make references to quotes relevant to each section of the answer solely by adding their bracketed numbers at the end of relevant sentences.

Thus, the format of your overall response should look like what's shown between the <example></example> tags.  Make sure to follow the formatting and spacing exactly.

<example>

Relevant quotes:
[1] "Company X reported revenue of $12 million in 2021."
[2] "Almost 90% of revenue came from widget sales, with gadget sales making up the remaining 10%."

Answer:
Company X earned $12 million. [1]  Almost 90% of it was from widget sales. [2]

</example>

If the question cannot be answered by the document, say so.

Answer the question immediately without preamble.
請用台灣繁體中文回答.
"""

def handle_input(query):
  """Answer one question from the UI: retrieve the 3 closest chunks, build the prompt, ask the LLM.

  Args:
    query: the text typed into the question box.

  Returns:
    Whatever get_completion returns for the assembled prompt, or a short hint
    when the question box was left blank.
  """
  if not query or not query.strip():
    # Nothing to embed or search for; answer locally instead of sending an
    # empty input to the embeddings API.
    return "請輸入問題"

  results = collection.query(
    query_embeddings = get_embeddings(query),
    n_results=3
  )

  # One list of documents per query; format ours as a "* "-prefixed list.
  documents = results['documents'][0]
  context = '\n'.join('* ' + doc for doc in documents)

  prompt = _build_prompt(context, query)
  result = get_completion([ {"role": "user", "content": prompt }], model="gpt-4-turbo-preview")
  return result

demo = gr.Interface(fn=handle_input,
                    inputs=[gr.Textbox(label="您的問題", lines=1)],
                    outputs=[gr.Textbox(label="回答", lines=10)],
                    allow_flagging="never",
                    title="與財經報告 PDF 聊天",
                    examples=[ ["AI產業趨勢如何?"], ["美國經濟如何?"], ["中國經濟如何?"], ["台灣經濟如何?"]]
                   )
# NOTE(review): share=True publishes a public URL, so anyone holding the link can
# trigger OpenAI calls billed to this notebook's API key; debug=True keeps the cell running.
demo.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://3920933beabedb6798.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


## 使用 Vector Store 的好處

* Vector Store 實作了複雜的 ANN 演算法，才可以處理非常多的向量數據
* 有 CRUD 功能，資料會增增減減
* 有 filters 功能: 若存資料時有帶 metadata，搜尋時可搭配過濾條件 (e.g. 用戶UI上有條件表單，或是透過 LLM 先擷取用戶是否有過濾條件 https://python.langchain.com/docs/modules/data_connection/retrievers/self_query )

Vector Store 各家能力和寫法都不同

Chroma 詳見: https://docs.trychroma.com/getting-started