fix: rag langchain demo with retrievals
LongxingTan committed May 30, 2024
1 parent 556aa96 commit 0429d4e
Showing 29 changed files with 606 additions and 442 deletions.
50 changes: 43 additions & 7 deletions README.md
@@ -115,28 +115,64 @@ pip install chromadb
```

```python
-from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker
+from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker, LangchainLLM
from retrievals import RerankModel
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores import Chroma as Vectorstore
+from langchain.prompts.prompt import PromptTemplate
+from langchain.chains import RetrievalQA

-persist_directory = './database/faiss/faiss.index'
-embeddings = LangchainEmbedding(model_name_or_path="BAAI/bge-large-zh-v1.5")
+persist_directory = './database/faiss.index'
+embed_model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+rerank_model_name_or_path = "BAAI/bge-reranker-base"
+llm_model_name_or_path = "microsoft/Phi-3-mini-128k-instruct"
+
+embeddings = LangchainEmbedding(model_name_or_path=embed_model_name_or_path)
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
retrieval_args = {"search_type": "similarity_score_threshold", "search_kwargs": {"score_threshold": 0.15, "k": 10}}
retriever = vectordb.as_retriever(**retrieval_args)

ranker = RerankModel.from_pretrained("maidalun1020/bce-reranker-base_v1")
ranker = RerankModel.from_pretrained(rerank_model_name_or_path)
reranker = LangchainReranker(model=ranker, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=retriever
)

-query = 'what is open-retrievals?'
-docs = compression_retriever.invoke(query)
+llm = LangchainLLM(model_name_or_path=llm_model_name_or_path)
+
+RESPONSE_TEMPLATE = """[INST]
+<<SYS>>
+You are a helpful AI assistant. Use the following pieces of context to answer the user's question.<</SYS>>
+Anything between the following `context` html blocks is retrieved from a knowledge base.
+
+<context>
+{context}
+</context>
+REMEMBER:
+- If you don't know the answer, just say that you don't know, don't try to make up an answer.
+- Let's take a deep breath and think step-by-step.
+Question: {question}[/INST]
+Helpful Answer:
+"""
+
+PROMPT = PromptTemplate(template=RESPONSE_TEMPLATE, input_variables=["context", "question"])
+
+qa_chain = RetrievalQA.from_chain_type(
+    llm,
+    chain_type='stuff',
+    retriever=compression_retriever,
+    chain_type_kwargs={
+        "verbose": True,
+        "prompt": PROMPT,
+    }
+)
+
+user_query = 'Introduce this'
+response = qa_chain.invoke({"query": user_query})
+print(response)
```

[//]: # (**RAG with LLamaIndex**)
@@ -173,7 +209,7 @@ train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls")
-# model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
+# model = model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
optimizer = AdamW(model.parameters(), lr=5e-5)
num_train_steps=int(len(train_dataset) / batch_size * epochs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)
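Note: the updated demo loads an already-populated Chroma collection from `persist_directory`, so the store must be built once beforehand. A minimal sketch of that step, assuming documents are already chunked into strings (the `texts` below are illustrative and not part of this commit):

```python
from retrievals.tools.langchain import LangchainEmbedding
from langchain_community.vectorstores import Chroma

# Illustrative corpus; in practice, load and chunk your own documents.
texts = [
    "Open-retrievals provides embedding, retrieval and reranking models.",
    "Its LangChain tools wire those pieces into a RAG pipeline.",
]

embeddings = LangchainEmbedding(model_name_or_path="sentence-transformers/all-MiniLM-L6-v2")

# Embeds the chunks and writes the collection to disk; with chromadb >= 0.4,
# persistence is automatic whenever persist_directory is set.
vectordb = Chroma.from_texts(
    texts=texts,
    embedding=embeddings,
    persist_directory='./database/faiss.index',
)
```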
50 changes: 43 additions & 7 deletions README_ja-JP.md
@@ -109,28 +109,64 @@ pip install chromadb

- Server
```python
-from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker
+from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker, LangchainLLM
from retrievals import RerankModel
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores import Chroma as Vectorstore
+from langchain.prompts.prompt import PromptTemplate
+from langchain.chains import RetrievalQA

-persist_directory = './database/faiss/faiss.index'
-embeddings = LangchainEmbedding(model_name_or_path="BAAI/bge-large-zh-v1.5")
+persist_directory = './database/faiss.index'
+embed_model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+rerank_model_name_or_path = "BAAI/bge-reranker-base"
+llm_model_name_or_path = "microsoft/Phi-3-mini-128k-instruct"
+
+embeddings = LangchainEmbedding(model_name_or_path=embed_model_name_or_path)
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
retrieval_args = {"search_type": "similarity_score_threshold", "search_kwargs": {"score_threshold": 0.15, "k": 10}}
retriever = vectordb.as_retriever(**retrieval_args)

-ranker = RerankModel.from_pretrained("maidalun1020/bce-reranker-base_v1")
+ranker = RerankModel.from_pretrained(rerank_model_name_or_path)
reranker = LangchainReranker(model=ranker, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=retriever
)

-query = 'what is open-retrievals?'
-docs = compression_retriever.invoke(query)
+llm = LangchainLLM(model_name_or_path=llm_model_name_or_path)
+
+RESPONSE_TEMPLATE = """[INST]
+<<SYS>>
+You are a helpful AI assistant. Use the following pieces of context to answer the user's question.<</SYS>>
+Anything between the following `context` html blocks is retrieved from a knowledge base.
+
+<context>
+{context}
+</context>
+REMEMBER:
+- If you don't know the answer, just say that you don't know, don't try to make up an answer.
+- Let's take a deep breath and think step-by-step.
+Question: {question}[/INST]
+Helpful Answer:
+"""
+
+PROMPT = PromptTemplate(template=RESPONSE_TEMPLATE, input_variables=["context", "question"])
+
+qa_chain = RetrievalQA.from_chain_type(
+    llm,
+    chain_type='stuff',
+    retriever=compression_retriever,
+    chain_type_kwargs={
+        "verbose": True,
+        "prompt": PROMPT,
+    }
+)
+
+user_query = 'Introduce this'
+response = qa_chain.invoke({"query": user_query})
+print(response)
```

[//]: # (**RAG with LLamaIndex**)
@@ -167,7 +203,7 @@ train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls")
-# model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
+# model = model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
optimizer = AdamW(model.parameters(), lr=5e-5)
num_train_steps=int(len(train_dataset) / batch_size * epochs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)
50 changes: 43 additions & 7 deletions README_zh-CN.md
@@ -119,28 +119,64 @@ pip install chromadb
```

```python
-from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker
+from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker, LangchainLLM
from retrievals import RerankModel
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores import Chroma as Vectorstore
+from langchain.prompts.prompt import PromptTemplate
+from langchain.chains import RetrievalQA

-persist_directory = './database/faiss/faiss.index'
-embeddings = LangchainEmbedding(model_name_or_path="BAAI/bge-large-zh-v1.5")
+persist_directory = './database/faiss.index'
+embed_model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
+rerank_model_name_or_path = "BAAI/bge-reranker-base"
+llm_model_name_or_path = "microsoft/Phi-3-mini-128k-instruct"
+
+embeddings = LangchainEmbedding(model_name_or_path=embed_model_name_or_path)
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
retrieval_args = {"search_type": "similarity_score_threshold", "search_kwargs": {"score_threshold": 0.15, "k": 10}}
retriever = vectordb.as_retriever(**retrieval_args)

-ranker = RerankModel.from_pretrained("maidalun1020/bce-reranker-base_v1")
+ranker = RerankModel.from_pretrained(rerank_model_name_or_path)
reranker = LangchainReranker(model=ranker, top_n=3)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=retriever
)

-query = '1974年,谁获得了东南亚自由搏击的冠军?'
-docs = compression_retriever.invoke(query)
+llm = LangchainLLM(model_name_or_path=llm_model_name_or_path)
+
+RESPONSE_TEMPLATE = """[INST]
+<<SYS>>
+You are a helpful AI assistant. Use the following pieces of context to answer the user's question.<</SYS>>
+Anything between the following `context` html blocks is retrieved from a knowledge base.
+
+<context>
+{context}
+</context>
+REMEMBER:
+- If you don't know the answer, just say that you don't know, don't try to make up an answer.
+- Let's take a deep breath and think step-by-step.
+Question: {question}[/INST]
+Helpful Answer:
+"""
+
+PROMPT = PromptTemplate(template=RESPONSE_TEMPLATE, input_variables=["context", "question"])
+
+qa_chain = RetrievalQA.from_chain_type(
+    llm,
+    chain_type='stuff',
+    retriever=compression_retriever,
+    chain_type_kwargs={
+        "verbose": True,
+        "prompt": PROMPT,
+    }
+)
+
+user_query = '1974年,谁获得了东南亚自由搏击的冠军?'
+response = qa_chain.invoke({"query": user_query})
+print(response)
```

[//]: # (**搭配LLamaIndex构建RAG应用**)
@@ -177,7 +213,7 @@ train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls")
-# model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
+# model = model.set_train_type('pointwise') # 'pointwise', 'pairwise', 'listwise'
optimizer = AdamW(model.parameters(), lr=5e-5)
num_train_steps=int(len(train_dataset) / batch_size * epochs)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps)
Empty file.
4 changes: 0 additions & 4 deletions examples/0_embeddings/embed_pairwise.sh

This file was deleted.

Empty file.
@@ -5,7 +5,7 @@

accelerate launch \
--config_file conf/conf_llm.yaml \
-    embed_llm_finetune.py \
+    llm_finetune_for_embed.py \
--model_name_or_path mistralai/Mistral-7B-v0.1 \
--train_data \
--output_dir modeloutput \
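The `conf/conf_llm.yaml` accelerate config is not included in this diff. Assuming it follows the standard Hugging Face accelerate format, an equivalent file can be generated interactively with the stock CLI:

```shell
accelerate config --config_file conf/conf_llm.yaml
```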
File renamed without changes.
54 changes: 54 additions & 0 deletions examples/0_embeddings/pairwise_finetune.py
@@ -0,0 +1,54 @@
import torch.nn as nn
from datasets import load_dataset
from transformers import (
    AdamW,
    AutoTokenizer,
    TrainingArguments,
    get_linear_schedule_with_warmup,
)

from src.retrievals import (
    AutoModelForEmbedding,
    PairCollator,
    RetrievalTrainer,
    TripletCollator,
)
from src.retrievals.losses import (
    ArcFaceAdaptiveMarginLoss,
    InfoNCE,
    SimCSE,
    TripletLoss,
)

model_name_or_path: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
batch_size: int = 128
epochs: int = 3

train_dataset = load_dataset('shibing624/nli_zh', 'STS-B')['train']
train_dataset = train_dataset.rename_columns({'sentence1': 'query', 'sentence2': 'positive'})
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)
model = AutoModelForEmbedding.from_pretrained(model_name_or_path, pooling_method="cls")
model = model.set_train_type('pairwise')

optimizer = AdamW(model.parameters(), lr=5e-5)
num_train_steps = int(len(train_dataset) / batch_size * epochs)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=0.05 * num_train_steps, num_training_steps=num_train_steps
)

training_arguments = TrainingArguments(
    output_dir='./checkpoints',
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    remove_unused_columns=False,
)
trainer = RetrievalTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    data_collator=PairCollator(tokenizer, max_length=512),
    loss_fn=InfoNCE(nn.CrossEntropyLoss(label_smoothing=0.05)),
)
trainer.optimizer = optimizer
trainer.scheduler = scheduler
trainer.train()
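A quick sanity check after `trainer.train()` finishes is to encode a sentence pair with the saved weights. This sketch uses plain transformers with CLS pooling to match `pooling_method="cls"` above, and assumes the trainer writes a standard Hugging Face checkpoint (model plus tokenizer) to `./checkpoints`:

```python
import torch
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

checkpoint = './checkpoints'  # assumed output path from TrainingArguments above
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)
model.eval()

sentences = ['如何更换花呗绑定银行卡', '花呗更改绑定银行卡']
batch = tokenizer(sentences, padding=True, truncation=True, max_length=512, return_tensors='pt')
with torch.no_grad():
    out = model(**batch)

# CLS pooling: hidden state of the first token, then L2-normalize.
embeddings = F.normalize(out.last_hidden_state[:, 0], dim=-1)
print(embeddings @ embeddings.T)  # cosine similarity matrix
```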
@@ -1,3 +1,10 @@
"""
CUDA_VISIBLE_DEVICES=0 python pairwise_finetune2.py \
--model_name_or_path bert-base-multilingual-uncased \
--train_data ./example_data/toy_finetune_data.jsonl \
--output_dir modeloutput
"""

import logging
import os
import random
@@ -217,11 +224,7 @@ def main():
        cache_dir=model_args.cache_dir,
        use_fast=False,
    )
-
    train_dataset = TrainDatasetForEmbedding(args=data_args, tokenizer=tokenizer)
-    # for i in range(8):
-    #     print(train_dataset[i])
-    print(len(train_dataset))

    # model = PairwiseModel(model_args.model_name_or_path, pooling_method="mean")
    model = AutoModelForEmbedding.from_pretrained(model_args.model_name_or_path, pooling_method="mean")
4 changes: 0 additions & 4 deletions examples/1_retrieval/retrieval_faiss.py
@@ -82,13 +82,9 @@ def recall():
        batch_size=512,
        top_k=20,
    )
-    print(indices)

    dataset = load_from_disk(CFG.wikipedia_path)
    for i in range(len(df)):
        df.loc[i, "context"] = "-" + "\n-".join([dataset[int(j)]["text"] for j in indices[i]])
-
-    print(df)
-    print(df.columns)

    df.to_csv(CFG.output_dir + "context_df.csv", index=False)