Documentation | 中文 | 日本語
Open-Retrievals is an easy-to-use Python framework for state-of-the-art (SOTA) text embeddings, oriented toward information retrieval and LLM retrieval-augmented generation (RAG), built on PyTorch and Transformers.
- Contrastive-learning-enhanced embeddings
- LLM embeddings
- Fast RAG demo
Prerequisites
```shell
pip install transformers
pip install faiss-cpu  # if necessary
pip install peft  # if necessary
```
With pip
```shell
pip install open-retrievals
```
Use Pretrained weights
```python
from retrievals import AutoModelForEmbedding

sentences = [
    "Hello world",
    "How are you doing?",
    "Open-retrievals is a text embedding library for RAG applications",
]
model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModelForEmbedding(model_name_or_path, pooling_method="mean", normalize_embeddings=True)
sentence_embeddings = model.encode(sentences, convert_to_tensor=True)
print(sentence_embeddings)
```
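Because `normalize_embeddings=True`, the dot product of any two returned vectors is their cosine similarity. The snippet below is a minimal follow-up sketch in plain PyTorch (nothing here is open-retrievals-specific).

```python
import torch

# sentence_embeddings has shape (num_sentences, dim) and is already L2-normalized,
# so the matrix product gives pairwise cosine similarities directly.
similarity = sentence_embeddings @ sentence_embeddings.T
print(torch.round(similarity, decimals=3))
```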
Build Index and Retrieval
```python
from retrievals import AutoModelForEmbedding, AutoModelForRetrieval

sentences = ['A dog is chasing car.', 'A man is playing a guitar.']
model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
index_path = './database/faiss/faiss.index'

# Embed the documents and persist a FAISS index to disk
model = AutoModelForEmbedding(model_name_or_path)
model.build_index(sentences, index_path=index_path)

# Embed the query and search the saved index
query_embed = model.encode("He plays guitar.")
matcher = AutoModelForRetrieval()
dists, indices = matcher.similarity_search(query_embed, index_path=index_path)
print(indices)
```
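The returned indices refer to positions in the list used to build the index, so mapping hits back to text is straightforward. A small sketch, assuming the usual one-row-per-query layout of `dists`/`indices`:

```python
# Map retrieved indices back to the indexed sentences (single query assumed).
for dist, idx in zip(dists[0], indices[0]):
    print(f"{sentences[int(idx)]}  (distance: {float(dist):.4f})")
```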
Rerank
```python
from torch.optim import AdamW
from transformers import AutoTokenizer, TrainingArguments, get_cosine_schedule_with_warmup
from retrievals import RerankCollator, RerankModel, RerankTrainer, RerankDataset

model_name_or_path: str = "microsoft/mdeberta-v3-base"
learning_rate: float = 3e-5
batch_size: int = 64
epochs: int = 3

# `data_args` is assumed to be defined elsewhere (e.g. parsed with HfArgumentParser)
train_dataset = RerankDataset(args=data_args)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)

model = RerankModel(model_name_or_path, pooling_method="mean")
optimizer = AdamW(model.parameters(), lr=learning_rate)
num_train_steps = int(len(train_dataset) / batch_size * epochs)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_train_steps)

training_args = TrainingArguments(
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    num_train_epochs=epochs,
    output_dir='./checkpoints',
)
trainer = RerankTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=RerankCollator(tokenizer, max_length=data_args.query_max_length),
)
trainer.optimizer = optimizer
trainer.scheduler = scheduler
trainer.train()
trainer.save_model('weights')
```
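At inference time a cross-encoder reranker scores each (query, passage) pair jointly. The sketch below illustrates that pattern with plain Transformers and a public reranker checkpoint; the checkpoint name and model class are illustrative assumptions, not the API of the weights saved above.

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Illustrative checkpoint; swap in your own fine-tuned reranker weights.
ckpt = "BAAI/bge-reranker-base"
tokenizer = AutoTokenizer.from_pretrained(ckpt)
reranker = AutoModelForSequenceClassification.from_pretrained(ckpt)
reranker.eval()

pairs = [
    ("what is open-retrievals?", "Open-Retrievals is a text embedding library for RAG."),
    ("what is open-retrievals?", "A man is playing a guitar."),
]
with torch.no_grad():
    inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt")
    scores = reranker(**inputs).logits.squeeze(-1)
print(scores)  # higher score = more relevant
```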
RAG with LangChain
```shell
pip install langchain
```
- Server
```python
from retrievals.tools.langchain import LangchainEmbedding, LangchainReranker
from retrievals import RerankModel
from langchain.retrievers import ContextualCompressionRetriever
from langchain_community.vectorstores import Chroma as Vectorstore

persist_directory = './database/faiss.index'
embeddings = LangchainEmbedding(model_name="BAAI/bge-large-zh-v1.5")
vectordb = Vectorstore(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)
retriever = vectordb.as_retriever(
    search_type="similarity",
    search_kwargs={"score_threshold": 0.15, "k": 30},
)

rank = RerankModel("maidalun1020/bce-reranker-base_v1", use_fp16=True)
reranker = LangchainReranker(model=rank, top_n=7)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=reranker, base_retriever=retriever
)

query = 'what is open-retrievals?'
docs = compression_retriever.get_relevant_documents(query)
```
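The compression retriever plugs into any LangChain chain. A minimal sketch of wiring it into a question-answering chain, assuming `llm` is any LangChain-compatible LLM you have already configured (it is not provided by open-retrievals):

```python
from langchain.chains import RetrievalQA

# `llm` is an assumption: any LangChain chat model / LLM object you already use.
qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=compression_retriever)
answer = qa_chain.invoke({"query": query})
print(answer)
```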
Fine-tune transformers weights with contrastive learning
```python
from transformers import AutoTokenizer
from retrievals import AutoModelForEmbedding, AutoModelForRetrieval, RetrievalTrainer, PairCollator, TripletCollator
from retrievals.losses import ArcFaceAdaptiveMarginLoss, InfoNCE, SimCSE, TripletLoss
from retrievals.data import RetrievalDataset, RerankDataset

model_name_or_path = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"

# `data_args`, `training_args`, `get_optimizer` and `get_scheduler` are assumed to be
# defined elsewhere in your training script (e.g. argument parsing and optimizer helpers)
train_dataset = RetrievalDataset(args=data_args)
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)

model = AutoModelForEmbedding(model_name_or_path, pooling_method="cls")
optimizer = get_optimizer(model, lr=5e-5, weight_decay=1e-3)
lr_scheduler = get_scheduler(optimizer, num_train_steps=int(len(train_dataset) / 2 * 1))  # batch_size=2, epochs=1

trainer = RetrievalTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    data_collator=TripletCollator(tokenizer, max_length=data_args.query_max_length),
    loss_fn=TripletLoss(),
)
trainer.optimizer = optimizer
trainer.scheduler = lr_scheduler
trainer.train()
```
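For intuition, `TripletLoss` pulls the query embedding toward its positive passage and pushes it away from the negative one. A plain-PyTorch sketch of such an objective (not the library's exact implementation):

```python
import torch.nn.functional as F

def triplet_loss(anchor, positive, negative, margin=0.5):
    # anchor / positive / negative: (batch, dim) embedding tensors
    pos_dist = 1 - F.cosine_similarity(anchor, positive)
    neg_dist = 1 - F.cosine_similarity(anchor, negative)
    return F.relu(pos_dist - neg_dist + margin).mean()
```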
Fine-tune an LLM for embedding with contrastive learning
```python
from retrievals import AutoModelForEmbedding

model = AutoModelForEmbedding(
    "mistralai/Mistral-7B-v0.1",
    pooling_method='cls',
    query_instruction='Instruct: Retrieve semantically similar text\nQuery: '
)
```
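A usage sketch reusing the `encode` call shown earlier. Whether `query_instruction` is prepended automatically can depend on the library version, so the instruction is applied explicitly here as a conservative assumption:

```python
queries = ["How much protein should a female eat?"]
# Prepend the instruction to queries only; documents are usually encoded without it.
query_embeddings = model.encode(
    ["Instruct: Retrieve semantically similar text\nQuery: " + q for q in queries],
    convert_to_tensor=True,
)
print(query_embeddings.shape)
```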
Search by Cosine similarity/KNN
```python
from retrievals import AutoModelForEmbedding, AutoModelForRetrieval

query_texts = ['A dog is chasing car.']
document_texts = ['A man is playing a guitar.', 'A bee is flying low']
model_name_or_path = "sentence-transformers/all-MiniLM-L6-v2"
model = AutoModelForEmbedding(model_name_or_path)
query_embeddings = model.encode(query_texts, convert_to_tensor=True)
document_embeddings = model.encode(document_texts, convert_to_tensor=True)

matcher = AutoModelForRetrieval(method='cosine')
dists, indices = matcher.similarity_search(query_embeddings, document_embeddings, top_k=1)
```
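For reference, the same cosine top-k search can be reproduced with plain PyTorch; this sketch mirrors what the retrieval call above computes, assuming `method='cosine'` means standard cosine similarity:

```python
import torch.nn.functional as F

# Pairwise cosine similarity between every query and every document, then top-k per query.
scores = F.cosine_similarity(query_embeddings.unsqueeze(1), document_embeddings.unsqueeze(0), dim=-1)
top = scores.topk(k=1, dim=-1)
print(top.values, top.indices)
```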