In [18]:
import json

from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import MetadataMode
from sentence_transformers import SentenceTransformer


from llama_index.core.evaluation import EmbeddingQAFinetuneDataset
from llama_index.finetuning import generate_qa_embedding_pairs
from llama_index.finetuning import SentenceTransformersFinetuneEngine
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.core import VectorStoreIndex
from llama_index.core.schema import TextNode
from tqdm.notebook import tqdm
import pandas as pd

from config import OPENAI_API_KEY

In [6]:
# Source PDFs for the training and validation splits; each list is handed to
# load_corpus() to be read and chunked into nodes.
TRAIN_FILES = ["./data/rocket_liquid_prop/part_1.pdf"]
VAL_FILES = ["./data/rocket_liquid_prop/part_2.pdf"]

# File paths for serialized train/validation corpora.
# NOTE(review): not referenced by any cell visible in this notebook -- confirm
# whether they are still needed.
TRAIN_CORPUS_FPATH = "./data/train_corpus.json"
VAL_CORPUS_FPATH = "./data/val_corpus.json"

In [7]:
def load_corpus(files, verbose=False):
    """Read the given files and split them into nodes.

    Args:
        files: List of file paths passed to ``SimpleDirectoryReader`` as
            ``input_files``.
        verbose: When true, print progress messages and show the splitter's
            progress bar.

    Returns:
        The nodes produced by a default-configured ``SentenceSplitter`` over
        the loaded documents.
    """
    if verbose:
        print(f"Loading files {files}")

    documents = SimpleDirectoryReader(input_files=files).load_data()
    if verbose:
        print(f"Loaded {len(documents)} docs")

    # Default SentenceSplitter settings are used for chunking.
    splitter = SentenceSplitter()
    chunks = splitter.get_nodes_from_documents(documents, show_progress=verbose)
    if verbose:
        print(f"Parsed {len(chunks)} nodes")

    return chunks

In [9]:
# Load and chunk both splits; verbose=True prints doc/node counts per split.
train_nodes = load_corpus(TRAIN_FILES, verbose=True)
val_nodes = load_corpus(VAL_FILES, verbose=True)

Ignoring wrong pointing object 33 0 (offset 0)
Ignoring wrong pointing object 657 0 (offset 0)
Ignoring wrong pointing object 1202 0 (offset 0)
Ignoring wrong pointing object 1517 0 (offset 0)
Ignoring wrong pointing object 2153 0 (offset 0)


Loading files ['./data/rocket_liquid_prop/part_1.pdf']
Loaded 257 docs


Parsing nodes:   0%|          | 0/257 [00:00<?, ?it/s]

Ignoring wrong pointing object 47 0 (offset 0)
Ignoring wrong pointing object 860 0 (offset 0)
Ignoring wrong pointing object 1013 0 (offset 0)
Ignoring wrong pointing object 2030 0 (offset 0)
Ignoring wrong pointing object 2153 0 (offset 0)


Parsed 266 nodes
Loading files ['./data/rocket_liquid_prop/part_2.pdf']
Loaded 301 docs


Parsing nodes:   0%|          | 0/301 [00:00<?, ?it/s]

Parsed 307 nodes


In [None]:
# Set to True to (re)generate synthetic question/context pairs with the LLM.
# When False, the datasets saved by a previous run are loaded from disk.
generate_qa_pairs = False

if generate_qa_pairs:
    qa_llm = OpenAI(model="gpt-3.5-turbo")

    # generate_qa_embedding_pairs checkpoints its progress to `output_path` and
    # resumes from that file when it already exists. Left at the default, both
    # calls share one checkpoint, so the validation run resumes from the
    # training results and skips most validation nodes. Give each split its
    # own checkpoint file to keep them independent.
    train_dataset = generate_qa_embedding_pairs(
        llm=qa_llm,
        nodes=train_nodes,
        output_path="train_dataset.checkpoint.json",
    )
    val_dataset = generate_qa_embedding_pairs(
        llm=qa_llm,
        nodes=val_nodes,
        output_path="val_dataset.checkpoint.json",
    )

    # Persist the finished datasets so later runs can take the load branch.
    train_dataset.save_json("train_dataset.json")
    val_dataset.save_json("val_dataset.json")

else:
    train_dataset = EmbeddingQAFinetuneDataset.load_json("train_dataset.json")
    val_dataset = EmbeddingQAFinetuneDataset.load_json("val_dataset.json")


In [21]:
# Hugging Face identifiers of the base embedding models.
MEDIUM_MODEL_ID = "BAAI/bge-base-en-v1.5"
SMALL_MODEL_ID = "BAAI/bge-small-en-v1.5"

# No explicit device: sentence-transformers then selects an available
# accelerator itself and falls back to CPU, instead of failing on machines
# without CUDA (the recorded run below used "mps").
# NOTE(review): neither model object is used by the visible cells; they are
# kept in case later cells reference them -- drop if not.
medium_model = SentenceTransformer(MEDIUM_MODEL_ID)
small_model = SentenceTransformer(SMALL_MODEL_ID)

# `model_id` must be a model name/path string -- the engine constructs its own
# SentenceTransformer from it -- so pass the identifier, not a loaded model.
finetune_engine = SentenceTransformersFinetuneEngine(
    train_dataset,
    model_id=SMALL_MODEL_ID,
    model_output_path="test_model",
    val_dataset=val_dataset,
)

INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5
Load pretrained SentenceTransformer: BAAI/bge-small-en-v1.5


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
Use pytorch device_name: mps


In [22]:
# Run fine-tuning with the engine configured above.
# NOTE(review): the saved output shows this run ended in KeyboardInterrupt,
# so no completed training run is recorded in this notebook.
finetune_engine.finetune()

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Iteration:   0%|          | 0/54 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Fetch the fine-tuned embedding model from the engine.
embed_model = finetune_engine.get_finetuned_model()

In [None]:
# Show the model object's repr as the cell output.
embed_model

In [16]:
def evaluate(
    dataset,
    embed_model,
    top_k=5,
    verbose=False,
):
    """Score retrieval hits for every query in ``dataset``.

    The dataset's corpus is indexed with ``embed_model``; each query is then
    retrieved against that index and counted as a hit when the first relevant
    document id for the query appears among the ``top_k`` retrieved node ids.

    Args:
        dataset: Object exposing ``corpus`` (id -> text), ``queries``
            (query id -> query text) and ``relevant_docs`` (query id -> list
            of document ids).
        embed_model: Embedding model handed to ``VectorStoreIndex``.
        top_k: Number of nodes retrieved per query.
        verbose: Accepted but currently unused; the index is always built
            with ``show_progress=True``.

    Returns:
        A list with one dict per query, with keys ``is_hit``, ``retrieved``,
        ``expected`` and ``query`` (the query id).
    """
    corpus = dataset.corpus
    queries = dataset.queries
    relevant_docs = dataset.relevant_docs

    index = VectorStoreIndex(
        [TextNode(id_=doc_id, text=body) for doc_id, body in corpus.items()],
        embed_model=embed_model,
        show_progress=True,
    )
    retriever = index.as_retriever(similarity_top_k=top_k)

    def score_query(query_id, query):
        """Retrieve for one query and build its result record."""
        retrieved_ids = [hit.node.node_id for hit in retriever.retrieve(query)]
        # Only the first relevant doc is checked (assumes 1 relevant doc).
        expected_id = relevant_docs[query_id][0]
        return {
            "is_hit": expected_id in retrieved_ids,
            "retrieved": retrieved_ids,
            "expected": expected_id,
            "query": query_id,
        }

    return [
        score_query(query_id, query) for query_id, query in tqdm(queries.items())
    ]

In [None]:
# Evaluate a default-configured OpenAIEmbedding on the validation dataset.
ada = OpenAIEmbedding()
ada_val_results = evaluate(val_dataset, ada)

In [None]:
# Hit rate = mean of the per-query boolean "is_hit" column.
df_ada = pd.DataFrame(ada_val_results)
hit_rate_ada = df_ada["is_hit"].mean()
hit_rate_ada