In [2]:
import re
import json
from typing import Optional

import spacy
import numpy as np
from sentence_transformers import SentenceTransformer, util
from llama_cpp import Llama, LogitsProcessorList
from lmformatenforcer import CharacterLevelParser, JsonSchemaParser
from lmformatenforcer.integrations.llamacpp import build_llamacpp_logits_processor, build_token_enforcer_tokenizer_data

from src.utils.lexrank import degree_centrality_scores
from src.utils.prompts import get_summarization_prompt, get_question_prompt

from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.vectorstores import Chroma


SENTENCE_TRANSFORMERS_HOME="./models/embeddings"
MODEL_PATH = "models/llama-2-13b-chat.Q2_K.gguf"
EMBEDDING_MODEL_PATH = "all-mpnet-base-v2"


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
embedding_model = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL_PATH,
    cache_folder=SENTENCE_TRANSFORMERS_HOME
)

In [4]:
raw_documents = TextLoader('data/book1-txt.txt').load()

In [5]:
text_splitter = CharacterTextSplitter(separator=".", chunk_size=1000, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)
db = Chroma.from_documents(documents, embedding_model)

Created a chunk of size 1067, which is longer than the specified 1000
Created a chunk of size 2320, which is longer than the specified 1000
Created a chunk of size 1424, which is longer than the specified 1000
Created a chunk of size 1001, which is longer than the specified 1000


In [28]:
question = "What had Gulliver found on Liliput island?"

In [29]:
docs = db.similarity_search(question)
page_contents = [doc.page_content for doc in docs]
prompt = get_question_prompt(question, page_contents[0])

In [30]:
text_splitter2 = CharacterTextSplitter(chunk_size=256, chunk_overlap=0, separator=".",)
documents2 = text_splitter2.create_documents([page_contents[0]])
db2 = Chroma.from_documents(documents2, embedding_model)

Created a chunk of size 260, which is longer than the specified 256


In [31]:
len(documents2)

5

In [32]:
docs2 = db2.similarity_search(question)
page_contents = [doc.page_content for doc in docs2]

In [33]:
page_contents[0]

'Gulliver may\nbe a little dissatisfied; but I was resolved to fit the work as much as\npossible to the general capacity of readers. However, if my own\nignorance in sea affairs shall have led me to commit some mistakes, I\nalone am answerable for them, and if any traveller hath a curiosity to\nsee the whole work at large, as it came from the hand of the author, I\nwill be ready to gratify him.\n\nAs for any farther particulars relating to the author, the reader will\nreceive satisfaction from the first pages of the book.\n\n                                        RICHARD SYMPSON.\n\n[Illustration]\n\n[Illustration]\n\n\n\n\nTRAVELS.\n\nPART I.\n\n\n_A VOYAGE TO LILLIPUT_.\n\n\n\n\nCHAPTER I.\n\n     THE AUTHOR GIVES SOME ACCOUNT OF HIMSELF AND FAMILY: HIS FIRST\n     INDUCEMENTS TO TRAVEL. HE IS SHIPWRECKED, AND SWIMS FOR HIS LIFE;\n     GETS SAFE ASHORE IN THE COUNTRY OF LILLIPUT; IS MADE A PRISONER,\n     AND CARRIED UP THE COUNTRY.\n\n\nMy father had a small estate in Nottinghamshi

In [34]:
from pydantic import BaseModel

class SummaryAnswerFormat(BaseModel):
    answer: str

def get_question_prompt(question, context):
    system_prompt = "Please ensure that your responses only use the provided CONTEXT."
    message = f"Given the following CONTEXT: {context} \nAnswer the QUESTION: {question}"
    prompt_template = f'<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n{message} [/INST]'
    return prompt_template

In [42]:
prompt = get_question_prompt(question, page_contents[0].replace('\n', '').replace('  ', ''))
prompt

'<s>[INST] <<SYS>>\nPlease ensure that your responses only use the provided CONTEXT.\n<</SYS>>\n\nGiven the following CONTEXT: Gulliver maybe a little dissatisfied; but I was resolved to fit the work as much aspossible to the general capacity of readers. However, if my ownignorance in sea affairs shall have led me to commit some mistakes, Ialone am answerable for them, and if any traveller hath a curiosity tosee the whole work at large, as it came from the hand of the author, Iwill be ready to gratify him.As for any farther particulars relating to the author, the reader willreceive satisfaction from the first pages of the book.RICHARD SYMPSON.[Illustration][Illustration]TRAVELS.PART I._A VOYAGE TO LILLIPUT_.CHAPTER I. THE AUTHOR GIVES SOME ACCOUNT OF HIMSELF AND FAMILY: HIS FIRST INDUCEMENTS TO TRAVEL. HE IS SHIPWRECKED, AND SWIMS FOR HIS LIFE; GETS SAFE ASHORE IN THE COUNTRY OF LILLIPUT; IS MADE A PRISONER, AND CARRIED UP THE COUNTRY.My father had a small estate in Nottinghamshire; I 

In [36]:
# from book_worm import BookWorm
#
# book_worm = BookWorm()

llama_model_loader: loaded meta data with 19 key-value pairs and 363 tensors from models/llama-2-13b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 5120
llama_model_loader: - kv   4:                          llama.block_count u32              = 40
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 13824
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32          

In [None]:
from lmformatenforcer import CharacterLevelParser, JsonSchemaParser

result = book_worm._llamacpp_with_character_level_parser(prompt, JsonSchemaParser(SummaryAnswerFormat.schema()))

Llama.generate: prefix-match hit


In [None]:
import json

json.loads(result)

In [39]:
page_contents[0]

'Gulliver may\nbe a little dissatisfied; but I was resolved to fit the work as much as\npossible to the general capacity of readers. However, if my own\nignorance in sea affairs shall have led me to commit some mistakes, I\nalone am answerable for them, and if any traveller hath a curiosity to\nsee the whole work at large, as it came from the hand of the author, I\nwill be ready to gratify him.\n\nAs for any farther particulars relating to the author, the reader will\nreceive satisfaction from the first pages of the book.\n\n                                        RICHARD SYMPSON.\n\n[Illustration]\n\n[Illustration]\n\n\n\n\nTRAVELS.\n\nPART I.\n\n\n_A VOYAGE TO LILLIPUT_.\n\n\n\n\nCHAPTER I.\n\n     THE AUTHOR GIVES SOME ACCOUNT OF HIMSELF AND FAMILY: HIS FIRST\n     INDUCEMENTS TO TRAVEL. HE IS SHIPWRECKED, AND SWIMS FOR HIS LIFE;\n     GETS SAFE ASHORE IN THE COUNTRY OF LILLIPUT; IS MADE A PRISONER,\n     AND CARRIED UP THE COUNTRY.\n\n\nMy father had a small estate in Nottinghamshi

In [40]:
# replace /n with nothing before creating the prompt
# make the second similarity search based on all the responses
