In [1]:
import torch
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders.text import TextLoader
from langchain.text_splitter import TokenTextSplitter, RecursiveCharacterTextSplitter 
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain import HuggingFacePipeline,PromptTemplate
from transformers import AutoTokenizer,pipeline

import evaluate

  from .autonotebook import tqdm as notebook_tqdm


# Falcon7b-Instruct

## Define Embeding logic

In [2]:
def create_sbert_mpnet():
        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f'Embedding on device: {device}')
        return HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={"device": device})

In [3]:
embedding = create_sbert_mpnet()

Embedding on device: cuda


In [4]:
# Load the txt
loader = TextLoader("page1.txt")
documents = loader.load()

# Split documents and create text snippets
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
text_splitter = TokenTextSplitter(chunk_size=1000, chunk_overlap=10, encoding_name="cl100k_base")  # This the encoding for text-embedding-ada-002
texts = text_splitter.split_documents(texts)

vectordb_from_text = Chroma.from_documents(documents=texts, embedding=embedding, persist_directory=None)
vectordb_from_text.get()

{'ids': ['2e774bf6-2a30-11ee-9a76-394bc61c6320',
  '2e774bf7-2a30-11ee-9a76-394bc61c6320',
  '2e774bf8-2a30-11ee-9a76-394bc61c6320',
  '2e774bf9-2a30-11ee-9a76-394bc61c6320',
  '2e774bfa-2a30-11ee-9a76-394bc61c6320',
  '2e774bfb-2a30-11ee-9a76-394bc61c6320',
  '2e774bfc-2a30-11ee-9a76-394bc61c6320',
  '2e774bfd-2a30-11ee-9a76-394bc61c6320',
  '2e774bfe-2a30-11ee-9a76-394bc61c6320',
  '2e774bff-2a30-11ee-9a76-394bc61c6320',
  '2e774c00-2a30-11ee-9a76-394bc61c6320',
  '2e774c01-2a30-11ee-9a76-394bc61c6320',
  '2e774c02-2a30-11ee-9a76-394bc61c6320',
  '2e774c03-2a30-11ee-9a76-394bc61c6320',
  '2e774c04-2a30-11ee-9a76-394bc61c6320',
  '2e774c05-2a30-11ee-9a76-394bc61c6320',
  '2e774c06-2a30-11ee-9a76-394bc61c6320',
  '2e774c07-2a30-11ee-9a76-394bc61c6320',
  '2e774c08-2a30-11ee-9a76-394bc61c6320',
  '2e774c09-2a30-11ee-9a76-394bc61c6320',
  '2e774c0a-2a30-11ee-9a76-394bc61c6320',
  '2e774c0b-2a30-11ee-9a76-394bc61c6320',
  '2e774c0c-2a30-11ee-9a76-394bc61c6320',
  '2e774c0d-2a30-11ee-9a76-

## 7b-instruct

### Load LLM

In [5]:
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b-instruct")

hf_pipeline = pipeline(
        task="text-generation",
        model = "tiiuae/falcon-7b-instruct",
        tokenizer = tokenizer,
        trust_remote_code = True,
        max_new_tokens=100,
        model_kwargs={
                "device_map": 'auto', 
                "load_in_8bit": True, 
                "max_length": 512,
                'pad_token_id': 11,
                "torch_dtype":torch.bfloat16,
                'temperature' : 0.0,
                }
        )

Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.52s/it]


In [6]:
llm = hf_pipeline
llm.model.hf_device_map

{'': 0}

### QA Pipeline

In [7]:
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb_from_text.as_retriever(search_kwargs={"k":4})
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff",retriever=retriever)

# Defining a default prompt for flan models
question_falcon_template = """
Answer the question as truthfully as possible, and if the answer is not contained within the file,
say "I don't know."
context: {context}
question: {question}
answer: 
"""

QUESTION_FALCON_PROMPT = PromptTemplate(
    template=question_falcon_template, input_variables=["context","question"]
)
qa.combine_documents_chain.llm_chain.prompt = QUESTION_FALCON_PROMPT
qa.combine_documents_chain.verbose = False
# qa.return_source_documents = True

In [8]:
%%time
question = "What is the text about?"
qa({"query":question,})



CPU times: user 5.98 s, sys: 742 ms, total: 6.72 s
Wall time: 6.86 s


{'query': 'What is the text about?',
 'result': "\nThe text is about a declaration of a person named Oscar Alberto Islas Mendoza, who is a depositor in a bank. The text states that he has been in contact with the bank and has been informed of the bank's policies and procedures."}

In [9]:
%%time
question = "What is the date of escritura 34,501?"
qa({"query":question,})

CPU times: user 2.19 s, sys: 263 ms, total: 2.45 s
Wall time: 2.45 s


{'query': 'What is the date of escritura 34,501?',
 'result': '\nThe date of escritura 34,501 is 20 October 2011.'}

In [10]:
%%time
question = "What's the number of Contrato Mastro?"
qa({"query":question,})

CPU times: user 1.83 s, sys: 133 ms, total: 1.97 s
Wall time: 1.97 s


{'query': "What's the number of Contrato Mastro?",
 'result': '\nThe number of the contract is "AP000000718".'}

In [11]:
%%time
question = "Who is the representative of AB2C Leasing de Mexico?"
qa({"query":question,})

CPU times: user 4.24 s, sys: 444 ms, total: 4.68 s
Wall time: 4.68 s


{'query': 'Who is the representative of AB2C Leasing de Mexico?',
 'result': '\nThe representative of AB2C Leasing de Mexico is Roberto Nuñez y Bandera, Notario Público No. 1 de México, Distrito Federal.'}

In [12]:
%%time
question = "What is the date of acta 250?"
qa({"query":question,})

CPU times: user 1.92 s, sys: 224 ms, total: 2.14 s
Wall time: 2.14 s


{'query': 'What is the date of acta 250?',
 'result': '\nThe date of acta 250 is 20 marzo de 2020.'}

In [13]:
%%time
question = "What is the number of notario  Licericiado Francisco Javier Acevedo Macari?"
qa({"query":question,})

CPU times: user 2.36 s, sys: 245 ms, total: 2.61 s
Wall time: 2.61 s


{'query': 'What is the number of notario  Licericiado Francisco Javier Acevedo Macari?',
 'result': '\nThe number of notario Licericiado Francisco Javier Acevedo Macari is 149.'}

## 7b Finetuned

In [5]:
from peft import PeftConfig
from transformers import AutoModelForCausalLM

In [6]:
config = PeftConfig.from_pretrained('./training_results/checkpoint-240')
tokenizer = AutoTokenizer.from_pretrained(config.base_model_name_or_path)

hf_pipeline = pipeline(
        task="text-generation",
        model = config.base_model_name_or_path,
        tokenizer = tokenizer,
        trust_remote_code = True,
        max_new_tokens=100,
        model_kwargs={
                "device_map": 'auto', 
                "load_in_8bit": True, 
                "max_length": 512,
                'pad_token_id': 11,
                "torch_dtype":torch.bfloat16,
                'temperature' : 0.0,
                }
        )

Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.08s/it]


In [7]:
llm = hf_pipeline
llm.model.hf_device_map

{'': 0}

In [8]:
hf_llm = HuggingFacePipeline(pipeline=llm)
retriever = vectordb_from_text.as_retriever(search_kwargs={"k":4})
qa = RetrievalQA.from_chain_type(llm=hf_llm, chain_type="stuff",retriever=retriever)

# Defining a default prompt for flan models
question_falcon_template = """
Answer the question as truthfully as possible, and if the answer is not contained within the file,
say "I don't know."
context: {context}
question: {question}
answer: 
"""

QUESTION_FALCON_PROMPT = PromptTemplate(
    template=question_falcon_template, input_variables=["context","question"]
)
qa.combine_documents_chain.llm_chain.prompt = QUESTION_FALCON_PROMPT
qa.combine_documents_chain.verbose = False

In [9]:
%%time
question = "What is the text about?"
qa({"query":question,})



CPU times: user 6.75 s, sys: 892 ms, total: 7.64 s
Wall time: 7.81 s


{'query': 'What is the text about?',
 'result': "\nThe text is about a declaration of a person named Oscar Alberto Islas Mendoza, who is a depositor in a bank. The text states that he has been in contact with the bank and has been informed of the bank's policies and procedures."}

In [10]:
%%time
question = "What is the date of escritura 34,501?"
qa({"query":question,})

CPU times: user 2.35 s, sys: 273 ms, total: 2.62 s
Wall time: 2.62 s


{'query': 'What is the date of escritura 34,501?',
 'result': '\nThe date of escritura 34,501 is 20 October 2011.'}

In [11]:
%%time
question = "What's the number of Contrato Mastro?"
qa({"query":question,})

CPU times: user 1.83 s, sys: 242 ms, total: 2.07 s
Wall time: 2.07 s


{'query': "What's the number of Contrato Mastro?",
 'result': '\nThe number of the contract is "AP000000718".'}

In [12]:
%%time
question = "Who is the representative of AB2C Leasing de Mexico?"
qa({"query":question,})

CPU times: user 4.06 s, sys: 453 ms, total: 4.51 s
Wall time: 4.51 s


{'query': 'Who is the representative of AB2C Leasing de Mexico?',
 'result': '\nThe representative of AB2C Leasing de Mexico is Roberto Nuñez y Bandera, Notario Público No. 1 de México, Distrito Federal.'}

In [13]:
%%time
question = "What is the date of acta 250?"
qa({"query":question,})

CPU times: user 2.04 s, sys: 204 ms, total: 2.25 s
Wall time: 2.25 s


{'query': 'What is the date of acta 250?',
 'result': '\nThe date of acta 250 is 20 marzo de 2020.'}

In [14]:
%%time
question = "What is the number of notario Licericiado Francisco Javier Acevedo Macari?"
qa({"query":question,})

CPU times: user 2.47 s, sys: 225 ms, total: 2.7 s
Wall time: 2.7 s


{'query': 'What is the number of notario  Licericiado Francisco Javier Acevedo Macari?',
 'result': '\nThe number of notario Licericiado Francisco Javier Acevedo Macari is 149.'}

# GPT 3.5

In [2]:
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
%%time
from langchain.document_loaders import TextLoader
from langchain.indexes import VectorstoreIndexCreator
loader = TextLoader('page1.txt')
index = VectorstoreIndexCreator().from_loaders([loader])

# Question-answering
index.query("¿De quien habla el texto?")

CPU times: user 336 ms, sys: 28 ms, total: 364 ms
Wall time: 8.06 s


' El texto habla de AB8C Leasing de México, S.A. de C.V., una Sociedad Anónima Promotora de Inversión de Capital Variable que fue constituida bajo el nombre de Boston Leasing México, S.A. de C.V. y cuyo primer testimonio fue inscrito en el Registro Público de Comercio del Distrito Federal bajo el Folio Mercantil No. 257016 el 16 de Noviembre de 1999. La denominación social fue cambiada por la de AB8C Leasing de México, S.A. de C.V. mediante Escritura No. 53,174 de fecha 6 de junio de 2005, otorgada ante la fe del Lic. Roberto Nuñez y Bandera, Notario Público No. 1 de México, Distrito Federal, y cuyo testimonio quedó debidamente inscrito en el Registro Público de Comercio del Distrito Federal bajo el Folio Mercantil No.'

In [4]:
%%time
index.query('¿Cuál es la fecha de la escritura 34,501?')

CPU times: user 4.96 ms, sys: 1.75 ms, total: 6.72 ms
Wall time: 1.58 s


' 20 de octubre del 2011.'

In [5]:
%%time
index.query('¿Cuál es el numero del Contrato Maestro?')

CPU times: user 49.1 ms, sys: 0 ns, total: 49.1 ms
Wall time: 2.94 s


' El número del Contrato Maestro es AP000000718.'

In [6]:
%%time
index.query('¿Quien es el representante de AB2C Leasing de Mexico?')

CPU times: user 7.9 ms, sys: 0 ns, total: 7.9 ms
Wall time: 3.27 s


' Maria Isabel Bolio Montero y Pablo Enrique Romero Gonzalez.'

In [7]:
%%time
index.query('¿Cual es la fecha del acta 250?')

CPU times: user 51.6 ms, sys: 0 ns, total: 51.6 ms
Wall time: 2.98 s


' 20 de marzo de 2020.'

In [8]:
%%time
index.query('¿Cual es el numero del notario Licericiado Francisco Javier Acevedo Macari?')

CPU times: user 6.84 ms, sys: 0 ns, total: 6.84 ms
Wall time: 3.27 s


' El numero del notario Licericiado Francisco Javier Acevedo Macari es 67 sesenta y siete.'