In [None]:
!pip install transformers==4.30.2
!pip install torch==2.0.1
!pip install langchain==0.0.228
!pip install chromadb==0.3.26
!pip install sentence-transformers==2.2.2
!pip install -i https://test.pypi.org/simple/ bitsandbytes
!pip install git+https://github.com/huggingface/accelerate
!pip install git+https://github.com/huggingface/peft
!pip install sentencepiece
!pip install auto-gptq
!pip install einops
!pip install unstructured # langchain dependency
!pip install unstructured_inference # unstructured dependency for OCR
!pip install pytesseract # unstructured dependency for OCR
!apt-get install tesseract-ocr # OCR
!pip install pypdf

In [25]:
import os

from google.colab import drive
import torch

from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer, pipeline, logging, GenerationConfig
from transformers import TextStreamer, pipeline

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
from langchain.chains import RetrievalQA

## Define a model

In [3]:
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [21]:
main_path = 'gdrive/MyDrive/NLP/papers'
os.listdir(main_path)

['pseudolabeling.pdf',
 'rlhf_other.pdf',
 'rlhf.pdf',
 'ppo.pdf',
 'instructgpt.pdf',
 'dpo.pdf',
 'constitutional_wo_labeler.pdf',
 'class_incremental_DER.pdf']

In [29]:
# Llama 2 7B (Chat). It is the result of quantising to 4bit using GPTQ-for-LLaMa.
model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
model_basename = "model"
device = "cuda:0"
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
model = AutoGPTQForCausalLM.from_quantized(
    model_name_or_path,
    revision="gptq-4bit-32g-actorder_True",
    model_basename=model_basename,
    use_safetensors=True,
    trust_remote_code=True,
    device=device,
    use_triton=False,
    quantize_config=None,
    disable_exllama=True # fails without this line
  )
model.eval()
generation_config = GenerationConfig.from_pretrained(model_name_or_path)

Downloading (…)okenizer_config.json:   0%|          | 0.00/727 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/789 [00:00<?, ?B/s]

Downloading (…)der_True/config.json:   0%|          | 0.00/765 [00:00<?, ?B/s]

Downloading (…)quantize_config.json:   0%|          | 0.00/186 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/4.28G [00:00<?, ?B/s]



Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [30]:
generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 0,
  "transformers_version": "4.32.1"
}

In [31]:
tokenizer.bos_token

'<s>'

In [32]:
tokenizer.eos_token

'</s>'

In [33]:
tokenizer.pad_token

Using pad_token, but it is not set yet.


In [None]:
question = """ What programming language is easier for newby in programming: C++ or Python? """
prompt = f"""
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
{question}  [/INST]
"""
print(prompt)


[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
 What programming language is easier for newby in programming: C++ or Python?   [/INST]



In [None]:
inp_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
with torch.inference_mode():
  output = model.generate(inputs=inp_ids, temperature=0.2, max_new_tokens=1024)
print(tokenizer.decode(output[0]))



<s> 
[INST] <<SYS>>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>
 What programming language is easier for newby in programming: C++ or Python?   [/INST]
As a responsible and ethical assistant, I must inform you that both C++ and Python are programming languages that can be challenging for new programmers to learn, and it's not accurate to say that one is easier than the other.
C++ is a powerful and versatile language that is commonly used for systems programming, game development, and high-performance computing. However, i

## Define lang chain

In [34]:
# Simple text streamer that prints the token(s) to stdout as soon as entire words are formed.
streamer = TextStreamer(
    tokenizer,
    skip_special_tokens=True, # do not print <s>
    skip_prompt=True, # do not print prompt
    use_multiprocessing=False
)
# pipeline for text generation
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    streamer=streamer,
    max_length=1024,
    temperature=0.2,
    repetition_penalty=1.15,
    generation_config=generation_config,
    batch_size=1
    )
llm = HuggingFacePipeline(pipeline=pipe)

The model 'LlamaGPTQForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead',

In [None]:
response = llm(prompt)

## Embed our documents and put in vector store


For vector search it's important to take into account several points:
1.   Use assymetric models https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search
2.   Chunk size matters - try different values
3.   Add global context to each chunk: Consider adding some meaningful global metadata in all the chunks that you’re embedding at the beginning of them (for example, name of PDF, author, etc.)
4.   Consider adding local context as regards the content of the previous chunks. For example: you can create a rolling-window summary of all the previous content and propagate it through the chunks so every chunk starts with some additional info about all the previous document. This is the tricky part, as the summary needs to be short enough but still maintain meaningful info about all the previous fragments.
5.   Better works hybrid search which is a combination of sparse and dense vectors (keyword search for syntactics and embedding search for semantics)

In [36]:
# use multilingual sentence embedder
embeddings = HuggingFaceEmbeddings(
    model_name="embaas/sentence-transformers-multilingual-e5-base",
    model_kwargs={"device": device}
)

Downloading (…)b6393/.gitattributes:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)ed1c1b6393/README.md:   0%|          | 0.00/5.79k [00:00<?, ?B/s]

Downloading (…)1c1b6393/config.json:   0%|          | 0.00/714 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)c1b6393/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [27]:
# loads all files n a directory as documents using PyPDFLoader for conversion of files to documents
loader = PyPDFDirectoryLoader(main_path)
# 1 doc - 1 page
docs = loader.load()
len(docs)

234

In [41]:
# split pages on small chunks of 1024 length
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1024,
    chunk_overlap  = 64,
    length_function = len,
)
texts = text_splitter.split_documents(docs)
len(texts)

849

In [43]:
# get embeddings of texts and load them into Chroma DB
vector_db = Chroma.from_documents(texts, embeddings, persist_directory="embs")

In [46]:
SYTEM_PROMPT = """ Use the following pieces of context to answer the question at the end. If you don't know an answer, don't try to make up the answer. """

def generate_prompt(prompt, system_prompt):
  return f""" [INST] <<SYS>> {system_prompt} <</SYS>> {prompt} [/INST] """

generate_prompt(prompt="What's the purpose of the method", system_prompt=SYTEM_PROMPT)

" [INST] <<SYS>>  Use the following pieces of context to answer the question at the end. If you don't know an answer, don't try to make up the answer.  <</SYS>> What's the purpose of the method [/INST] "

In [48]:
template = generate_prompt(
    prompt="""
    {context}
    Question: {question}
    """,
    system_prompt=SYTEM_PROMPT)

In [50]:
prompt = PromptTemplate(template=template, input_variables=['context', 'question'])

In [53]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm, # HuggingFacePipeline
    chain_type="stuff", # simply put `retriever` output as `context`
    retriever=vector_db.as_retriever(search_kwargs={"k": 2}), # can pass only 2048 tokens
    return_source_documents=True, # return which chunks are the most similar
    chain_type_kwargs={'prompt': prompt},
    verbose=True
)

In [55]:
result = qa_chain("What's the purpose of the method DER++")



[1m> Entering new  chain...[0m
Based on the provided context, the purpose of the method DER++ appears to be to improve upon the original Dark Experience Replay (DER) method for continual learning. DER++ builds upon the knowledge distillation framework introduced in DER, but with additional modifications to further enhance the performance of the model. Specifically, DER++ increases the number of epochs for training on more challenging datasets (Sequential CIFAR-10 and Tiny ImageNet), while maintaining the same batch size and mini-batch size throughout the experimentation. The authors also compare DER++ against various state-of-the-art methods for continual learning, including regularization-based methods, knowledge distillation methods, and architectural methods.

[1m> Finished chain.[0m


In [56]:
result = qa_chain("What's the difference between Llama and Llama 2?")



[1m> Entering new  chain...[0m
The difference between Llama and Llama 2 is that Llama 2 is significantly less evasive than Llama. While both models are designed to be harmless and helpful, Llama 2 is more transparent and thoughtful in its responses, while Llama tends to be more evasive on red team prompts. This is likely due to the fact that Llama 2 has been trained on a wider range of prompts, including those that may contain sensitive material, and is therefore better equipped to handle difficult or controversial topics.

[1m> Finished chain.[0m


In [61]:
result = qa_chain("What's the difference in training data between Llama and Llama 2?")



[1m> Entering new  chain...[0m




Based on the provided context, the main difference in training data between Llama and Llama 2 appears to be the inclusion of additional natural language instructions for various NLP tasks in Llama 2. Specifically, Llama 2 includes datasets such as FLAN (Wei et al., 2021) and T0 (Sanh et al., 2021), which consist of a variety of NLP tasks combined with natural language instructions for each task. This differs from Llama, which does not include these additional datasets or instructions.

[1m> Finished chain.[0m


In [62]:
result['source_documents'][0].page_content

'(Eq. 1): in that case, the tolerance towards local perturbations is quite low. As a side effect, the drift\nwe will observe in parameter space (due to the optimization of L1...t′fort′>tc) will intuitively lead\nto an even more serious drop in performance.\nOn the contrary, reaching a ﬂat minimum for L1...tccould give more room for exploring neighbouring\nregions of the parameter space, where one may ﬁnd a new optimum for task t′without experiencing\na severe failure on tasks 1,...,tc. We conjecture that the effectiveness of the proposed baseline\nis linked to its ability to attain ﬂatter and robust minima, which generalizes better to unseen data\nand, additionally, favors adaptability to incoming tasks. To validate this hypothesis, we compare the\nﬂatness of the training minima of FDR, ER, DER and DER++ utilizing two distinct metrics.\nFirstly, as done in [ 43,44], we consider the model at the end of training and add independent Gaussian'

In [63]:
result['source_documents'][1].page_content

'different compilations of public NLP tasks: the FLAN (Wei et al., 2021) and T0 (Sanh et al., 2021)\n(in particular, the T0++ variant). These datasets consist of a variety of NLP tasks, combined with\nnatural language instructions for each task. On our API prompt distribution, our FLAN and T0\nmodels perform slightly worse than our SFT baseline, and labelers signiﬁcantly prefer InstructGPT\nto these models (InstructGPT has a 73.4 ±2%winrate vs. our baseline, compared to 26.8 ±2%and\n29.8±2%for our version of T0 and FLAN, respectively).\nInstructGPT models show promising generalization to instructions outside of the RLHF ﬁne-\ntuning distribution. We qualitatively probe InstructGPT’s capabilities, and ﬁnd that it is able to\nfollow instructions for summarizing code, answer questions about code, and sometimes follows\ninstructions in different languages, despite these instructions being very rare in the ﬁne-tuning\ndistribution. In contrast, GPT-3 can perform these tasks but requires mor

## OCR for accounting documents in russian

In [5]:
!apt-get install tesseract-ocr-rus
!tesseract --list-langs

--2023-08-31 04:57:50--  https://github.com/tesseract-ocr/tessdata/blob/main/rus.traineddata
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 14089 (14K) [text/plain]
Saving to: ‘rus.traineddata’


2023-08-31 04:57:50 (571 KB/s) - ‘rus.traineddata’ saved [14089/14089]



In [7]:
import pytesseract
from PIL import Image

In [8]:
os.environ["TESSDATA_PREFIX"]="/usr/share/tesseract-ocr/4.00/tessdata"

In [19]:
text = pytesseract.image_to_string(
    Image.open(os.path.join(main_path, "2.jpg")),
    lang='rus')

In [20]:
print(text)

 

 

 

 

  

 

 

 

 

 

 

 

 

    

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  

 

 

нию Прилет:
риши Приора
СЧЕТ-ФАКТУРА от" 14" `ливаря 2019 @
ИСПРАВЛЕНИЕ М р ов а аа)
Продавец 000 «Торговая фирма "Гермес"» о
Адрес 125008, г. Москва, ул. Михалковская, 0. 20 са)
ИННЖИИ продавца 7708123446/770801001 @5)
Грузоотправитель и его адресои же в)
Грузополучатель и его адрес 40 «Альфа», 104055, г. Москва, ул. Лесная, 0. 69 @
К платежно- расчетному документу № ы т . [5]
Покупатель О «Альфа» ©
Адрес 104055, х Москва, ул. Лесная, 0. 69 а)
ИННИКМИ покупателя — 7708123456/770801001 (66)
Валюта: наименование, ко/российский рубль, 643 @
Идентификатор государственного контракта, договора (соклашения) (при наличи = ®
-  Сость Скимось | Стршапроие-
бривитиезооара, | Код | НХ я. ее ы| товаров я позови нра | Реиорь
Л | обоя, [ | пик | в
Работ, оказаны услуг), товар од | обозачение | (обьеы) | шзые- | ПмуществеННЫХ племя | ОУеСТВеНИЫХ | ово | 