<a href="https://colab.research.google.com/github/Jonlittle27/Doc_parser/blob/main/doc_parser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs

In [1]:
# Install required packages
!pip install pandas
!pip install langchain-text-splitters
!pip install docx2txt pytesseract Pillow transformers torch python-docx
!pip install InstructorEmbedding langchain transformers accelerate bitsandbytes sentencepiece Xformers tiktoken chromadb
!pip install sentence_transformers==2.2.2
!pip install langchain-chroma>=0.1.2
!pip install langchain-huggingface
!sudo apt install tesseract-ocr

Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3960 sha256=2dc89cfc9a9297bfc8a1f60a10e6394c480893c09e37d00f8f3738fa50321d80
  Stored in directory: /root/.cache/pip/wheels/22/58/cf/093d0a6c3ecfdfc5f6ddd5524043b88e59a9a199cb02352966
Successfully built docx2txt
Installing collected packages: docx2txt, python-docx, pytesseract
Su

# Dependencies and Llama 2

In [2]:
#imports and model
import docx2txt
import pytesseract
from PIL import Image
import io
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import re
from docx import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain.chains import RetrievalQA
from InstructorEmbedding import INSTRUCTOR
from langchain_huggingface import HuggingFaceEmbeddings

if torch.cuda.is_available():
  torch.set_default_device("cuda")
  print("Using GPU")
else:
  torch.set_default_device("‘cpu’")
  print("Using CPU")

# Initialize Llama 2 model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf")


Using GPU


tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

# Function Creation

In [3]:
# Function creation

def extract_text_from_docx(file_path):
    text = docx2txt.process(file_path)
    return text

def perform_ocr(image):
    text = pytesseract.image_to_string(image)
    return text

def extract_images_from_docx(file_path):
    doc = Document(file_path)
    images = []
    for rel in doc.part.rels.values():
        if "image" in rel.target_ref:
            image_data = rel.target_part.blob
            image = Image.open(io.BytesIO(image_data))
            images.append(image)
    return images

def process_with_llama2(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens =2000)
    processed_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return processed_text

# def chunk_text(text, chunk_size=1000, overlap=100):
#     chunks = []
#     start = 0
#     while start < len(text):
#         end = start + chunk_size
#         chunk = text[start:end]
#         chunks.append(chunk)
#         start = end - overlap
#     return chunks

def parse_tables(file_path):
    doc = Document(file_path)
    tables_data = []

    for table in doc.tables:
        table_data = []
        for row in table.rows:
            row_data = [cell.text.strip() for cell in row.cells]
            table_data.append(row_data)
        tables_data.append(table_data)

    return tables_data

def optimize_for_rag(file_path):
    # Extract text from Word document
    doc_text = extract_text_from_docx(file_path)

    # Extract images from Word document
    images = extract_images_from_docx(file_path)

    # Perform OCR on images
    ocr_text = ""
    for image in images:
        ocr_text += perform_ocr(image) + "\n"

    # Parse tables
    tables_data = parse_tables(file_path)

    # Convert table data to text
    tables_text = ""
    for i, table in enumerate(tables_data):
        tables_text += f"Table {i+1}:\n"
        for row in table:
            tables_text += " | ".join(row) + "\n"
        tables_text += "\n"

    # Combine document text, OCR text, and tables text
    combined_text = doc_text + "\n" + ocr_text + "\n" + tables_text

    # Process text with Llama 2
    processed_text = process_with_llama2(combined_text)

    # Chunk the processed text
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the combined text into chunks
    chunks = text_splitter.split_text(combined_text)

    # # Write chunks to a Markdown file
    # output_file = os.path.splitext(file_path)[0] + "llama_chunks.md"
    # with open(output_file, "w", encoding="utf-8") as md_file:
    #     md_file.write(f"# Parsed content from {os.path.basename(file_path)}\n\n")
    #     for i, chunk in enumerate(chunks):
    #         md_file.write(f"## Chunk {i+1}\n\n")
    #         md_file.write(f"```\n{chunk}\n```\n\n")

    # print(f"Parsed content has been written to {output_file}")

    return chunks

# Run Function

In [4]:
# Example
file_path = "llama_small.docx"
optimized_chunks = optimize_for_rag(file_path)



This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [5]:
len(optimized_chunks)

14

In [6]:
optimized_chunks[0]

'Time\n\nPower\n\nCarbon Emitted\n\n\n\n(GPU hours)\n\nConsumption (W)\n\n(tCO2eq)\n\n7B\n\n184320\n\n400\n\n31.22\n\n\tLlama 2\t13B\n\n368640\n\n400\n\n62.44\n\n34B\n\n1038336\n\n350\n\n153.90\n\n70B\n\n1720320\n\n400\n\n291.42\n\nTotal\n\n3311616\n\n\n\n539.00\n\nTable 2: CO2 emissions during pretraining. Time: total GPU time required for training each model. Power Consumption: peak power capacity per GPU device for the GPUs used adjusted for power usage efficiency. 100% of the emissions are directly offset by Meta’s sustainability program, and because we are openly releasing these models, the pretraining costs do not need to be incurred by others.\n\n\n\ncan scale almost as well as expensive Infiniband up to 2000 GPUs, which makes pretraining even more democratizable.'

In [7]:
optimized_chunks[1]

'can scale almost as well as expensive Infiniband up to 2000 GPUs, which makes pretraining even more democratizable.\n\nCarbon Footprint of Pretraining. Following preceding research (Bender et al., 2021a; Patterson et al., 2021; Wu et al., 2022; Dodge et al., 2022) and using power consumption estimates of GPU devices and carbon efficiency, we aim to calculate the carbon emissions resulting from the pretraining of Llama 2 models. The actual power usage of a GPU is dependent on its utilization and is likely to vary from the Thermal Design Power (TDP) that we employ as an estimation for GPU power. It is important to note that our calculations do not account for further power demands, such as those from interconnect or non-GPU server power consumption, nor from datacenter cooling systems. Additionally, the carbon output related to the production of AI hardware, like GPUs, could add to the overall carbon footprint as suggested by Gupta et al. (2022b,a).'

# Embedding and Storing

In [None]:

from langchain.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_norm = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': 'cuda'},
    encode_kwargs=encode_kwargs
)

In [None]:
#might need this just in case
# import torch
# torch.cuda.empty_cache()

In [None]:
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
persist_directory = 'db'

embedding = model_norm

vectordb = Chroma.from_documents(documents=chunks,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

# Setting Pipeline

In [None]:
#starting the retriever
retriever = vectordb.as_retriever(search_kwargs={"k": 3})

In [None]:
# create the chain to answer questions -- Exploring adding a prompt

PROMPT = """You are a 340B Drug Program expert and solutions analyst, use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.  If you are close to an answer but not confident, apologize and direct the user to follow the links from the source material closest to an answer.

{context}

Question: {question}
Helpful Answer:
"""

new_prompt = PromptTemplate(
    template=PROMPT, input_variables=["context", "question"]
)
chain_type_kwargs = {"prompt": new_prompt}

qa_chain = RetrievalQA.from_chain_type(llm=local_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  chain_type_kwargs=chain_type_kwargs,
                                  return_source_documents=True)

In [None]:
## Pull Documents Sourcing Responses

import textwrap

def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

#this is where we can reword our responses
def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    # print('\n\nFor More Information Check:')
    # for source in llm_response["source_documents"]:
    #     key = source.metadata['source']
    #     value = scrape_dict[key[14:]]
    #     # print(key[14:])
    #     print(value)
    #     # print(source.metadata['source'])