In [1]:
from llama_index import Document, VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.vector_stores import ChromaVectorStore
from llama_index.readers.chroma import ChromaReader
from llama_index.storage.storage_context import StorageContext
# from transformers import AutoTokenizer, AutoModel
from llama_index.embeddings import HuggingFaceEmbedding
from llama_index.schema import MetadataMode
from IPython.display import Markdown, display
from llama_index import Document, VectorStoreIndex 
from llama_index.node_parser import SentenceSplitter
import chromadb
import pandas as pd 
import openai
import re
import os
import getpass
import glob 

In [2]:
# Resolve every project directory relative to the kernel's current working directory.
default_path = os.getcwd()
model_path = os.path.join(default_path, '../../models')
model_dir = os.path.join(model_path, "mistral_origin")
data_path = os.path.join(default_path, '../../../data')
rulebook_path = os.path.join(data_path, 'pdf', 'rules')

In [3]:
# Collect every PDF directly inside the rulebook directory.
# NOTE(review): glob.glob returns matches in arbitrary order, so the positional
# lookups on file_list (index 3 here, index 6 when loading below) may select
# different files on another machine — consider sorting or selecting by name.
file_list = glob.glob(f'{rulebook_path}/*.pdf')
file_list[3]

'/rag/jupyter/llama-index_examples/embedding/../../../data/pdf/rules/메일_화상채팅.pdf'

In [None]:
# Prompt for the key interactively rather than hard-coding it, then expose it
# through both the environment variable and the openai module attribute.
os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API Key:")
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Connect to a Chroma server over HTTP.
# NOTE(review): the host address is hard-coded — consider moving it to configuration.
chroma_client = chromadb.HttpClient(host="192.168.0.146")

In [None]:
# Show which collections already exist on the server.
chroma_client.list_collections()

In [None]:
# Reuse the "data" collection when it exists, otherwise create it.
data_collection = chroma_client.get_or_create_collection("data")

In [None]:
# Wrap the Chroma collection as a llama_index vector store and build a storage
# context on top of it.
data_store = ChromaVectorStore(chroma_collection=data_collection)
data_storage = StorageContext.from_defaults(vector_store=data_store)

In [None]:
# Not referenced by any later cell visible in this part of the notebook.
page_no = 0

In [None]:
# Load a single rulebook PDF (position 6 of the glob result).
# Later cells read 'page_label' and 'file_name' from each loaded document's metadata.
documents = SimpleDirectoryReader(input_files=[file_list[6]]).load_data()

In [None]:
# Maps the circled numerals ① … ⑳ to the clause labels '제1항' … '제20항'.
# The twenty circled numerals are consecutive code points starting at U+2460 (①),
# so the table is generated instead of being typed out entry by entry.
num_mapper = {chr(0x2460 + offset): f'제{offset + 1}항' for offset in range(20)}

In [None]:
def text_cleanse(document):
    '''
    Normalise the text of one loaded page in place and return the same object.

    Steps, in order:
      1. apply the character filter (see the NOTE in the body);
      2. replace circled numerals with the labels from ``num_mapper``;
      3. collapse runs of spaces into one space and trim both ends;
      4. drop the first line when it looks like a page header, i.e. it starts
         with '페이지' or with the page's own ``page_label``;
      5. drop blank (whitespace-only) lines.

    Args:
        document: object with a writable ``text`` attribute and a ``metadata``
            mapping. ``metadata['page_label']`` is optional.

    Returns:
        The same ``document`` with ``text`` rewritten. A page whose text is
        empty or whitespace-only comes back with ``text == ''``.
    '''
    # NOTE(review): the character class ends at the first unescaped ']', so this
    # pattern only deletes a disallowed character that is directly followed by a
    # literal ']'. That looks accidental, but widening it would also strip the
    # spaces and circled numerals that the next steps depend on, so the
    # behaviour is left as is — confirm the intended allow-list.
    # Written as a raw string to avoid the invalid '\-' string escape.
    text = re.sub(r"[^A-Za-z0-9'\"\-가-힣(){}\n\[]\]", '', document.text)

    for circled, label in num_mapper.items():
        text = text.replace(circled, label)
    text = re.sub(' +', ' ', text).strip()

    lines = text.splitlines(True)
    if lines:  # a blank page yields no lines at all; nothing to inspect then
        first_line = lines[0]
        page_label = document.metadata.get('page_label')
        # An empty/missing label must not count as a match: every string
        # starts with ''.
        starts_with_label = bool(page_label) and first_line.startswith(str(page_label))
        if first_line.startswith('페이지') or starts_with_label:
            lines = lines[1:]

    # strip() also catches '\r\n' and tab-only lines, not just '\n' / ' \n'.
    document.text = ''.join(line for line in lines if line.strip())
    return document

In [None]:
# Clean every loaded page and write the result back into the slot it came from.
for idx, page in enumerate(documents):
    documents[idx] = text_cleanse(page)

In [None]:
# Inspect the cleaned text of the second loaded document.
print(documents[1].text)

In [None]:
# Look at the first document's page label next to its text.
text = documents[0].text 
documents[0].metadata['page_label']

In [None]:
# `re` is already imported in the first cell; this repeated import is redundant.
import re 

# Scratch check: because `.+` is greedy, this pattern returns one match that
# spans the whole sample string instead of isolating the chapter-2 headings.
text = '제 1 장 제1장 제 1장 제 2 장 제2장 제 2장' 
# re.findall(r'제.*[0-9].*장', text)
re.findall(r'제.+2.+장', text)

In [None]:
# Scratch check: allowing optional spaces around the digit matches all three
# spellings of the chapter-1 heading and none of the other chapters.
text = '제 1 장 제1장 제 1장 제 2 장 제2장 제 2장 제 4장' 
re.findall(r'제 *1 *장', text)

In [None]:
def get_start_point(documents):
    '''
    Locate the first page of the main body and return its index in ``documents``.

    The body is taken to start at the first document whose text contains a
    "제1장" heading (spaces around the digit allowed) and which is not a
    table-of-contents page, i.e. contains neither "목차" nor "차례"
    (again with optional spaces).

    Args:
        documents: sequence of objects exposing a ``text`` string.

    Returns:
        Zero-based position of the first matching document, or 0 when no
        document matches (including an empty sequence).
    '''
    for position, doc in enumerate(documents):
        has_first_chapter = re.search(r'제 *1 *장', doc.text) is not None
        is_contents_page = re.search(r'목 *차|차 *례', doc.text) is not None
        if has_first_chapter and not is_contents_page:
            # Return the list position itself rather than deriving it from
            # metadata['page_label']: the result is used as a list index, and a
            # page label may be non-numeric (e.g. 'ii') or offset from the
            # position in the list.
            return position
    return 0

In [None]:
# Index of the first main-body page; the final cell skips every page before it.
s_point = get_start_point(documents)
s_point

In [None]:
# Inspect the cleaned text of the fifth loaded document.
print(documents[4].text)

In [None]:
# Show which metadata keys the reader attached to a page.
documents[0].metadata

In [None]:
def split_doc(idx, prev_spot, current_spot, document):
    '''
    Split one document in two at the first occurrence of a chapter heading.

    Args:
        idx: number used in the id of the leading piece; the trailing piece
            uses ``idx + 1``.
        prev_spot: chapter label stored on the text before the heading.
        current_spot: heading text to cut at; also the label stored on the
            text after it. The heading itself is kept in neither piece.
        document: source document; its ``text`` and
            ``metadata['file_name']`` are read.

    Returns:
        ``(idx + 2, prev_doc, current_doc)`` where both documents carry
        ``spot`` and ``file_name`` metadata (both excluded from LLM metadata).

    Raises:
        ValueError: if ``current_spot`` does not occur in ``document.text``.
    '''
    source_name = document.metadata['file_name']
    file_name = source_name.split('.')[0]

    # partition() cuts at the first occurrence only, so a later mention of the
    # same heading stays inside the trailing piece instead of truncating it.
    leading_text, heading, trailing_text = document.text.partition(current_spot)
    if not heading:
        raise ValueError(f"heading {current_spot!r} not found in document text")

    def _piece(text, number, spot):
        # Build one chapter-level Document with the shared metadata layout.
        return Document(text=text,
                        doc_id=f"{file_name}_doc_{number}",
                        metadata={"spot": spot, "file_name": source_name},
                        excluded_llm_metadata_keys=['spot', 'file_name'])

    prev_doc = _piece(leading_text, idx, prev_spot)
    current_doc = _piece(trailing_text, idx + 1, current_spot)
    return idx + 2, prev_doc, current_doc

In [None]:
def get_doc_content(idx, document):
    '''
    Split a page-level document into chapter-level documents.

    Every heading of the form "제<digits>장" (optional spaces around the
    number) found in the text is used as a cut point, in order of appearance.
    The actual cutting is delegated to ``split_doc``.

    Args:
        idx: id number handed to ``split_doc`` for the first cut; each further
            cut on the same page uses the next integer, so ids are consecutive
            within one call. Ids are not coordinated across calls.
        document: page-level document. ``metadata['spot']``, when present, is
            used as the chapter label of the text before the first heading;
            when absent that label is ``None``.

    Returns:
        ``document`` itself, unchanged, when the text contains no chapter
        heading (kept for backward compatibility). Otherwise a list of the
        pieces in reading order: the text before each heading, followed by the
        text after the last one. A piece may have empty text when a heading
        opens or closes the page.
    '''
    # '+' so that two-digit chapters such as '제10장' are recognised as well.
    headings = re.findall(r'제 *[0-9]+ *장', document.text)
    if not headings:
        return document

    prev_spot = document.metadata.get('spot')
    pieces = []
    remainder = document
    for spot in headings:
        # A repeated heading may already be gone from the remaining text after
        # an earlier cut; skip it instead of asking split_doc to cut on
        # something that is not there.
        if spot not in remainder.text:
            continue
        _, leading, remainder = split_doc(idx + len(pieces), prev_spot, spot, remainder)
        pieces.append(leading)
        prev_spot = spot

    # The text after the last heading is a chapter piece too.
    pieces.append(remainder)
    return pieces

In [None]:
# Compare the number of loaded pages with the index of the first body page.
len(documents), s_point

In [None]:
# Scratch check: take the text after '제2장' on the third page and cut it again at '제 3장'.
documents[2].text.split('제2장')[1].split('제 3장')   #, documents[2].text.split('제2장')[1]

In [None]:
# Split every main-body page into chapter-level pieces.
# The first body page starts in chapter 1; after each page the label of its last
# piece is carried over, so every later page also has a 'spot' before it is
# split (previously only documents[s_point] had one).
current_spot = '제1장'
chapter_docs = []
for idx, doc in enumerate(documents):
    if idx < s_point:
        continue
    doc.metadata['spot'] = current_spot
    result = get_doc_content(idx, doc)
    # get_doc_content returns the page itself when it contains no heading.
    pieces = result if isinstance(result, list) else [result]
    if pieces:
        current_spot = pieces[-1].metadata.get('spot', current_spot)
    # Keep only pieces that actually contain text.
    chapter_docs.extend(piece for piece in pieces if piece.text.strip())