In [None]:
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, Language
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
import os
import openai
import docx2txt
import re
from PyPDF2 import PdfReader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import UnstructuredPowerPointLoader

In [None]:
# Splitter used as the default `splitter` argument of the load_* helpers
# defined below: chunk_size 1500 with an overlap of 200 between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=200,
)

def get_files_with_extension(folder_path, file_extension):
    """Recursively collect the files below one or more folders by extension.

    Args:
        folder_path: A single folder (str or os.PathLike) or an iterable of
            folders. A lone path is wrapped in a list first; otherwise a bare
            string would be iterated character by character and each
            character walked as if it were a directory.
        file_extension: Suffix, or tuple of suffixes, handed to
            ``str.endswith``. Matching is case-sensitive.

    Returns:
        list[str]: Matching file paths in ``os.walk`` order, with backslashes
        replaced by forward slashes.
    """
    if isinstance(folder_path, (str, os.PathLike)):
        folder_path = [folder_path]

    file_list = [
        # Normalise Windows separators so callers can split paths on "/".
        os.path.join(root, name).replace("\\", "/")
        for folder in folder_path
        for root, _, names in os.walk(folder)
        for name in names
        if name.endswith(file_extension)
    ]

    print(f"Number of founded documents: {len(file_list)}")

    return file_list

def add_context_to_doc_chunks(_docs):
    """Prefix each chunk's text with its source file name (extension dropped).

    The name is taken from ``metadata['source']``: everything after the last
    backslash, with the final dot-separated part removed and the remaining
    parts joined by spaces. Putting the file name into the chunk text may
    help the relevance search.

    Args:
        _docs: List of objects exposing ``page_content`` and ``metadata``.

    Returns:
        The same list object; its elements are modified in place.
    """
    for chunk in _docs:
        basename = chunk.metadata['source'].split("\\")[-1]
        name_parts = basename.split('.')[:-1]
        title = ' '.join(name_parts)
        chunk.page_content = title + ' \n\n' + chunk.page_content

    return _docs

def create_db(_docs,_embeddings = None):
    """Build a FAISS vector store from document chunks.

    Args:
        _docs: Documents passed to ``FAISS.from_documents``.
        _embeddings: Embedding object to use. When it is missing (or falsy)
            a default ``OpenAIEmbeddings()`` instance is created instead.

    Returns:
        tuple: ``(embeddings, db)`` - the embedding object that was actually
        used and the FAISS store built with it.
    """
    if not _embeddings:
        embeddings = OpenAIEmbeddings()
    else:
        embeddings = _embeddings

    db = FAISS.from_documents(_docs, embeddings)

    # Without an explicit return the function yielded None, which made the
    # notebook's `embeddings, db = create_db(...)` unpacking raise TypeError
    # and silently discarded the freshly built index.
    return embeddings, db

def generate_embeddings(text):
    """Request an embedding for ``text`` from the 'text-embedding-ada-002' model.

    Calls ``openai.Embedding.create`` and returns the ``'embedding'`` field of
    the first entry in the response's ``'data'`` list.
    """
    api_result = openai.Embedding.create(model = 'text-embedding-ada-002', input=text)
    first_entry = api_result['data'][0]
    return first_entry['embedding']

def load_pdf(pdf_as_bytes, splitter = text_splitter, filename = 'pdf'):
    """Turn a PDF into a flat list of text chunks, split page by page.

    Args:
        pdf_as_bytes: Input handed straight to ``PdfReader``.
        splitter: Object with a ``split_text`` method. The default is the
            module-level ``text_splitter`` as it existed when this function
            was defined; rebinding that name later does not change it.
        filename: Stored as ``metadata['source']`` on every chunk.

    Returns:
        list: ``Document`` chunks in page order. Each one carries
        ``metadata['page']`` as a 1-based page number string and has been
        passed through ``add_context_to_doc_chunks``.
    """
    reader = PdfReader(pdf_as_bytes)

    chunks = []
    for page_index, page in enumerate(reader.pages):
        page_number = str(page_index + 1)
        pieces = splitter.split_text(page.extract_text())
        page_docs = [
            Document(page_content=piece, metadata={'source' : filename, 'page' : page_number})
            for piece in pieces
        ]
        # Context is added per page, then the page's chunks join the flat result.
        chunks.extend(add_context_to_doc_chunks(page_docs))

    return chunks

def load_docx(file, splitter = text_splitter, filename = 'docx'):
    """Turn a Word document into a flat list of text chunks.

    Args:
        file: Input handed straight to ``docx2txt.process``.
        splitter: Object with a ``split_text`` method. The default is the
            module-level ``text_splitter`` as it existed when this function
            was defined.
        filename: Stored as ``metadata['source']`` on every chunk.

    Returns:
        list: ``Document`` chunks with ``metadata['page']`` set to ``'all'``,
        after being passed through ``add_context_to_doc_chunks``.
    """
    raw_text = docx2txt.process(file)
    # Collapse blank / whitespace-only line runs into a single empty line.
    normalized = re.sub(r"\n\s*\n", "\n\n", raw_text)

    chunks = [
        Document(page_content=piece, metadata={'source' : filename, 'page' : 'all'})
        for piece in splitter.split_text(normalized)
    ]

    # Hand back a fresh list holding the annotated chunks.
    return list(add_context_to_doc_chunks(chunks))


In [None]:
# Embedding model handed to create_db and FAISS.from_documents in later cells.
embeddings = HuggingFaceEmbeddings(
    model_name='intfloat/multilingual-e5-large',
)

In [None]:
# Folders that are searched recursively for source documents.
dirname_list = ["data"]

# First pass: PDF and Word documents only.
ext = ('.pdf', '.docx')
filtered_files = get_files_with_extension(folder_path=dirname_list, file_extension=ext)

In [None]:
# Chunk every discovered file with the loader that matches its extension and
# gather all chunks in docs_all.
docs_all = []
for file in filtered_files:
    # Paths were normalised to "/" separators, so the last part is the file name.
    filename = file.rsplit('/', 1)[-1]
    print(filename)
    if file.endswith('.pdf'):
        pdf_doc_chunks = load_pdf(file, filename=filename)
        docs_all += pdf_doc_chunks
    elif file.endswith('.docx'):
        docx_doc_chunks = load_docx(file, filename=filename)
        docs_all += docx_doc_chunks

In [None]:
# Build the FAISS index over the PDF/DOCX chunks collected in docs_all.
# The unpacking expects create_db to hand back an (embeddings, db) pair.
embeddings, db = create_db(docs_all, embeddings)

In [None]:
#db.save_local("faiss_index_e5_large_pre")

In [None]:
# NOTE(review): a function with this exact name is already defined in an
# earlier cell; this definition replaces it once the cell runs. Consider
# keeping a single definition.
def get_files_with_extension(folder_path, file_extension):
    """Recursively collect the files below one or more folders by extension.

    Args:
        folder_path: A single folder (str or os.PathLike) or an iterable of
            folders. A lone path is wrapped in a list first, because a bare
            string would otherwise be iterated one character at a time.
        file_extension: Suffix, or tuple of suffixes, handed to
            ``str.endswith``. Matching is case-sensitive.

    Returns:
        list[str]: Matching file paths in ``os.walk`` order, with backslashes
        replaced by forward slashes.
    """
    if isinstance(folder_path, (str, os.PathLike)):
        folder_path = [folder_path]

    file_list = []
    for folder in folder_path:
        for root, _, names in os.walk(folder):
            file_list.extend(
                # Normalise Windows separators so callers can split on "/".
                os.path.join(root, name).replace("\\", "/")
                for name in names
                if name.endswith(file_extension)
            )

    print(f"Number of founded documents: {len(file_list)}")

    return file_list
# NOTE(review): a function with this exact name is already defined in an
# earlier cell; this definition replaces it once the cell runs.
def add_context_to_doc_chunks(_docs):
    """Prefix every chunk with a page header and with its source file name.

    The header is a fixed fund title when the source file (the text before
    the first dot of the first chunk's ``metadata['source']``) is one of the
    known fund reports; otherwise it is the first paragraph of the first
    chunk. The resulting text of each chunk is::

        <file name without extension> \\n\\n<header> \\n\\n<original text>

    Args:
        _docs: List of objects exposing ``page_content`` and ``metadata``.
            All chunks are assumed to share the source of the first one.

    Returns:
        The same list object; its elements are modified in place. An empty
        list is returned unchanged.
    """
    if not _docs:
        # Nothing to annotate; indexing _docs[0] below would raise IndexError.
        return _docs

    fund_titles = {
        'MPTHK_202212': 'Magyar Posta Takarék Hosszú Kötvény Befektetési Alap',
        'MPTHV_202212': 'Magyar Posta Takarék Harmónia Vegyes Befektetési Alap',
        'OPTII_202212': 'DIÓFA Optimus III. Befektetési Alap „A” sorozat',
        'OPTI_202212': 'DIÓFA Optimus I. Befektetési Alap „A” sorozat',
        'TAHB_202212': 'Takarék Abszolút Hozamú Befektetési Alap',
        'Tapollo_202212': 'Takarék Apollo Származtatott Részvény Befektetési Alap',
        'TSZ_202212': 'Takarék Származtatott Befektetési Alap',
    }

    source_key = _docs[0].metadata['source'].split('.')[0]

    # Exactly one header is chosen. With independent `if` statements the
    # fallback branch was attached to the last test only, so every known fund
    # except the last one received a second, garbled header.
    header = fund_titles.get(source_key)
    if header is None:
        # Read the fallback header once, before any chunk is modified, so all
        # chunks get the same text.
        header = _docs[0].page_content.split('\n\n')[0]

    for doc in _docs:
        # adding the filename to each chunk may help the relevance search
        file_title = ' '.join(doc.metadata['source'].split("\\")[-1].split('.')[:-1])
        doc.page_content = file_title + ' \n\n' + header + ' \n\n' + doc.page_content

    return _docs
def load_pptx(file_path, splitter = text_splitter, filename = 'pptx'):
    """Turn a PowerPoint file into a flat list of text chunks.

    Args:
        file_path: Path handed to ``UnstructuredPowerPointLoader``.
        splitter: Text splitter passed to the loader's ``load_and_split``.
            The default is the module-level ``text_splitter`` as it existed
            when this function was defined.
        filename: Stored as ``metadata['source']`` on every chunk.

    Returns:
        list: ``Document`` chunks with ``metadata['page']`` set to ``'all'``,
        after being passed through ``add_context_to_doc_chunks``.
    """
    loader = UnstructuredPowerPointLoader(file_path)
    split_parts = loader.load_and_split(text_splitter=splitter)

    # Re-wrap each part so the metadata holds only the source name and page marker.
    chunks = [
        Document(page_content=part.page_content, metadata={'source' : filename, 'page' : 'all'})
        for part in split_parts
    ]

    # Hand back a fresh list holding the annotated chunks.
    return list(add_context_to_doc_chunks(chunks))

In [None]:
# Second pass over the same folders, this time for PowerPoint decks only.
# This rebinds filtered_files from the PDF/DOCX pass.
dirname_list = ["data"]

# A single suffix: parentheses around one string do not make a tuple, so this
# is the plain string '.pptx', which str.endswith accepts as well.
ext = '.pptx'
filtered_files = get_files_with_extension(folder_path=dirname_list, file_extension=ext)

In [None]:
# Splitter for the PowerPoint pass (chunk_size 1000, overlap 200). Rebinding
# the name does not change the default `splitter` of functions that are
# already defined, so it is passed explicitly to load_pptx below.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)

In [None]:
# Chunk every PowerPoint file with the current text_splitter and gather the
# chunks in docs.
docs = []
for file in filtered_files:
    # Paths were normalised to "/" separators, so the last part is the file name.
    filename = file.rsplit('/', 1)[-1]
    pptx_doc_chunks = load_pptx(file, splitter=text_splitter, filename=filename)
    docs += pptx_doc_chunks

In [None]:
# Build a separate FAISS index from the PowerPoint chunks, using the same
# `embeddings` object as the PDF/DOCX index.
ppt_db = FAISS.from_documents(docs, embeddings)
#ppt_db.save_local("faiss_index_pptx500_context_cleared")

In [None]:
# Fold the PowerPoint index into the main PDF/DOCX index.
# NOTE(review): merge_from presumably mutates db in place, so re-running this
# cell would add the PPTX vectors a second time - confirm and guard if needed.
db.merge_from(ppt_db)

In [None]:
#db.save_local("faiss_index_e5_large_pre")