In [1]:
from langchain.text_splitter import Language
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.parsers import LanguageParser
import os

from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFLoader
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from langchain.memory import ConversationBufferMemory
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQAWithSourcesChain
from huggingface_hub import notebook_login
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline
from langchain.text_splitter import CharacterTextSplitter
import textwrap
import sys
import torch

In [4]:

# Recursively collect every C header/source file under the M460 register and
# driver tree and load them as LangChain documents.
# NOTE(review): parser_threshold=10000 presumably keeps any file shorter than
# 10000 lines as one whole Document instead of segmenting it - confirm against
# the LanguageParser documentation.
c_parser = LanguageParser(language=Language.C, parser_threshold=10000)
loader = GenericLoader.from_filesystem(
    "./m460bsp_Library_StdDriver_registers",
    glob="**/*",
    suffixes=[".h", ".c"],
    parser=c_parser,
    show_progress=True,
)
docs = loader.load()
len(docs)

  0%|          | 0/93 [00:00<?, ?it/s]

93

In [5]:
# Show the metadata (source path and detected language) of each loaded file.
for loaded_doc in docs:
    print(loaded_doc.metadata)
# To dump the full file contents instead, uncomment the line below:
# print("\n\n--8<--\n\n".join([doc.page_content for doc in docs]))

{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\acmp_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\bmc_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\bpwm_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\canfd_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\ccap_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\clk_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\crc_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\Include\\crypto_reg.h', 'language': <Language.C: 'c'>}
{'source': 'm460bsp_Library_StdDriver_registers\\Nuvoton\\m460\\

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut the loaded files into overlapping chunks using the splitter's C-specific
# separators. chunk_size / chunk_overlap are measured by the splitter's length
# function (presumably characters by default - confirm).
C_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.C,
    chunk_size=1000,
    chunk_overlap=200,
)
texts = C_splitter.split_documents(docs)
len(texts)

7547

In [17]:
# Spot-check a single chunk (index 56 is an arbitrary pick) to see what the
# splitter produced.
sample_chunk = texts[56]
print(sample_chunk.page_content)

void BPWM_EnableZeroInt(BPWM_T *bpwm, uint32_t u32ChannelNum);
void BPWM_DisableZeroInt(BPWM_T *bpwm, uint32_t u32ChannelNum);
void BPWM_ClearZeroIntFlag(BPWM_T *bpwm, uint32_t u32ChannelNum);
uint32_t BPWM_GetZeroIntFlag(BPWM_T *bpwm, uint32_t u32ChannelNum);
void BPWM_EnableLoadMode(BPWM_T *bpwm, uint32_t u32ChannelNum, uint32_t u32LoadMode);
void BPWM_DisableLoadMode(BPWM_T *bpwm, uint32_t u32ChannelNum, uint32_t u32LoadMode);
void BPWM_SetClockSource(BPWM_T *bpwm, uint32_t u32ChannelNum, uint32_t u32ClkSrcSel);
uint32_t BPWM_GetWrapAroundFlag(BPWM_T *bpwm, uint32_t u32ChannelNum);
void BPWM_ClearWrapAroundFlag(BPWM_T *bpwm, uint32_t u32ChannelNum);


/** @} end of group BPWM_EXPORTED_FUNCTIONS */

/** @} end of group BPWM_Driver */

/** @} end of group Standard_Driver */

#ifdef __cplusplus
}
#endif

#endif /* __BPWM_H__ */

/*** (C) COPYRIGHT 2019 Nuvoton Technology Corp. ***/


In [None]:
# Show the metadata attached to each chunk (one line of output per chunk).
for chunk in texts:
    print(chunk.metadata)
# To dump the full chunk contents instead, uncomment the line below:
# print("\n\n--8<--\n\n".join([chunk.page_content for chunk in texts]))

## Save the text chunks to a FAISS index on disk

In [7]:
# Destination of the persisted index: <save_path>/<save_name>.
save_path = r'D:\nu_QA_data'
save_name = r'm460bsp_Lib_StdDriver_Regs'
index_path = os.path.join(save_path, save_name)

# Sentence-transformers embedding model, explicitly placed on the CPU.
print("===== Load the embedding model =====")
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Embed every chunk and build the FAISS vector store from the result.
print("===== Build FAISS =====")
vectorstore = FAISS.from_documents(texts, embeddings)

# Report the target location, then write the index to disk.
print(index_path)
vectorstore.save_local(index_path)

===== Load the embedding model =====
===== Build FAISS =====
D:\nu_QA_data\m460bsp_Lib_StdDriver_Regs


## Create a vector store from selected files only

In [22]:
import re
import shutil



def find_files_with_pattern(directory, patterns, dst):
    """Copy every file under ``directory`` whose name matches a pattern into ``dst``.

    The tree is walked recursively with ``os.walk``. A file is selected when
    ``re.search`` finds at least one of ``patterns`` in its base name,
    ignoring case. Each selected file is reported and copied exactly once,
    even when several patterns match it.

    Args:
        directory: Root directory to search.
        patterns: Sequence of regular-expression strings, tested against the
            file name only (not the full path).
        dst: Directory that receives the copies. Files are copied flat, so
            two matches sharing a base name overwrite one another.

    Returns:
        List of the source paths of the matched files, in walk order and
        without duplicates.
    """
    matching_path = []

    for root, _dirs, files in os.walk(directory):
        for file in files:
            # any() stops at the first hit, so a name matching several
            # patterns is not appended (and copied) once per pattern.
            if any(re.search(pattern, file, re.IGNORECASE) for pattern in patterns):
                file_path = os.path.join(root, file)
                matching_path.append(file_path)
                shutil.copy(file_path, dst)

    return matching_path

def find_folders_with_pattern(directory, patterns, dst):
    """Copy the immediate sub-folders of ``directory`` whose name matches a pattern.

    Only the direct children of ``directory`` are inspected (no recursion into
    the tree when matching). A folder is selected when ``re.search`` finds at
    least one of ``patterns`` in its name, ignoring case. Each selected folder
    is reported and copied exactly once, even when several patterns match it.
    Plain files sitting directly in ``directory`` are ignored.

    Args:
        directory: Directory whose direct sub-folders are examined.
        patterns: Sequence of regular-expression strings, tested against the
            folder name only.
        dst: Directory under which each matching folder is recreated as
            ``dst/<folder name>``; existing targets are merged into
            (``dirs_exist_ok=True``).

    Returns:
        List of the paths of the matched folders, in ``os.listdir`` order and
        without duplicates.
    """
    matching_path = []

    for dirname in os.listdir(directory):
        dir_loc = os.path.join(directory, dirname)

        # os.listdir() also yields plain files; copytree() would raise on them.
        if not os.path.isdir(dir_loc):
            continue

        # any() stops at the first hit, so a name matching several patterns
        # is not appended (and copied) once per pattern.
        if any(re.search(pattern, dirname, re.IGNORECASE) for pattern in patterns):
            matching_path.append(dir_loc)
            shutil.copytree(dir_loc, os.path.join(dst, dirname), dirs_exist_ok=True)

    return matching_path

# Example usage: gather the UART / PDMA / CRC related driver sources and sample
# projects of the M251 BSP into one scratch folder that is indexed afterwards.
directory_path = "./m251bsp"
# Alternative selection: file_pattern = [r"acmp", r"dac"]
file_pattern = [r"uart", r"pdma", r"crc"]
dst = r"D:\nu_QA_data\m251bsp_partial_source"

# Always start from an empty destination so files from an earlier selection
# do not leak into the new index.
if os.path.isdir(dst):
    print("Remove the old ones!")
    shutil.rmtree(dst)
else:
    print("First create!")
# makedirs (rather than mkdir) also creates the parent folder when it is
# missing, e.g. on a first run on a fresh machine.
os.makedirs(dst)

# Driver sources and headers: matched file by file anywhere below StdDriver.
matching_files = find_files_with_pattern(
    os.path.join(directory_path, 'StdDriver'), file_pattern, dst
)

# Sample projects: whole folders directly below SampleCode/StdDriver.
matching_files += find_folders_with_pattern(
    os.path.join(directory_path, 'SampleCode', 'StdDriver'), file_pattern, dst
)

print(matching_files)


Remove the old ones!
['./m251bsp\\StdDriver\\inc\\crc.h', './m251bsp\\StdDriver\\inc\\pdma.h', './m251bsp\\StdDriver\\inc\\scuart.h', './m251bsp\\StdDriver\\inc\\uart.h', './m251bsp\\StdDriver\\inc\\usci_uart.h', './m251bsp\\StdDriver\\src\\crc.c', './m251bsp\\StdDriver\\src\\pdma.c', './m251bsp\\StdDriver\\src\\scuart.c', './m251bsp\\StdDriver\\src\\uart.c', './m251bsp\\StdDriver\\src\\usci_uart.c', './m251bsp\\SampleCode\\StdDriver\\CRC_CCITT', './m251bsp\\SampleCode\\StdDriver\\CRC_CRC32', './m251bsp\\SampleCode\\StdDriver\\CRC_CRC8', './m251bsp\\SampleCode\\StdDriver\\DAC_PDMA_TimerTrigger', './m251bsp\\SampleCode\\StdDriver\\EADC_PDMA_BPWM_Trigger', './m251bsp\\SampleCode\\StdDriver\\EADC_PDMA_PWM_Trigger', './m251bsp\\SampleCode\\StdDriver\\FMC_CRC32', './m251bsp\\SampleCode\\StdDriver\\I2C_PDMA_TRX', './m251bsp\\SampleCode\\StdDriver\\PDMA_BasicMode', './m251bsp\\SampleCode\\StdDriver\\PDMA_ScatterGather', './m251bsp\\SampleCode\\StdDriver\\PDMA_ScatterGather_PingPongBuffer', '.

In [23]:

# Load every C header/source file found anywhere below the folder `dst`.
# NOTE(review): parser_threshold=10000 presumably keeps any file shorter than
# 10000 lines as one whole Document instead of segmenting it - confirm against
# the LanguageParser documentation.
c_parser = LanguageParser(language=Language.C, parser_threshold=10000)
loader = GenericLoader.from_filesystem(
    dst,
    glob="**/*",
    suffixes=[".h", ".c"],
    parser=c_parser,
    show_progress=True,
)
docs = loader.load()
len(docs)

  0%|          | 0/50 [00:00<?, ?it/s]

50

In [24]:
# Show the metadata (source path and detected language) of each loaded file.
for loaded_doc in docs:
    print(loaded_doc.metadata)

{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\crc.c', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\crc.h', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\pdma.c', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\pdma.h', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\scuart.c', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\scuart.h', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\uart.c', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\uart.h', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\usci_uart.c', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\usci_uart.h', 'language': <Language.C: 'c'>}
{'source': 'D:\\nu_QA_data\\m251bsp_partial_source\\CRC_CCITT\\main.

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Cut the loaded files into overlapping chunks using the splitter's C-specific
# separators. chunk_size / chunk_overlap are measured by the splitter's length
# function (presumably characters by default - confirm).
C_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.C,
    chunk_size=1000,
    chunk_overlap=100,
)
texts = C_splitter.split_documents(docs)
len(texts)

935

In [26]:
# Destination of the persisted index: <save_path>/<save_name>.
save_path = r'D:\nu_QA_data'
save_name = r'm251bsp_partial'
index_path = os.path.join(save_path, save_name)

# Sentence-transformers embedding model, explicitly placed on the CPU.
print("===== Load the embedding model =====")
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Embed every chunk and build the FAISS vector store from the result.
print("===== Build FAISS =====")
vectorstore = FAISS.from_documents(texts, embeddings)

# Report the target location, then write the index to disk.
print(index_path)
vectorstore.save_local(index_path)

===== Load the embedding model =====
===== Build FAISS =====
D:\nu_QA_data\m251bsp_partial
