In [4]:
!pip install langchain -q
!pip install sentence_transformers -q
!pip install huggingface_hub -q
!pip install pypdf -q
!pip install faiss-cpu -q
!pip install ctransformers -q

In [13]:
import os
import re
import time
import torch
import pandas as pd
from langchain import PromptTemplate
from langchain.llms import  CTransformers
from langchain.chains import RetrievalQA
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from sentence_transformers import SentenceTransformer
from langchain.chains.question_answering import load_qa_chain
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.vectorstores import FAISS
from huggingface_hub import hf_hub_download


In [6]:
### Mount to g-drive (Sign in required for the mount)
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [7]:
# model_name_or_path = "TheBloke/Llama-2-13B-chat-GGML"
# model_basename = "llama-2-13b-chat.ggmlv3.q5_1.bin"
# local_dir = "/content/gdrive/MyDrive/Python_DA_DS_ML/Gen AI/LLM & Langchain/LLAMA2_Model"
# model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename, local_dir = local_dir)

In [24]:
df = pd.read_csv('CSV/supermarket_sales.csv')
df.head(3)

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4


In [8]:
def createPDFVectorDB(data_path, vector_embedding_path):
    loader = DirectoryLoader(data_path, glob='*.pdf', loader_cls=PyPDFLoader,
                             use_multithreading=True, show_progress=True)
    docs = loader.load()
    text_splitter  = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap=200)
    texts = text_splitter.split_documents(docs)
    embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device':'cuda'})
    db = FAISS.from_documents(texts, embedding_model)
    db.save_local(vector_embedding_path)


In [18]:
def createCSVVectorDB(data_path, vector_embedding_path):
    for csv_file_name in os.listdir(data_path):
        csv_file_path = os.path.join(data_path, csv_file_name)
        df = pd.read_csv(csv_file_path)[:1]
        field_names = list(df.columns)
        loader = CSVLoader(csv_file_path, csv_args={
                                    'delimiter': ',',
                                    'quotechar': '"',
                                    'fieldnames': field_names}, encoding="utf-8",)
        csv_data = loader.load()
        embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device':'cuda'})
        db = FAISS.from_documents(csv_data, embedding_model)
        db.save_local(vector_embedding_path)


In [11]:
%cd "/content/gdrive/MyDrive/Python_DA_DS_ML/Gen AI/LLM & Langchain"

/content/gdrive/MyDrive/Python_DA_DS_ML/Gen AI/LLM & Langchain


In [None]:
pdf_data_path = "./PDF"
pdf_vector_embedding_path = "./Vector_DB_PDF"
createPDFVectorDB(pdf_data_path, pdf_vector_embedding_path)

100%|██████████| 1/1 [00:02<00:00,  2.99s/it]


In [19]:
csv_data_path = "./CSV"
csv_vector_embedding_path = "./Vector_DB_CSV"
createCSVVectorDB(csv_data_path, csv_vector_embedding_path)

In [None]:
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device':'cuda'})
test_idex = FAISS.load_local(pdf_vector_embedding_path,embedding)
question = "Artificial Intelligence in budget"
top_k_chunks  = test_idex.similarity_search(question,k=1)
top_k_chunks

[Document(page_content='60. For realizing the vision of “Make AI in India and Make AI work for \nIndia”,  three centres of excellence for Artificial Intelligence will be set-up in \ntop educational institutions. Leading industry players will partner in \nconducting interdisciplinary research, develop cutting-edge applications and \nscalable problem solutions in the areas of agriculture, health, and \nsustainable cities. This will galvanize an effective AI ecosystem and nurture \nquality human resources in the field. \nNational Data Governance Policy  \n61. To unleash innovation and research by start-ups and academia, a \nNational Data Governance Policy will be brought out. This will enable access \nto anonymized data. \nSimplification of Know Your Customer (KYC) process  \n62. The KYC process will be simplified adopting a ‘risk-based’ instead of \n‘one size fits all’ approach. The financial sector regulators will also be', metadata={'source': 'PDF/budget_2023.pdf', 'page': 17})]

In [20]:
embedding = HuggingFaceEmbeddings(model_name='sentence-transformers/all-mpnet-base-v2', model_kwargs={'device':'cuda'})
test_idex = FAISS.load_local(csv_vector_embedding_path,embedding)
question = "Find the Gender	distribution"
top_k_chunks  = test_idex.similarity_search(question,k=1)
top_k_chunks

[Document(page_content='Invoice ID: 192-98-7397\nBranch: C\nCity: Naypyitaw\nCustomer type: Normal\nGender: Male\nProduct line: Fashion accessories\nUnit price: 12.78\nQuantity: 1\nTax 5%: 0.639\nTotal: 13.419\nDate: 1/8/2019\nTime: 14:11\nPayment: Ewallet\ncogs: 12.78\ngross margin percentage: 4.761904762\ngross income: 0.639\nRating: 9.5', metadata={'source': './CSV/supermarket_sales.csv', 'row': 444})]

In [None]:
def _createQuestionPrompt(question, top_k_chunks):
    prompt= ""
    prompt += 'search results:\n\n'
    for i in range(len(top_k_chunks)):
        meta_info = '['+top_k_chunks[i].metadata['source'].split('/')[-1] + " , page : " + str(top_k_chunks[i].metadata['page'])+'] '
        page_content = top_k_chunks[i].page_content.replace('\n', ' ')
        page_content = re.sub('\s+', ' ', page_content)
        combined_content = meta_info + page_content
        prompt = combined_content +'\n\n'
    prompt += "Instructions: Compose a comprehensive reply to the query using the search results given."\
                "Cite each reference using [pdfname.pdf , page : number] notation (every result has this number at the beginning)."\
                "Citation should be done at the end of each sentence. If the search results mention multiple subjects"\
                "with the same name, create separate answers for each. Only include information found in the results and"\
                "don't add any additional information. Make sure the answer is correct and don't output false content."\
                "If the text does not relate to the query, simply state 'Found Nothing'. Don't write 'Answer:'"\
                "Directly start the answer.\n"
    prompt+= f"Query : {question} \n\n"
    return prompt

In [None]:
question = "Artificial Intelligence in budget"
prompt =  _createQuestionPrompt(question, top_k_chunks)

In [None]:
print(prompt)

[budget_2023.pdf , page : 17] 60. For realizing the vision of “Make AI in India and Make AI work for India”, three centres of excellence for Artificial Intelligence will be set-up in top educational institutions. Leading industry players will partner in conducting interdisciplinary research, develop cutting-edge applications and scalable problem solutions in the areas of agriculture, health, and sustainable cities. This will galvanize an effective AI ecosystem and nurture quality human resources in the field. National Data Governance Policy 61. To unleash innovation and research by start-ups and academia, a National Data Governance Policy will be brought out. This will enable access to anonymized data. Simplification of Know Your Customer (KYC) process 62. The KYC process will be simplified adopting a ‘risk-based’ instead of ‘one size fits all’ approach. The financial sector regulators will also be

Instructions: Compose a comprehensive reply to the query using the search results given

In [None]:
model_path = "./LLAMA2_Model/llama-2-7b-chat.ggmlv3.q8_0.bin"
llm = CTransformers(
    model=model_path,callbacks=[StreamingStdOutCallbackHandler()],
    model_type='llama',temperature = 0
)

In [None]:
### Without Callback
start_time = time.time()
prompt =  _createQuestionPrompt(question, top_k_chunks)
response = llm(prompt)
end_time =  time.time()
print("Execution Time in seconds : ", end_time-start_time)

Answer :
The Union Budget for 2023-24 has allocated significant funds towards the development and promotion of Artificial Intelligence (AI) in India. According to [budget_2023.pdf , page : 17], three centers of excellence for AI will be set up in top educational institutions, leading industry players will partner in conducting interdisciplinary research, and cutting-edge applications and scalable problem solutions will be developed in the areas of agriculture, health, and sustainable cities. This will help galvanize an effective AI ecosystem and nurture quality human resources in the field.
the field [170 
the field. National Data Governance.
  63 the field.


the field.

the field. Further, [170the field.
the field.

the field.
the field. Additionally, the field (Buding.


the field.
the field.
the field.
the field.
the field.

  the field.
the field.
the field.
the field [75 the field.

the field.Execution Time in seconds :  558.8962256908417


In [None]:
## With Callback
start_time = time.time()
prompt =  _createQuestionPrompt(question, top_k_chunks)
response = llm(prompt)
end_time =  time.time()
print("\nExecution Time in seconds : ", end_time-start_time)

Budget 2023 has allocated Rs 100 crore for AI development and deployment, as part of its larger goal to transform India into a leading AI hub. (budget_2023.pdf , page 17) 
To realize the vision of "Make AI in India and Make AI work for India", three centers of excellence for Artificial Intelligence will be set-up in top educational institutions. (budget_2023.pdf , page 17) 60. Leading industry players will partner in conducting interdisciplinary research, develop cutting-edge applications and edge applications and edgedge
Execution Time in seconds :  442.3121931552887


In [None]:
from IPython.display import HTML
display(HTML('''
<style>
  pre {
      white-space: normal;
  }
</style>
'''))
print(response)

Budget 2023 has allocated Rs 100 crore for AI development and deployment, as part of its larger goal to transform India into a leading AI hub. (budget_2023.pdf , page 17) 
To realize the vision of "Make AI in India and Make AI work for India", three centers of excellence for Artificial Intelligence will be set-up in top educational institutions. (budget_2023.pdf , page 17) 60. Leading industry players will partner in conducting interdisciplinary research, develop cutting-edge applications and edge applications and edgedge
