## SQL

In [4]:
import os
import openai
import pandas as pd
from dotenv import load_dotenv
import mysql.connector as connection 
from langchain.chat_models import ChatOpenAI
from langchain.agents import create_sql_agent
from langchain.sql_database import SQLDatabase
from langchain.agents.agent_toolkits import SQLDatabaseToolkit

load_dotenv()
os.environ['OPENAI_API_KEY'] = os.getenv('OPENAI_KEY')
openai.api_key = os.getenv('OPENAI_KEY')

class SQLQuerywithLangchain:
    def __init__(self):
        self.DB_USERNAME = os.getenv('DB_USERNAME')
        self.DB_PASSWORD = os.getenv('DB_PASSWORD')
        self.DB_HOST = os.getenv('DB_HOST')
        self.DB_PORT = os.getenv('DB_PORT')
        self.DB_NAME = os.getenv('DB_NAME')

    def createDBConnectionString(self):
        db_user = self.DB_USERNAME
        db_Password  = self.DB_PASSWORD
        db_host = self.DB_HOST + self.DB_PORT
        db_name = self.DB_NAME
        connectionString = f"mysql+pymysql://{db_user}:{db_Password}@{db_host}/{db_name}"
        return connectionString
    
    def getSQLSchema(self):
            ''''
                Extracting the schema info from the MySQL database and passing the schema 
                information to the prompt.
            '''
            sql_query = f"""  
            SELECT C.TABLE_NAME, C.COLUMN_NAME, C.DATA_TYPE, T.TABLE_TYPE, T.TABLE_SCHEMA  
            FROM INFORMATION_SCHEMA.COLUMNS C  
            JOIN INFORMATION_SCHEMA.TABLES T ON C.TABLE_NAME = T.TABLE_NAME AND C.TABLE_SCHEMA = T.TABLE_SCHEMA  
            WHERE T.TABLE_SCHEMA = '{self.DB_NAME}' 
            """ 
            mysql_connection_string = self.createDBConnectionString()
            result = pd.read_sql_query(sql_query, mysql_connection_string)
            df = result.infer_objects()
            output=[]
            current_table = ''  
            columns = []  
            for index, row in df.iterrows():
                table_name = f"{row['TABLE_SCHEMA']}.{row['TABLE_NAME']}"  
                column_name = row['COLUMN_NAME']  
                data_type = row['DATA_TYPE']  
                if " " in table_name:
                    table_name= f"[{table_name}]" 
                column_name = row['COLUMN_NAME']  
                if " " in column_name:
                    column_name= f"[{column_name}]" 
                    # If the table name has changed, output the previous table's information  
                if current_table != table_name and current_table != '':  
                    output.append(f"table: {current_table}, columns: {', '.join(columns)}")  
                    columns = []  
                
                # Add the current column information to the list of columns for the current table  
                columns.append(f"{column_name} {data_type}")  
                
                # Update the current table name  
                current_table = table_name  

            # Output the last table's information  
            output.append(f"table: {current_table}, columns: {', '.join(columns)}")
            output = "\n ".join(output)

            return output   

    def createAgentExecutor(self, openAI_model_name="gpt-3.5-turbo"):
        '''
            Instantiating Langchain agent to query SQL Database.
            Using SQLDatabaseToolkit from Langchain.
        '''
        mysql_connection_string = self.createDBConnectionString()
        llm = ChatOpenAI(model_name= openAI_model_name )
        db = SQLDatabase.from_uri(mysql_connection_string)
        toolkit = SQLDatabaseToolkit(db=db, llm =llm)
        agent_executor = create_sql_agent(
                                        llm=llm,
                                        toolkit=toolkit,
                                        verbose=True,
                                        return_intermediate_steps=False,
                                        handle_parsing_errors=True)
        return agent_executor

    def fetchQueryResult(self,question):
        '''
            Using langchain's SQL tool to fetch answer to the user's query.
        '''
        db_agent = self.createAgentExecutor()
        schema_info = self.getSQLSchema()
        prompt = f'''You are a professional SQL Data Analyst whose job is to fetch results from the SQL database.\
            The SQL Table schema is as follows {schema_info}.\
            The question will be asked in # delimiter. If you are not able to find the answer write "Found Nothing" in response.\
            Do not write anything out of context or on your own.\
            If the SQL query returns multiple rows then summarize them and provide response using bullet points.For duplicate response after the SQL query consider any one of the result to parse into LLM.\
            Question : # {question} #'''
        db_agent.return_intermediate_steps=True
        agent_response = db_agent(prompt)
        output = agent_response['output']
        # query = agent_response['intermediate_steps'][-1][0].log.split('\n')[-1].split('Action Input:')[-1].strip().strip('"')
        return output 
    

class SQLQuerywithFunctionCalling(SQLQuerywithLangchain):
    def __init__(self):
        super().__init__()

    def getMYSQLConnectionObject(self):
        db_user = self.DB_USERNAME
        db_password  = self.DB_PASSWORD
        db_host = self.DB_HOST
        db_name = self.DB_NAME
        conn = connection.connect(host=db_host,user=db_user,password=db_password,
                                        database=db_name, use_pure=True) 
        if conn.is_connected():
            return conn
        else:
            return "Database connection can't be established"
    
    def defineFunction(self):
        database_schema_string = self.getSQLSchema()
        function = [
            {
                "name": "ask_database",
                "description": "Use this function to answer user questions about product. Output should be a fully formed SQL query.",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {
                            "type": "string",
                            "description": f"""
                                    SQL query extracting info to answer the user's question.
                                    SQL should be written using this database schema:
                                    {database_schema_string}
                                    The query should be returned in plain text, not in JSON.
                                    Do not use new lines chatacthers inside the query.
                                    """,
                        }
                    },
                    "required": ["query"],
                },
            }
        ]

        return function
    
    def ask_database(self,query):
        """Function to query MySQL database with a provided SQL query."""
        try:
            conn = self.getMYSQLConnectionObject()
            cursor=conn.cursor()   
            cursor.execute(query)  
            results = str(cursor.fetchall())
            conn.close()
        except Exception as e:
            results = f"query failed with error: {e}"
        return results
    
    def execute_function_call(self,message):
        if message["function_call"]["name"] == "ask_database":
            query = eval(message["function_call"]["arguments"])["query"]
            results = self.ask_database(query)
        else:
            results = f"Error: function {message['function_call']['name']} does not exist"
        return results
    
    def openai_functions_chain(self,query):
        messages = []
        messages.append({"role": "system", "content": "Answer user questions by generating SQL queries against the CanonDB Database."})
        messages.append({"role": "user", "content": query})
        while True:
            assistant_message = openai.ChatCompletion.create(
                temperature=0,
                model="gpt-3.5-turbo-0613",
                messages=messages,
                functions=self.defineFunction(),
                function_call="auto",
            )["choices"][0]["message"]
            messages.append(assistant_message)

            if assistant_message.get("function_call"):
                print("Executing function: ", assistant_message["function_call"])
                results = self.execute_function_call(assistant_message)
                messages.append({"role": "function", "name": assistant_message["function_call"]["name"], "content": results})
            else:
                break

        return assistant_message['content']
        

In [5]:
question = "Find the competetor product information for economic category on23rd June,2023?"
obj = SQLQuerywithFunctionCalling()
res = obj.openai_functions_chain(question)

Executing function:  {
  "name": "ask_database",
  "arguments": "{\n  \"query\": \"SELECT Competetor_Product_Information FROM canondb.canon WHERE Category = 'economic' AND Date = '2023-06-23'\"\n}"
}


In [6]:
res

'The competitor product information for the economic category on 23rd June, 2023 is: "Due to Growth in Road Logistics Industry New Opportunities are arriving in Transport Sector in the form of Office Expansions and Office Equipment Requirements."'

In [10]:
obj.fetchQueryResult(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: ""[0m
Observation: [38;5;200m[1;3mcanon[0m
Thought:[32;1m[1;3mThe only table in the database is "canon". I should query the schema of this table to see the available columns.
Action: sql_db_schema
Action Input: "canon"[0m
Observation: [33;1m[1;3m
CREATE TABLE canon (
	`index` INTEGER NOT NULL AUTO_INCREMENT, 
	`Date` DATE, 
	`Department` VARCHAR(50), 
	`Product` VARCHAR(50), 
	`Zone` VARCHAR(50), 
	`State` VARCHAR(50), 
	`Category` VARCHAR(50), 
	`Competetor_Product_Information` VARCHAR(2000), 
	`Attachment` VARCHAR(50), 
	PRIMARY KEY (`index`)
)DEFAULT CHARSET=utf8mb4 COLLATE utf8mb4_0900_ai_ci ENGINE=InnoDB

/*
3 rows from canon table:
index	Date	Department	Product	Zone	State	Category	Competetor_Product_Information	Attachment
2	2023-06-23	PCC-BIS	CS	India	#VALUE!	Economic	Due to Growth in Road Logistics Industry New Opportunities are arriving in Transport Sector in the f	Not

'Due to Growth in Road Logistics Industry New Opportunities are arriving in Transport Sector in the form of Office Expansions and Office Equipment Requirements.'

In [8]:
import openai
GPT_MODEL = "gpt-3.5-turbo-0613"

functions = [
    {
        "name": "ask_database",
        "description": "Use this function to answer user questions about music. Output should be a fully formed SQL query.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": f"""
                            SQL query extracting info to answer the user's question.
                            SQL should be written using this database schema:
                            {database_schema_string}
                            The query should be returned in plain text, not in JSON.
                            Do not use new lines chatacthers inside the query.
                            """,
                }
            },
            "required": ["query"],
        },
    }
]


def ask_database(query):
    """Function to query SQLite database with a provided SQL query."""
    try:
        conn = obj.getMYSQLConnectionObject()
        print(conn.is_connected)
        cursor=conn.cursor()   
        cursor.execute(query)  
        results = str(cursor.fetchall())
        conn.close()
    except Exception as e:
        results = f"query failed with error: {e}"
        return results

def execute_function_call(message):
    if message["function_call"]["name"] == "ask_database":
        query = eval(message["function_call"]["arguments"])["query"]
        results = ask_database(query)
    else:
        results = f"Error: function {message['function_call']['name']} does not exist"
    return results


def openai_functions_chain(query):
  messages = []
  messages.append({"role": "system", "content": "Answer user questions by generating SQL queries against the Chinook Music Database."})
  messages.append({"role": "user", "content": query})
  while True:
    assistant_message = openai.ChatCompletion.create(
        temperature=0,
        model=GPT_MODEL,
        messages=messages,
        functions=functions,
        function_call="auto",
    )["choices"][0]["message"]
    messages.append(assistant_message)

    if assistant_message.get("function_call"):
      print("Executing function: ", assistant_message["function_call"])
      results = execute_function_call(assistant_message)
      messages.append({"role": "function", "name": assistant_message["function_call"]["name"], "content": results})
    else:
      break

  return assistant_message["function_call"],assistant_message['content']

NameError: name 'database_schema_string' is not defined

In [44]:
print("Response: ", openai_functions_chain(question))

Executing function:  {
  "name": "ask_database",
  "arguments": "{\n  \"query\": \"SELECT Competetor_Product_Information FROM canondb.canon WHERE Category = 'economic' AND Date = '2023-06-23'\"\n}"
}
Response:  Apologies, but I am currently unable to access the database to retrieve the competitor product information for the economic category on June 23rd, 2023. Please try again later.


In [38]:
results = ask_database(question)
results



In [7]:
conn = obj.getMYSQLConnectionObject()
print(conn.is_connected())

<mysql.connector.connection.MySQLConnection object at 0x000001C410641270>
True


In [60]:
def getMYSQLConnectionObject():
    
    db_user= os.getenv('DB_USERNAME')
    db_password = os.getenv('DB_PASSWORD')
    db_host_only = os.getenv('DB_HOST_ONLY')
    db_name = os.getenv('DB_NAME')
    conn = connection.connect(host=db_host_only,user=db_user,password=db_password,
                                    database=db_name, use_pure=True) 
    print(conn)
    print(db_user, db_password, db_host_only, db_name)
    if conn.is_connected():
        return conn
    else:
        return "Database connection can't be established"

In [55]:
conn = getMYSQLConnectionObject()

<mysql.connector.connection.MySQLConnection object at 0x0000027062110F40>
root 1320 localhost canonDB


In [59]:
os.getenv('DB_HOST')

'localhost:3306'

In [58]:
os.getenv('DB_PASSWORD')

'1320'

In [37]:
import mysql.connector as connection 
mydb=connection.connect(host='localhost',user=DB_USERNAME,password=DB_PASSWORD,use_pure=True)

In [4]:
mydb.is_connected()

True

In [13]:
mydb=connection.connect(host=DB_HOST,user=DB_USERNAME,password=DB_PASSWORD,
                            database="canonDB", use_pure=True) 
print(mydb.is_connected())
query="Select * from canon"
cursor=mydb.cursor()   
cursor.execute(query)        
result = cursor.fetchall()
print(result)

True
[(2, datetime.date(2023, 6, 23), 'PCC-BIS', 'CS', 'India', '#VALUE!', 'Economic', 'Due to Growth in Road Logistics Industry New Opportunities are arriving in Transport Sector in the form of Office Expansions and Office Equipment Requirements.', 'Not Available'), (3, datetime.date(2023, 6, 23), 'PCC-BIS', 'CS', 'West', '#VALUE!', 'Product', 'Sharp organized partners meet synergy at Goa invited 200 key partner and showcasing its office solution equipment’s including laptops and unveiling there newly launched MFD’s BP-70C and BP-60C.', 'Not Available'), (4, datetime.date(2023, 6, 30), 'PCC-PPP', 'PPP', 'India', '#VALUE!', 'Product', 'HP is very aggressively positioning Z6 series in photo vertical to gain market share. 24 inch is being positioned in photo segment and 44 inch in corporate and government segment.', 'Not Available'), (5, datetime.date(2023, 6, 29), 'PBC', 'ICB', 'East', '#VALUE!', 'Product', 'Nikon & Sony started promoting Z8 and Zve1 visibility, across MBOs', 'Not Avail

In [5]:
sql_query= 'select * from canon'
pd.read_sql_query(sql_query, connectionString)

Unnamed: 0,index,Date,Department,Product,Zone,State,Category,Competetor_Product_Information,Attachment
0,2,2023-06-23,PCC-BIS,CS,India,#VALUE!,Economic,Due to Growth in Road Logistics Industry New O...,Not Available
1,3,2023-06-23,PCC-BIS,CS,West,#VALUE!,Product,Sharp organized partners meet synergy at Goa i...,Not Available
2,4,2023-06-30,PCC-PPP,PPP,India,#VALUE!,Product,HP is very aggressively positioning Z6 series ...,Not Available
3,5,2023-06-29,PBC,ICB,East,#VALUE!,Product,Nikon & Sony started promoting Z8 and Zve1 vis...,Not Available
4,6,2023-06-22,CBC-PPP,PPP,India,#VALUE!,Market footfall,Konica Minolta demo on wheels had good respons...,Not Available
...,...,...,...,...,...,...,...,...,...
82,84,2023-07-14,PBC-South,ICB,South,#VALUE!,Product,Sony A7M4 sellout is good in the market.,Not Available
83,85,2023-07-14,PBC-South,ICB,South,#VALUE!,Product,Sony New model A7R5 stocks reached outlets las...,Not Available
84,86,2023-07-14,PBC-South,ICB,South,#VALUE!,Product,Sony APS-C stocks likely to hit market post 15...,Not Available
85,87,2023-07-14,PBC-South,ICB,South,#VALUE!,Product,Nikon Z8 stocks are available in market but no...,Not Available


## PDF

In [7]:
import os 
import re
import time
import copy
import fitz
import numpy as np
import pinecone as pc
from tqdm.auto import tqdm
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain.vectorstores import  Pinecone #,FAISS,
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
# from langchain.text_splitter import RecursiveCharacterTextSplitter
# from langchain.document_loaders import PyPDFLoader, DirectoryLoader

load_dotenv()

class CustomTextSplitter:

    def __init__(self, chunk_size, chunk_overlap):
        self.keep_separator= False
        self.strip_whitespace=True
        self.is_separator_regex=False
        self.add_start_index=False
        self.length_function = len
        self.chunk_size=chunk_size
        self.chunk_overlap = chunk_overlap
        self.separators=["\n\n", "\n", " ", ""]

    def _split_text_with_regex(self,text: str, separator: str):
        # Now that we have the separator, split the text
        if separator:
            if self.keep_separator:
                # The parentheses in the pattern keep the delimiters in the result.
                _splits = re.split(f"({separator})", text)
                splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
                if len(_splits) % 2 == 0:
                    splits += _splits[-1:]
                splits = [_splits[0]] + splits
            else:
                splits = re.split(separator, text)
        else:
            splits = list(text)
        return [s for s in splits if s != ""]
    
    def _join_docs(self, docs, separator:str,):
        text = separator.join(docs)
        if self.strip_whitespace:
            text = text.strip()
        if text == "":
            return None
        else:
            return text
        

    def _merge_splits(self, splits, separator: str):
        # We now want to combine these smaller pieces into medium size
        # chunks to send to the LLM.
        separator_len = self.length_function(separator)

        docs = []
        current_doc= []
        total = 0
        for d in splits:
            _len = self.length_function(d)
            if (
                total + _len + (separator_len if len(current_doc) > 0 else 0)
                > self.chunk_size
            ):
                if total > self.chunk_size:
                    print(
                        f"Created a chunk of size {total}, "
                        f"which is longer than the specified {self.chunk_size}"
                    )
                if len(current_doc) > 0:
                    doc = self._join_docs(current_doc, separator)
                    if doc is not None:
                        docs.append(doc)
                    # Keep on popping if:
                    # - we have a larger chunk than in the chunk overlap
                    # - or if we still have any chunks and the length is long
                    while total > self.chunk_overlap or (
                        total + _len + (separator_len if len(current_doc) > 0 else 0)
                        > self.chunk_size
                        and total > 0
                    ):
                        total -= self.length_function(current_doc[0]) + (
                            separator_len if len(current_doc) > 1 else 0
                        )
                        current_doc = current_doc[1:]
            current_doc.append(d)
            total += _len + (separator_len if len(current_doc) > 1 else 0)
        doc = self._join_docs(current_doc, separator)
        if doc is not None:
            docs.append(doc)
        return docs
    
    def _split_text(self,text: str,):
        """Split incoming text and return chunks."""
        final_chunks = []
        # Get appropriate separator to use
        separator = self.separators[-1]
        new_separators = []
        for i, _s in enumerate(self.separators):
            _separator = _s if self.is_separator_regex else re.escape(_s)
            if _s == "":
                separator = _s
                break
            if re.search(_separator, text):
                separator = _s
                new_separators = self.separators[i + 1 :]
                break

        _separator = separator if self.is_separator_regex else re.escape(separator)
        splits = self._split_text_with_regex(text, _separator)

        # Now go merging things, recursively splitting longer texts.
        _good_splits = []
        _separator = "" if self.keep_separator else separator
        for s in splits:
            if self.length_function(s) < self.chunk_size:
                _good_splits.append(s)
            else:
                if _good_splits:
                    merged_text = self._merge_splits(_good_splits, _separator)
                    final_chunks.extend(merged_text)
                    _good_splits = []
                if not new_separators:
                    final_chunks.append(s)
                else:
                    other_info = self._split_text(s, new_separators)
                    final_chunks.extend(other_info)
        if _good_splits:
            merged_text = self._merge_splits(_good_splits, _separator)
            final_chunks.extend(merged_text)
        return final_chunks
    

    def create_documents(self, texts, pdf_name):
        """Create documents from a list of texts."""
        documents = []
        for i, text in enumerate(texts):
            index = -1
            for chunk in self._split_text(text):
                if self.add_start_index:
                    index = text.find(chunk, index + 1)
                new_doc = Document(page_content=chunk, metadata={})
                new_doc.metadata['source'] = pdf_name
                documents.append(new_doc)
        return documents
    
class PdfTextExtractor:
    def __init__(self, pdf_path):
        self.pdf_path = pdf_path 
        self.start_page = 1
        self.end_page = None

    def _preprocess(self,text):
        '''
        preprocess extrcted text from pdf
        1. Replace new line character with whitespace.
        2. Replace redundant whitespace with a single whitespace
        '''
        text = text.replace('\n', ' ')
        text = re.sub('\s+', ' ', text)
        text = re.sub(r'\\u[e-f][0-9a-z]{3}',' ', text)
        return text
    
    def _pdf_to_text(self, pdf_filename):
        '''
            convert pdf to a list of words.
        '''
        doc = fitz.open(self.pdf_path)
        total_pages= doc.page_count

        if self.end_page is None:
            self.end_page = total_pages
        text_list=[]

        for i in tqdm(range(self.start_page-1, self.end_page)):
            text= doc.load_page(i).get_text('text')
            text= self._preprocess(text)
            text_list.append(text+ f' [{pdf_filename}, page:{i+1}]')
        doc.close()
        return text_list
    

class Data:

    def __init__(self, pdf_data_path, vector_db_path):

        self.pdf_data_path = pdf_data_path
        self.vector_db_path = vector_db_path
        self.pinecone_api_key = os.getenv('PINECONE_KEY')
        self.pinecone_env = os.getenv('PINECONE_ENV')
        self.embedding_model = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
        self.openai_embedding_model = OpenAIEmbeddings(model='text-embedding-ada-002',
                                                        openai_api_key=os.getenv('OPENAI_KEY'))
        
    # def createPDFVectorDBwithFAISS(self, chunk_size, chunk_overlap):
    #     document_list=[]
    #     for pdf_filename in os.listdir(self.pdf_data_path):
    #         pdf_file_path = os.path.join(self.pdf_data_path,pdf_filename)
    #         extracted_text_list = PdfTextExtractor(pdf_file_path)._pdf_to_text(pdf_filename)
    #         merged_text_list = ['.'.join(extracted_text_list)]
    #         splitter = CustomTextSplitter(chunk_size, chunk_overlap)
    #         docs  = splitter.create_documents(merged_text_list,pdf_filename)
    #         document_list.extend(docs)
    
    #     db = FAISS.from_documents(document_list, self.embedding_model)
    #     db.save_local(self.vector_db_path)

    # def create_top_k_chunk_from_FAISS(self, question,top_k):
    #     test_idex = FAISS.load_local(self.vector_db_path,self.embedding_model)
    #     top_k_chunks  = test_idex.similarity_search(question,k=top_k)
    #     return top_k_chunks
    
    # def fetch_FAISS_VectorDB(self):
    #     test_index = FAISS.load_local(self.vector_db_path,self.embedding_model)
    #     return test_index


    def createPDFVectorDBwithPinecone(self,chunk_size, chunk_overlap):
        document_list=[]
        for pdf_filename in os.listdir(self.pdf_data_path):
            pdf_file_path = os.path.join(self.pdf_data_path,pdf_filename)
            extracted_text_list = PdfTextExtractor(pdf_file_path)._pdf_to_text(pdf_filename)
            merged_text_list = ['.'.join(extracted_text_list)]
            splitter = CustomTextSplitter(chunk_size, chunk_overlap)
            docs  = splitter.create_documents(merged_text_list,pdf_filename)
            document_list.extend(docs)

        embeddings = []
        ids = []
        metadatas = []
        for i in range(len(document_list)):
            if i%5==0:
                time.sleep(5)
            page_content = document_list[i].page_content
            source_pdf = document_list[i].metadata['source'].split('\\')[-1]
            embedded_page_content = self.openai_embedding_model.embed_query(page_content)
            metadata = {
                'source' : source_pdf,
                'page_content' : page_content
            }
            ids.append(str(i))
            embeddings.append(embedded_page_content)
            metadatas.append(metadata)

        pc.init(api_key=self.pinecone_api_key, environment=self.pinecone_env)
        index = pc.Index('pdf-index')
        index.upsert(vectors = zip(ids, embeddings, metadatas))

    def create_top_k_chunk_from_Pinecone(self, question,top_k):
        pc.init(api_key=self.pinecone_api_key, environment=self.pinecone_env)
        index = pc.Index('pdf-index')
        vectorstore = Pinecone( index, self.openai_embedding_model.embed_query, text_key='page_content')
        top_k_chunks = vectorstore.similarity_search(question, k=top_k)
        return top_k_chunks


# pdf_data_path = ".\\media"
# pdf_vector_embedding_path = ".\\VectorDB"
# data_obj = Data(pdf_data_path,pdf_vector_embedding_path)
# data_obj.createPDFVectorDBwithFAISS(chunk_size=2000, chunk_overlap=500)
# # data_obj.createPDFVectorDBwithPinecone(chunk_size=2000, chunk_overlap=500)
# test_question = "Find the cost of Sony ZV-E1 Full Frame camera"
# result = data_obj.create_top_k_chunk_from_FAISS(test_question, top_k =3)
# # result = data_obj.create_top_k_chunk_from_Pinecone(test_question, top_k =3)
# print(result)
# print(result[0].metadata['source'])
# print(result[0].metadata['page'])
# print(result[0].page_content)


  from tqdm.autonotebook import tqdm


In [8]:
pdf_data_path = ".\\media"
pdf_vector_embedding_path = ".\\VectorDB"
input_question = "What is the price of Sony ZV-E1 Camera ?"
data_obj = Data(pdf_data_path, pdf_vector_embedding_path)
top_k_chunks = data_obj.create_top_k_chunk_from_Pinecone(input_question,top_k=2)
top_k_chunks



[Document(page_content='Price: MRP:Rs. 214,990 Amazon Offer Price: Rs. 1,88,990 [Sony ZV-E1 Full-Frame Interchangeable-Lens Mirrorless vlog Camera.pdf, page:1]. Technical Details: Brand Sony Manufacturer Sony, Sony Corporation, 1-7-1 Konan, Minato-KU, Tokyo 108-0075, Japan Model ZV-E1/BQ IN5 Model Name ZV-E1 Model Year 2023 Product Dimensions 5.43 x 12.1 x 7.19 cm; 483 Grams Batteries 1 Lithium Ion batteries required. (included) Item model number ZV-E1/BQ IN5 Memory Slots Available 1 Compatible Devices Laptop, Desktop, Tablet, Smartphone Special Features Face Detection Mounting Hardware Camera, Rechargeable Battery NP-FZ100, Shoulder strap, Wind Screen, Wind Screen Adapter, Body cap, Accessory shoe cap Number of items 1 Standing screen display size [Sony ZV-E1 Full-Frame Interchangeable-Lens Mirrorless vlog Camera.pdf, page:2].7.5 Centimetres Display Type LCD Image stabilization technology Digital Has Image Stabilisation Yes Optical zoom 1 Aspect Ratio 16:9 Resolution 1,036,800 Dots Ma

In [10]:
len(top_k_chunks[0].page_content)

1995