<a href="https://colab.research.google.com/github/JesseJames50/projeto_llm/blob/main/sistema_pdf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install qdrant-client openai langchain PyPDF2
!pip install -U langchain-openai

In [None]:
# Upgrade the OpenAI SDK to its latest release.
# NOTE(review): presumably required for the `from openai import OpenAI` client used in the setup cell - confirm.
!pip install openai --upgrade

In [None]:
# python-dotenv supplies `load_dotenv`, which the setup cell calls before reading the API keys.
!pip install python-dotenv

In [132]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, VectorParams, Distance
from uuid import uuid4
from datetime import datetime
import os
import PyPDF2
from langchain import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
import openai
from openai import OpenAI
from dotenv import load_dotenv


# Load environment variables from the .env file
load_dotenv()
# os.getenv returns None when a variable is not set.
# NOTE(review): "apenai_key" looks like a typo for "openai_key"; the name is
# left unchanged because other cells may reference it.
apenai_key = os.getenv('OPENAI_API_KEY')
qdrant_key = os.getenv('QDRANT_API_KEY')

# Configure the OpenAI API key.
# `OpenAI` here is the class from the `openai` package: its import comes after
# (and therefore shadows) the `langchain.llms.OpenAI` import in the imports cell.
client = OpenAI(
  api_key= apenai_key,  # this is also the default, it can be omitted
)

# Name of the Qdrant collection
collection_name = 'documents_collection'

# Configure the Qdrant client
client_Qdrant = QdrantClient(
    url="https://20344d73-d460-4ad6-972f-7626b4af36bf.us-east4-0.gcp.cloud.qdrant.io:6333",
    api_key=qdrant_key
)

# Create the collection in Qdrant.
# Only create it when it does not exist yet, so re-running this cell is safe.
if not client_Qdrant.collection_exists(collection_name):
    client_Qdrant.create_collection(
        collection_name=collection_name,
        # NOTE(review): size=1536 must equal the length of the vectors returned by
        # generate_embeddings (default model "text-embedding-ada-002") - confirm
        # if the embedding model is ever changed.
        vectors_config=VectorParams(size=1536, distance=Distance.COSINE)
    )



## Função para Extrair Texto dos PDFs

In [133]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        str: The extracted text of all pages joined with no separator.
        A page for which no text could be extracted contributes an empty
        string, so the result is "" for a PDF with no extractable text.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` guards against a page whose extract_text() yields None
        # (e.g. an image-only page), which would otherwise break the
        # concatenation with a TypeError. The join happens while the file is
        # still open because pages are read from it on demand.
        return "".join(page.extract_text() or "" for page in reader.pages)


## Função para Dividir o Texto em Partes Menores

In [134]:
def get_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split a text into overlapping chunks using CharacterTextSplitter.

    Args:
        text: The text to split.
        chunk_size: Target maximum chunk length, measured with ``len``.
        chunk_overlap: Number of characters shared between neighbouring chunks.
        separator: String on which the text is split before chunks are built.

    Returns:
        list[str]: The chunks in document order; an empty list when ``text``
        is empty or None.
    """
    # Nothing to split: avoid building a splitter (and failing on None).
    if not text:
        return []

    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    return text_splitter.split_text(text)

## Função para Identificar o Assunto com Langchain

In [135]:
# Prompt template used to ask the model for the subject of a text.
# Its single input variable, "text", fills the {text} placeholder.
prompt_template_subject = PromptTemplate(
    input_variables=["text"],
    template="Identifique o assunto do seguinte texto:\n\n{text}\n\nAssunto:"
)

# Identify the subject of a single chunk
def identify_subject(text):
    """Ask the chat model for the subject of ``text``.

    The text is inserted into ``prompt_template_subject`` and sent as a single
    user message to the "gpt-3.5-turbo" chat completion endpoint.

    Args:
        text: The chunk of text whose subject should be identified.

    Returns:
        str: The model's answer with surrounding whitespace removed, or ""
        when the response carries no text content.
    """
    prompt = prompt_template_subject.format(text=text)
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
        # Deterministic sampling: identify_subjects picks the most frequent
        # label, which only works if equal inputs tend to get equal wording.
        temperature=0,
    )
    # `content` can be None (e.g. a filtered/empty reply); calling .strip()
    # on it directly would raise AttributeError.
    content = response.choices[0].message.content
    return (content or "").strip()

# Identify the subject of a document from a sample of its chunks
def identify_subjects(text, num_chunks=5):
    """Return the most frequent subject among a sample of the text's chunks.

    The sample is the first ``num_chunks`` and the last ``num_chunks`` chunks
    produced by ``get_chunks``. When the document has at most
    ``2 * num_chunks`` chunks, every chunk is used exactly once instead, so no
    chunk is sent to the model (and counted) twice.

    Args:
        text: Full document text.
        num_chunks: Number of chunks taken from each end of the document.
            Values below 1 select no chunks.

    Returns:
        str: The subject returned most often by ``identify_subject``; ties go
        to the subject seen first. "" when there is nothing to sample.
    """
    chunks = get_chunks(text)
    num_chunks = max(num_chunks, 0)

    if len(chunks) <= 2 * num_chunks:
        # Head and tail would overlap: use each chunk once.
        sample_chunks = list(chunks)
    else:
        # Explicit start index instead of chunks[-num_chunks:], because
        # chunks[-0:] would select the whole list when num_chunks is 0.
        tail_start = len(chunks) - num_chunks
        sample_chunks = chunks[:num_chunks] + chunks[tail_start:]

    if not sample_chunks:
        return ""

    # Insertion-ordered tally: max() returns the first key reaching the top
    # count, which makes tie-breaking deterministic.
    counts = {}
    for chunk in sample_chunks:
        subject = identify_subject(chunk)
        counts[subject] = counts.get(subject, 0) + 1
    return max(counts, key=counts.get)


## Função para Gerar Embeddings com OpenAI

In [136]:
def generate_embeddings(text, model="text-embedding-ada-002"):
    """Embed ``text`` with the OpenAI embeddings endpoint.

    Newlines in ``text`` are replaced by spaces before the request is sent.

    Args:
        text: The string to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    single_line = " ".join(text.split("\n"))
    result = client.embeddings.create(model=model, input=single_line)
    first_item = result.data[0]
    return first_item.embedding

## Inserção dos Dados no Qdrant

In [148]:
def insert_documents_into_qdrant(directory, collection_name="documents_collection", batch_size=64):
    """Embed every PDF in ``directory`` and store its chunks in Qdrant.

    For each PDF the text is extracted, split into chunks, labelled with one
    subject for the whole document, and each chunk is embedded and stored as
    one point with a fresh random UUID.

    Args:
        directory: Folder that is scanned (non-recursively) for PDF files.
        collection_name: Qdrant collection that receives the points.
        batch_size: Maximum number of points sent per upsert request; values
            below 1 are treated as 1.

    Returns:
        None. Points are written through ``client_Qdrant.upsert``.

    Note:
        Because ids are random, running this again for the same files adds
        new points rather than replacing the earlier ones.
    """
    current_date = datetime.now().isoformat()
    batch_size = max(1, batch_size)

    # sorted() gives a stable processing order; os.listdir order is arbitrary.
    for filename in sorted(os.listdir(directory)):
        # Case-insensitive match so "REPORT.PDF" is not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(directory, filename)
        text = extract_text_from_pdf(pdf_path)
        chunks = get_chunks(text)
        if not chunks:
            # No extractable text: skip before spending any model calls.
            continue

        subject = identify_subjects(text)
        points = [
            PointStruct(
                id=str(uuid4()),
                vector=generate_embeddings(chunk),
                payload={
                    "assunto": subject,
                    "text": chunk,
                    "datainclusao": current_date,
                    "data_modificacao": current_date
                },
            )
            for chunk in chunks
        ]

        # Send points in batches rather than one request per chunk, while
        # keeping each request bounded in size.
        for start in range(0, len(points), batch_size):
            client_Qdrant.upsert(
                collection_name=collection_name,
                points=points[start:start + batch_size],
            )

# Insert the documents found in the "/content/data" folder
insert_documents_into_qdrant("/content/data")



## Função para Buscar por Assunto

In [165]:
def search_by_subject(subject, client, collection_name, top_k=5):
    """Run a vector search for ``subject`` against a collection.

    The query text is embedded with ``generate_embeddings`` and the resulting
    vector is passed to ``client.search``.

    Args:
        subject: Query text to embed and search for.
        client: Object exposing ``search(collection_name=..., query_vector=...,
            limit=...)``; the example cell passes the Qdrant client.
        collection_name: Name of the collection to search.
        top_k: Value passed as ``limit`` to the search call.

    Returns:
        Whatever ``client.search`` returns.

    NOTE(review): newer qdrant-client releases reportedly deprecate ``search``
    in favour of ``query_points`` - confirm before upgrading the client.
    """
    return client.search(
        limit=top_k,
        query_vector=generate_embeddings(subject),
        collection_name=collection_name,
    )




## Exemplo de busca por assunto

In [164]:
# Example search by subject
subject_query = "Faça um resumo sobre o que é a escola austríaca."
results = search_by_subject(subject_query, client_Qdrant, "documents_collection")

# Print the stored chunk text ("text" payload field) of each hit
for result in results:
    print(result.payload["text"])

Query vector: [-0.009322420693933964, 0.01292368397116661, 0.011410387232899666, -0.017240092158317566, -0.01653771847486496, 0.020279457792639732, -0.01859375834465027, -0.0070684379898011684, 0.016652652993798256, -0.016946371644735336, -0.015401149168610573, 0.003706619841977954, 0.0040386514738202095, -0.011084740981459618, 0.006193662993609905, -0.028988895937800407, 0.04288313537836075, -0.00730469124391675, -0.00016192517068702728, 0.010605849325656891, -0.004705906845629215, 0.004054614342749119, -0.008256088942289352, -0.007074823137372732, 0.009558673948049545, 0.0052199168130755424, 0.01292368397116661, 0.0016968720592558384, 0.018414972350001335, -0.024468161165714264, 0.009169175289571285, -0.010312129743397236, 0.004772951360791922, -0.016576029360294342, -0.008696668781340122, -0.007081208284944296, 0.0011365690734237432, -0.012004212476313114, 0.013370649889111519, 0.00678110308945179, 0.016218457370996475, -0.0075920261442661285, 0.0006181690841913223, -0.0088946111500