In [25]:
import os
import openai
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.document_loaders import TextLoader
from dotenv import load_dotenv

In [9]:
# Carregar variáveis de ambiente
load_dotenv()

# Configurar chave da API do OpenAI
openai.api_key = os.getenv("OPENAI_API_KEY")

In [10]:
# Função para ler o PDF e dividir em chunks
def load_and_split_pdf(pdf_path):
    # Ler o PDF
    reader = PdfReader(pdf_path)
    text = ""
    
    for page in reader.pages:
        text += page.extract_text()
    
    # Dividir o texto em chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.split_text(text)
    
    return chunks

In [11]:
# Função para gerar embeddings e armazenar em um índice FAISS
def create_vector_store(chunks):
    embeddings = OpenAIEmbeddings()
    vector_store = FAISS.from_texts(chunks, embeddings)
    return vector_store

In [None]:
from langchain.llms import OpenAI  # Certifique-se de importar a classe OpenAI
from langchain.chains import RetrievalQA

def query_rag(question, vector_store):
    # Usando OpenAI LLM com a chave da API
    llm = OpenAI(model_name="text-davinci-003", temperature=0, openai_api_key=openai.api_key)
    
    # Criar a cadeia de perguntas e respostas com o retriever
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",  # Ou outro tipo dependendo do seu caso
        retriever=vector_store.as_retriever(),
    )
    
    # Obter a resposta
    response = qa_chain.run(question)
    
    return response

In [42]:
# Caminho para o PDF
pdf_path = 'modelo operadora3.pdf'

# Carregar e dividir o PDF em chunks
chunks = load_and_split_pdf(pdf_path)

# Criar o vector store com os chunks
vector_store = create_vector_store(chunks)

In [43]:
# Perguntar algo sobre o PDF
question = """
    You are an expert in analyzing raw logs and unstructured text. Your task is to extract **all IP addresses (IPv4 and IPv6)** and the **exact timestamp** associated with each IP.

    Instructions:
    1. From the text below, find all valid IPv4 and IPv6 addresses.
    2. For each IP address, find the **full date and time** (timestamp) that appears **closest and most directly associated** with that IP.
    3. Return only the IP address and its timestamp in this exact format: `IP_ADDRESS - TIMESTAMP`
    4. Check if the IP address has a timestamp associated with it. If not, do not include it in the results.
    5. If multiple timestamps are present near the same IP, choose the most **complete and specific one** (e.g., including date + time + timezone if possible).
    6. Each result must be on a **separate line**.
    7. Do NOT include any explanation, markdown, bullet points, or extra formatting.


    Text:
    {chunk}
"""
response = query_rag(question, vector_store)

print("Resposta:", response)

Resposta: 
111.11.111.11 - 2000-01-11 01:01:01 UTC
111.11.111.11 - 2000-01-11 01:01:01 UTC
111.11.111.11 - 2000-01-11 01:01:01 UTC
111.11.111.11 - 2000-01-11 01:01:01 UTC
111.11.111.1 - 2000-01-11 01:01:01 UTC
111.11.111.11 - 2000-01-11 01:01:01 UTC
111.11.111.1 - 2000-01-11 01:01:01 UTC
1111:11d:1e11:11bc:bc11:1111:11ba:4e11 - 2000-01-11 01:01:01 UTC
111.11.111.1 - 2000-01-11 01:01:01 UTC
111.11.111.1 - 2000-01-11 01:01:01 UTC
111.11.111.1


In [39]:
response

'\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:47:31 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:38:38 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:32:11 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:27:55 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:22:00 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:06:55 UTC\n1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:55:45 UTC\n1101:11:11a:0a111:0:a11:aa'

In [44]:
import pandas as pd

def texto_para_df(texto):
    # Criando uma lista de dicionários, onde cada dicionário é uma linha do DataFrame
    dados = []
    
    # Dividir o texto em linhas
    linhas = texto.split('\n')
    
    for linha in linhas:
        # Separar o texto antes de " - " como IP e o restante como datetime
        partes = linha.split(' - ')
        if len(partes) == 2:
            ip = partes[0]
            datetime = partes[1]
            dados.append({"IP": ip, "Datetime": datetime})
    
    # Criar o DataFrame a partir da lista de dicionários
    df = pd.DataFrame(dados)
    return df

# Texto de exemplo
texto = response

# Chamar a função
df = texto_para_df(texto)

# Exibir o DataFrame
print(df)


                                       IP                 Datetime
0                           111.11.111.11  2000-01-11 01:01:01 UTC
1                           111.11.111.11  2000-01-11 01:01:01 UTC
2                           111.11.111.11  2000-01-11 01:01:01 UTC
3                           111.11.111.11  2000-01-11 01:01:01 UTC
4                            111.11.111.1  2000-01-11 01:01:01 UTC
5                           111.11.111.11  2000-01-11 01:01:01 UTC
6                            111.11.111.1  2000-01-11 01:01:01 UTC
7  1111:11d:1e11:11bc:bc11:1111:11ba:4e11  2000-01-11 01:01:01 UTC
8                            111.11.111.1  2000-01-11 01:01:01 UTC
9                            111.11.111.1  2000-01-11 01:01:01 UTC


In [45]:
# salvar o df em um arquivo CSV
df.to_csv('resultado3.csv', index=False)

In [46]:
df1 = pd.read_csv('resultado1.csv')
df2 = pd.read_csv('resultado2.csv')
df3 = pd.read_csv('resultado3.csv')

In [47]:
df1

Unnamed: 0,IP,Datetime
0,2222:011a:1b11:11cd:e1f1:g111:hh11:0ij1,01/01/11 14:35:45 UTC
1,2222:011a:1b11:11cd:e1f1:g111:hh11:0ij1,01/01/11 14:35:45 UTC
2,222:011a:1b11:11cd:e1f1:g111:hh11:0ij1,01/01/11 14:35:45 UTC
