In [44]:
import fitz  # PyMuPDF
from langchain.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os


In [13]:
# 1. Ler Pdf
def read_pdf_text(path):
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

In [14]:
# 2. Dividir o texto
def split_text(text, chunk_size=1000, chunk_overlap=100):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_text(text)

In [31]:
# 3. LLM
llm = Ollama(model="mistral", temperature=0.0)

In [33]:
# 4. Prompt
prompt = PromptTemplate(
    input_variables=["chunk"],
    # template="Extraia todos os endereços IP do seguinte texto:\n\n{chunk}\n\nSomente liste os IPs."
    # template = "Extract all IP addresses from the following text:\n\n{chunk}\n\nJust list the IPs. Organize them in a list, with one IP per line.\n\n You shouldn't return anything besides this list of IPs."
    # template = """
    #             Extract all IP addresses from the following text:

    #             {chunk}

    #             Instructions:
    #             - Identify and extract all valid IPv4 addresses from the provided text.
    #             - Format the output as a list of IP addresses, with one IP per line.
    #             - Ensure that the output contains only the IP addresses without any additional text or explanations.
    #             - Return only the list (as a python list) of extracted IP addresses.
    #             - Format the output as one IP per line.
    #             - Do not include any additional text or explanations—only the IPs.
    #         """
    template = """
                You must extract all IP addresses from the following text:

                {chunk}

                Rules:
                1. Return ONLY the list of IP addresses.
                2. Each IP must be on a separate line.
                3. Do NOT include any explanation, label, bullet point, or formatting—just raw IP addresses.
                4. Do NOT return anything other than the IPs. No intros, no summaries.

                If you do not follow these rules, your output will be discarded.
    
            """
)

chain = LLMChain(llm=llm, prompt=prompt)

In [36]:
import re

def extract_valid_ips(text):
    return re.findall(r'\b(?:\d{1,3}\.){3}\d{1,3}\b', text)

In [37]:
# 5. Juntar tudo
def extract_ips_from_pdf(pdf_path):
    text = read_pdf_text(pdf_path)
    chunks = split_text(text)
    ips = []
    for chunk in chunks:
        result = chain.run(chunk)
        ips.append(result.strip())
        # Inside the loop:
        for chunk in chunks:
            result = chain.run(chunk)
            ips += extract_valid_ips(result)
    return "\n".join(ips)

In [42]:
caminho_pdf = "modelo operadora3.pdf"
ips_encontrados = extract_ips_from_pdf(caminho_pdf)
print("\nIPs extraídos:\n")
print(ips_encontrados)


IPs extraídos:

100000000000000
   (This is the only IP address found in the provided text)
111.11.111.111
111.11.111.1
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.1
111.11.111.11
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.111
  1111:11d:1e11:11bc:bc11:1111:11ba:4e11
  111.11.111.1
  111.11.111.11
  111.11.111.11
  111.11.111.11
  111.11.111.11
  111.11.111.11
  111.11.111.11
  111.11.111.11
  111.11.111.11
111.11.111.111
111.11.111.1
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.11
111.11.111.1
111.11.111.11
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.1
111.11.111.11
  111.11.111.1
  111.11.111.11
  111.11.111.1
  1111:11d:1e11:11bc:bc11:1111:11ba:4e11
  111.11.111.1
  111.11.111.1
  111.11.111.1
  111.11.111.1
  1

In [43]:
raw_text = ips_encontrados

# Regex for valid IPv4 addresses
ipv4_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'

# Regex for valid IPv6 addresses (simple version)
ipv6_pattern = r'\b(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}\b'

# Find all matches
ipv4_list = re.findall(ipv4_pattern, raw_text)
ipv6_list = re.findall(ipv6_pattern, raw_text)

# Remove duplicates and sort
unique_ipv4 = sorted(set(ipv4_list))
unique_ipv6 = sorted(set(ipv6_list))

# Output
print("IPv4 Addresses:")
print(unique_ipv4)

print("\nIPv6 Addresses:")
print(unique_ipv6)

IPv4 Addresses:
['111.11.111.1', '111.11.111.11', '111.11.111.111']

IPv6 Addresses:
['1111:11d:1e11:11bc:bc11:1111:11ba:4e11']


In [45]:
# Get the filename without extension
base_name = os.path.splitext(caminho_pdf)[0]
# Combina as duas listas e remove duplicatas
combined_ips = list(set(ipv4_list + ipv6_list))

# Gera o nome do arquivo de saída baseado no nome do PDF
base_name = os.path.splitext(caminho_pdf)[0]
txt_filename = f"{base_name}_ips.txt"

# Salva os IPs no arquivo
with open(txt_filename, "w") as f:
    for ip in combined_ips:
        f.write(f"{ip}\n")

print(f"✅ IPs combinados e salvos em: {txt_filename}")

✅ IPs combinados e salvos em: modelo operadora3_ips.txt
