In [32]:
import pandas as pd
import re
import fitz
from langchain.chat_models import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnableSequence
from dotenv import load_dotenv
import os

In [24]:
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0.0)

In [25]:
prompt = PromptTemplate(
    input_variables=["chunk"],
    template="""
You are an expert in analyzing raw logs and unstructured text. Your task is to extract **all IP addresses (IPv4 and IPv6)** and the **exact timestamp** associated with each IP.

Instructions:
1. From the text below, find all valid IPv4 and IPv6 addresses.
2. For each IP address, find the **full date and time** (timestamp) that appears **closest and most directly associated** with that IP.
3. Return only the IP address and its timestamp in this exact format: `IP_ADDRESS - TIMESTAMP`
4. If multiple timestamps are present near the same IP, choose the most **complete and specific one** (e.g., including date + time + timezone if possible).
5. Each result must be on a **separate line**.
6. Do NOT include any explanation, markdown, bullet points, or extra formatting.

Text:
{chunk}
"""
)

chain = prompt | llm

In [26]:
# %% Função para ler PDF
def read_pdf_text(path: str) -> str:
    with fitz.open(path) as doc:
        return "".join(page.get_text() for page in doc)

In [27]:
# %% Dividir texto em chunks
def split_text(text: str, chunk_size: int = 1000, chunk_overlap: int = 100) -> list[str]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return splitter.split_text(text)

In [28]:
# %% Invocar modelo para cada chunk
def extract_ips_with_time(chunk: str) -> str:
    return chain.invoke({"chunk": chunk})

In [29]:
# %% Extrair pares IP - horário diretamente
def extract_ip_timestamp_pairs_from_pdf(pdf_path: str) -> list[str]:
    text = read_pdf_text(pdf_path)
    chunks = split_text(text)
    extracted_pairs = []

    for chunk in chunks:
        response = extract_ips_with_time(chunk)
        # Filtra apenas linhas válidas com padrão esperado
        lines = response.strip().splitlines()
        for line in lines:
            if " - " in line:
                extracted_pairs.append(line.strip())

    return extracted_pairs

# %% Limpar e separar por versão de IP
import re

In [30]:
def clean_and_split_ips(pairs: list[str]) -> tuple[list[str], list[str], list[str]]:
    ipv4_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}\b'
    ipv6_pattern = r'\b(?:[a-fA-F0-9]{1,4}:){1,7}[a-fA-F0-9]{1,4}\b'

    ipv4_list = []
    ipv6_list = []
    all_list = []

    for pair in pairs:
        ip_part = pair.split(" - ")[0]
        if re.match(ipv4_pattern, ip_part):
            ipv4_list.append(pair)
        elif re.match(ipv6_pattern, ip_part):
            ipv6_list.append(pair)
        all_list.append(pair)

    return sorted(ipv4_list), sorted(ipv6_list), sorted(all_list)

In [None]:
# %% Caminho do PDF
pdf_path = "modelo operadora3.pdf"

# %% Extrair pares IP - horário
ip_timestamp_pairs = extract_ip_timestamp_pairs_from_pdf(pdf_path)

# %% Classificar por tipo de IP
ipv4_pairs, ipv6_pairs, all_pairs = clean_and_split_ips(ip_timestamp_pairs)

# %% Exibir resultados organizados
print("\n--- IPv4 ---")
for item in ipv4_pairs:
    print(item)

print("\n--- IPv6 ---")
for item in ipv6_pairs:
    print(item)

print("\n--- Todos ---")
for item in all_pairs:
    print(item)


AttributeError: 'AIMessage' object has no attribute 'strip'

In [63]:
all_pairs

['1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:06:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:22:00 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:27:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:32:11 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:38:38 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:47:31 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:50:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:55:45 UTC']

In [65]:
all_pairs_1 = all_pairs.copy()
all_pairs_1

['192.168.1.1 - 2022-03-01 15:47:23',
 '192.168.1.1 - 2022-03-01 15:49:56',
 '2222:011a:1b11:11cd:e1f1:g111:hh11:0ij1 - 01/01/11 14:35:45 UTC',
 '2222:011a:1b11:11cd:e1f1:g111:hh11:0ij1 - 01/01/11 14:35:45 UTC',
 '222:011a:1b11:11cd:e1f1:g111:hh11:0ij1 - 01/01/11 14:35:45 UTC',
 '2a02:2b8::1113 - 2022-03-01 16:12:37',
 '2a02:2b8::1113 - 2022-03-01 16:14:45']

In [None]:
all_pairs_2 = all_pairs.copy()
all_pairs_2

['1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:06:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:22:00 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:27:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:32:11 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:38:38 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:47:31 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:50:55 UTC',
 '1101:11:11a:0a111:0:a11:aa11 - 2011-02-11 02:55:45 UTC']

In [56]:
def extract_unique_ip_timestamp_pairs(raw_list: list[str]) -> pd.DataFrame:
    pattern = r"(?P<ip>(?:\d{1,3}\.){3}\d{1,3}|(?:[a-fA-F0-9:]{2,39}))\s*-\s*(?P<timestamp>.+)"
    matches = re.finditer(pattern, "\n".join(raw_list))

    data = set()
    for match in matches:
        ip = match.group("ip").strip()
        timestamp = match.group("timestamp").strip()
        data.add((ip, timestamp))

    df = pd.DataFrame(list(data), columns=["ip", "timestamp"])
    return df

In [60]:
df_ips = extract_unique_ip_timestamp_pairs(all_pairs)
df_ips.drop_duplicates(subset="ip", keep="first", inplace=True)
print(df_ips)

                                       ip  \
0                                    2020   
1                            111.11.111.1   
2                           111.11.111.11   
3  1111:11d:1e11:11bc:bc11:1111:11ba:4e11   

                                           timestamp  
0  01-01 00:00:00 UTC - No responsive records loc...  
1                            2000-01-11 01:01:01 UTC  
2                            2000-01-11 01:01:01 UTC  
3                             2000-01-11 01:01:01UTC  


In [None]:
df_mod_3 = df_ips.copy()

In [47]:
df_mod_3

Unnamed: 0,ip,timestamp
0,1111:11d:1e11:11bc:bc11:1111:11ba:4e11,2000-01-11 01:01:01UTC
1,111.11.111.1,2000-01-11 01:01:01 UTC
2,111.11.111.11,2000-01-11 01:01:01 UTC


In [61]:
df_mod2 = df_ips.copy()


In [62]:
df_mod2

Unnamed: 0,ip,timestamp
0,1101:11:11a:0a111:0:a11:aa11,2011-02-11 02:47:31 UTC


In [58]:
df_mod_1 = df_ips.copy()
df_mod_1

Unnamed: 0,ip,timestamp
0,192.168.1.1,2022-03-01 15:47:23
1,2a02:2b8::1113,2022-03-01 16:12:37
