In [None]:
!pip install fitter

In [None]:
import pandas as pd
import re
from pathlib import Path
import matplotlib.pyplot as plt
from fitter import Fitter
from fitter import get_distributions
import numpy as np
import zipfile
import os

In [None]:
def descomprimir_zip_en_misma_carpeta(ruta_zip):
    """Extract every member of a zip archive next to the archive itself.

    Args:
        ruta_zip: Path of the zip file to extract.

    Returns:
        None. The archive members are written into the directory portion of
        ``ruta_zip`` (an empty directory portion resolves to the current
        working directory).
    """
    # The head returned by os.path.split is the archive's containing folder.
    destino, _nombre_zip = os.path.split(ruta_zip)

    with zipfile.ZipFile(ruta_zip, mode='r') as archivo_zip:
        archivo_zip.extractall(path=destino)

# Build the archive path from its components so it resolves on every OS; a
# backslash-separated raw string is only a nested path on Windows.
ruta_zip = os.path.join("data", "logfiles.zip")
descomprimir_zip_en_misma_carpeta(ruta_zip)

In [None]:
# Location of the extracted log, built from path components so it is valid on
# Windows, Linux and macOS alike (relative to the current working directory).
ruta_log = Path("data") / "logfiles.log"

In [None]:
def parsear_linea(linea):
    """Parse one access-log line into a dictionary of fields.

    The line must start with an IPv4-style address followed by ``- -``, a
    bracketed timestamp, a quoted request (method, URL, protocol), a
    three-digit status, an integer response time, and quoted referrer and
    user-agent strings. Matching is anchored at the start of the line only.

    Args:
        linea: Raw text of a single log line.

    Returns:
        A dict with the keys ``ip``, ``timestamp``, ``request_type``, ``url``,
        ``protocol``, ``status_code`` (int), ``response_time`` (int),
        ``referrer`` and ``user_agent``; ``None`` when the line does not match.
    """
    expresion = (
        r'(?P<ip>\d+\.\d+\.\d+\.\d+)\s+- -\s+\[(?P<timestamp>[^\]]+)\]\s+'
        r'"(?P<method>GET|POST|PUT|DELETE|HEAD|OPTIONS)\s+(?P<url>\S+)\s+(?P<protocol>HTTP/[\d.]+)"\s+'
        r'(?P<status>\d{3})\s+(?P<response_time>\d+)\s+'
        r'"(?P<referrer>[^"]*)"\s+"(?P<user_agent>[^"]*)"'
    )

    coincidencia = re.match(expresion, linea)
    if coincidencia is None:
        return None

    campos = coincidencia.groupdict()
    return {
        "ip": campos["ip"],
        "timestamp": campos["timestamp"],
        # The regex group is called "method"; the output key is "request_type".
        "request_type": campos["method"],
        "url": campos["url"],
        "protocol": campos["protocol"],
        "status_code": int(campos["status"]),
        "response_time": int(campos["response_time"]),
        "referrer": campos["referrer"],
        "user_agent": campos["user_agent"],
    }

In [None]:
# Stream the log file once; lines the parser does not recognise yield None
# and are left out of the record list.
with open(ruta_log, mode='r', encoding='utf-8') as archivo_log:
    lineas_parseadas = (parsear_linea(linea.strip()) for linea in archivo_log)
    registros = [registro for registro in lineas_parseadas if registro]

# One row per successfully parsed request.
df = pd.DataFrame(registros)
print("📄 Primeras filas del DataFrame:")
display(df.head())

In [None]:
def eliminar_log(path_log):
    """Delete the file at ``path_log`` if it exists.

    Args:
        path_log: Path of the file to remove.

    Returns:
        None. When nothing exists at ``path_log`` a warning message is
        printed instead of raising.
    """
    # Guard clause: report the missing file and stop.
    if not os.path.exists(path_log):
        print(f"⚠️ El archivo no existe: {path_log}")
        return

    os.remove(path_log)

# Remove the extracted log now that its rows are held in `df`. The `ruta_log`
# defined earlier in the notebook is reused rather than rebound to a
# Windows-only raw string, so the name keeps a single type and value.
eliminar_log(ruta_log)

In [None]:
# Show the dtype pandas inferred for each column before any conversion.
print(df.dtypes)

In [None]:
# Text-like columns are converted to the pandas "string" dtype in one pass.
columnas_texto = ("ip", "request_type", "url", "protocol", "referrer", "user_agent")
df = df.astype(dict.fromkeys(columnas_texto, "string"))

# Timestamps are parsed with an explicit format: day/month-abbreviation/year,
# time with fractional seconds, then the UTC offset.
df["timestamp"] = pd.to_datetime(df["timestamp"], format="%d/%b/%Y:%H:%M:%S.%f %z")

# Numeric columns: values that cannot be converted are coerced to NaN.
for columna_numerica in ("status_code", "response_time"):
    df[columna_numerica] = pd.to_numeric(df[columna_numerica], errors="coerce")

display(df.head())

In [None]:
# Requests whose method is GET or POST.
df_get_post = df.loc[df['request_type'].isin(['GET', 'POST'])]

In [None]:
# GET requests only.
df_get = df.loc[df['request_type'].eq('GET')]

In [None]:
# POST requests only.
df_post = df.loc[df['request_type'].eq('POST')]

In [None]:
# Order the requests chronologically, then store the gap to the previous
# request in milliseconds as "IA" (the first row has no predecessor -> NaN).
df_get_post = (
    df_get_post
    .sort_values(by="timestamp")
    .assign(IA=lambda ordenado: ordenado["timestamp"].diff().dt.total_seconds() * 1000)
)

In [None]:
ia_vals = df_get_post["IA"].dropna()

# Histogram of the inter-arrival times, drawn through an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(ia_vals, bins=100, color='skyblue', edgecolor='black')
ax.set_title("Histograma de Inter-Arrival Time (IA)")
ax.set_xlabel("Inter-Arrival Time (ms)")
ax.set_ylabel("Frecuencia")
ax.grid(True)
plt.show()

In [None]:
# Histogram of response_time for GET requests.
# The title is plain text: matplotlib's default font has no glyph for the
# bar-chart emoji, which triggers a "Glyph missing" warning and an empty box.
plt.figure(figsize=(10, 6))
plt.hist(df_get["response_time"].dropna(), bins=100, color="skyblue", edgecolor="black")
plt.title("Histograma de response_time - GET")
plt.xlabel("Tiempo de respuesta (ms)")
plt.ylabel("Cantidad de requests")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of response_time for POST requests.
# The title is plain text: matplotlib's default font has no glyph for the
# bar-chart emoji, which triggers a "Glyph missing" warning and an empty box.
plt.figure(figsize=(10, 6))
plt.hist(df_post["response_time"].dropna(), bins=100, color="salmon", edgecolor="black")
plt.title("Histograma de response_time - POST")
plt.xlabel("Tiempo de respuesta (ms)")
plt.ylabel("Cantidad de requests")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Keep only the strictly positive inter-arrival times (NaN removed first).
ia_data = df_get_post["IA"].dropna().loc[lambda serie: serie > 0]

# Fit with Fitter's default settings and keep whatever get_best() reports.
f = Fitter(ia_data)
f.fit()
best_IA = f.get_best()

print("🔍 Mejor distribución encontrada:", best_IA, sep="\n")

In [None]:
# Keep only the strictly positive GET response times (NaN removed first).
response_time_data_get = df_get["response_time"].dropna().loc[lambda serie: serie > 0]

# Fit with Fitter's default settings and keep whatever get_best() reports.
f = Fitter(response_time_data_get)
f.fit()
best_TA_Simples = f.get_best()

print("🔍 Mejor distribución encontrada:", best_TA_Simples, sep="\n")

In [None]:
# Keep only the strictly positive POST response times (NaN removed first).
response_time_data_post = df_post["response_time"].dropna().loc[lambda serie: serie > 0]

# Fit with Fitter's default settings and keep whatever get_best() reports.
f = Fitter(response_time_data_post)
f.fit()
best_TA_Complejas = f.get_best()

print("🔍 Mejor distribución encontrada:", best_TA_Complejas, sep="\n")