In [30]:
#Bibliotecas necesarias :

#!pip install pyspark kafka-python pandas matplotlib

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, window, desc, lag, avg, stddev, min, max, sum, approx_count_distinct
from pyspark.sql.window import Window
from kafka import KafkaProducer
from kafka import KafkaConsumer

import os
import csv
import random
import datetime
import json
import time

In [40]:

# Configura tu Kafka
KAFKA_TOPIC = 'logs'
KAFKA_BROKER = 'localhost:9093'  # Cambia esto si usas Docker o red externa

# Inicializar el productor
producer = KafkaProducer(
    bootstrap_servers=KAFKA_BROKER,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Tipos de eventos
event_types = [
    ('INFO', 0.6),
    ('WARNING', 0.15),
    ('ERROR', 0.1),
    ('DEBUG', 0.1),
    ('LOGIN_FAILURE', 0.05)
]

users = ['admin', 'user1', 'user2', 'guest', 'root', 'test']
ips = [
    '192.168.1.10', '192.168.1.15', '10.0.0.1', '172.16.0.2',
    '203.0.113.45', '8.8.8.8', '185.199.108.153'
]

def choose_event_type():
    return random.choices([e[0] for e in event_types], weights=[e[1] for e in event_types])[0]

def generate_log():
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    event_type = choose_event_type()
    user = random.choice(users)
    ip = random.choice(ips)

    if event_type == 'LOGIN_FAILURE':
        message = f"Intento de acceso fallido para usuario {user} desde {ip}"
    elif event_type == 'ERROR':
        message = f"Error critico en el sistema detectado por el usuario {user}"
    elif event_type == 'WARNING':
        message = f"Posible anomalia detectada desde la IP {ip}"
    elif event_type == 'DEBUG':
        message = f"Modo depuración activo por usuario {user}"
    else:
        message = f"Evento normal registrado para usuario {user} desde {ip}"

    return {
        'timestamp': timestamp,
        'event_type': event_type,
        'user': user,
        'ip': ip,
        'message': message
    }

# Bucle infinito para enviar logs
try:
    print(f"Enviando logs a Kafka topic '{KAFKA_TOPIC}'...")
    while True:
        log = generate_log()
        producer.send(KAFKA_TOPIC, value=log)
        #print("Enviado:", log)
        time.sleep(1)  # Esperar 1 segundo entre logs (ajustable)
except KeyboardInterrupt:
    print("Interrumpido por el usuario")
finally:
    producer.close()


Enviando logs a Kafka topic 'logs'...
Interrumpido por el usuario


In [37]:
# La siguiente celda es para configurar la sesión de Spark y verificar que todo esté listo:

from pyspark.sql import SparkSession

# Crea una sesión de Spark
spark = SparkSession.builder \
    .appName("SIEM with Spark in Jupyter") \
    .getOrCreate()

# Verifica que Spark está funcionando correctamente
print(spark.version)


3.5.5


In [41]:

def read_logs_from_csv(file_path="logs/kafka_logs.csv"):
    df = spark.read.option("header", "true").csv(file_path)
    df.show(5)
    return df

df = read_logs_from_csv("logs/kafka_logs.csv")


+-------------------+-------------+-----+------------+--------------------+
|          timestamp|   event_type| user|          ip|             message|
+-------------------+-------------+-----+------------+--------------------+
|2025-04-06 17:08:40|        ERROR|guest|     8.8.8.8|Error critico en ...|
|2025-04-06 17:08:42|         INFO|user1|  172.16.0.2|Evento normal reg...|
|2025-04-06 17:08:43|LOGIN_FAILURE|admin|192.168.1.10|Intento de acceso...|
+-------------------+-------------+-----+------------+--------------------+
only showing top 5 rows



In [42]:

# Conteo de eventos por tipo
def count_events(df):
    return df.groupBy("event_type").count().orderBy(desc("count"))

count_events(df).show()


+-------------+-----+
|   event_type|count|
+-------------+-----+
|         INFO|   10|
|        ERROR|    1|
|LOGIN_FAILURE|    1|
+-------------+-----+



In [43]:

# Conteo de eventos por usuario
def count_users(df):
    return df.groupBy("user").count().orderBy(desc("count"))

count_users(df).show()


+-----+-----+
| user|count|
+-----+-----+
|guest|    4|
|admin|    3|
| root|    2|
|user2|    2|
| test|    2|
|user1|    1|
+-----+-----+



In [44]:

# Detectar usuarios con alta cantidad de errores o fallos de login
def detect_anomalous_users(df):
    return df.filter((col("event_type") == "ERROR") | (col("event_type") == "LOGIN_FAILURE")) \
             .groupBy("user").count().orderBy(desc("count"))

detect_anomalous_users(df).show()


+-----+-----+
| user|count|
+-----+-----+
|admin|    1|
|guest|    1|
+-----+-----+



In [45]:

# Análisis de eventos en el tiempo
def analyze_time_distribution(df):
    df = df.withColumn("timestamp", col("timestamp").cast("timestamp"))
    return df.groupBy(window(col("timestamp"), "1 hour"), "event_type").count()

analyze_time_distribution(df).show()


+--------------------+-------------+-----+
|              window|   event_type|count|
+--------------------+-------------+-----+
|{2025-04-06 17:00...|        ERROR|    1|
|{2025-04-06 17:00...|         INFO|   10|
|{2025-04-06 17:00...|LOGIN_FAILURE|    1|
+--------------------+-------------+-----+



In [46]:

# Detección de patrones de actividad sospechosa
def detect_suspicious_patterns(df):
    window_spec = Window.partitionBy("user").orderBy("timestamp")
    return df.withColumn("prev_event", lag("event_type").over(window_spec))

detect_suspicious_patterns(df).show()


+-------------------+-------------+-----+---------------+--------------------+-------------+
|          timestamp|   event_type| user|             ip|             message|   prev_event|
+-------------------+-------------+-----+---------------+--------------------+-------------+
|2025-04-06 17:08:43|LOGIN_FAILURE|admin|   192.168.1.10|Intento de acceso...|         NULL|
|2025-04-06 17:08:48|         INFO|admin|        8.8.8.8|Evento normal reg...|LOGIN_FAILURE|
|2025-04-06 17:08:50|         INFO|admin|185.199.108.153|Evento normal reg...|         INFO|
|2025-04-06 17:08:40|        ERROR|guest|        8.8.8.8|Error critico en ...|         NULL|
|2025-04-06 17:08:44|         INFO|guest|       10.0.0.1|Evento normal reg...|        ERROR|
|2025-04-06 17:08:45|         INFO|guest|     172.16.0.2|Evento normal reg...|         INFO|
|2025-04-06 17:08:51|         INFO|guest|     172.16.0.2|Evento normal reg...|         INFO|
|2025-04-06 17:08:49|         INFO| test|   203.0.113.45|Evento normal

In [47]:

# Estadísticas por usuario
def user_statistics(df):
    user_events = df.groupBy("user").agg(count("event_type").alias("total_events"))
    return user_events.agg(
        avg("total_events").alias("avg_events"),
        stddev("total_events").alias("stddev_events"),
        min("total_events").alias("min_events"),
        max("total_events").alias("max_events")
    )

user_stats = user_statistics(df)
user_stats.show()


+------------------+------------------+----------+----------+
|        avg_events|     stddev_events|min_events|max_events|
+------------------+------------------+----------+----------+
|2.3333333333333335|1.0327955589886444|         1|         4|
+------------------+------------------+----------+----------+



In [48]:

# Correlación entre intentos de login fallidos y errores
def correlate_failed_logins_errors(df):
    failed_logins = df.filter(col("event_type") == "LOGIN_FAILURE").alias("fl")
    df_alias = df.alias("df")
    
    return failed_logins.join(
        df_alias, 
        (col("fl.user") == col("df.user")) & (col("fl.timestamp") < col("df.timestamp")),
        "inner"
    ).filter(col("df.event_type") == "ERROR").select(
        col("fl.timestamp").alias("failed_login_time"),
        col("df.timestamp").alias("error_time"),
        col("fl.user"),
        col("df.event_type")
    )

correlate_failed_logins_errors(df).show()


+-----------------+----------+----+----------+
|failed_login_time|error_time|user|event_type|
+-----------------+----------+----+----------+
+-----------------+----------+----+----------+



In [49]:

# Detectar direcciones IP con múltiples intentos fallidos
def detect_dangerous_ips(df):
    return df.filter(col("event_type") == "LOGIN_FAILURE").groupBy("ip").count().orderBy(desc("count"))

detect_dangerous_ips(df).show()


+------------+-----+
|          ip|count|
+------------+-----+
|192.168.1.10|    1|
+------------+-----+



In [13]:

# Identificar usuarios con actividad inusual
def detect_suspicious_users(user_stats):
    thresh = user_stats.select(avg("total_events")).collect()[0][0] * 2
    return user_stats.filter(col("total_events") > thresh)

detect_suspicious_users(user_stats).show()


AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `total_events` cannot be resolved. Did you mean one of the following? [`max_events`, `avg_events`, `min_events`, `stddev_events`].;
'Aggregate [unresolvedalias(avg('total_events), Some(org.apache.spark.sql.Column$$Lambda/0x000001e8cae486b8@1c9083c0))]
+- Aggregate [avg(total_events#196L) AS avg_events#200, stddev(cast(total_events#196L as double)) AS stddev_events#201, min(total_events#196L) AS min_events#203L, max(total_events#196L) AS max_events#205L]
   +- Aggregate [user#19], [user#19, count(event_type#18) AS total_events#196L]
      +- Relation [timestamp#17,event_type#18,user#19,ip#20,message#21] csv


In [50]:

# Número estimado de usuarios únicos
def count_unique_users(df):
    return df.select(approx_count_distinct("user")).collect()[0][0]

print(f"Número estimado de usuarios únicos: {count_unique_users(df)}")


Número estimado de usuarios únicos: 6


In [51]:

# Análisis de actividad por IP
def analyze_ip_activity(df):
    return df.groupBy("ip").agg(count("event_type").alias("total_events")).orderBy(desc("total_events"))

analyze_ip_activity(df).show()


+---------------+------------+
|             ip|total_events|
+---------------+------------+
|        8.8.8.8|           5|
|     172.16.0.2|           4|
|   192.168.1.10|           1|
|   192.168.1.15|           1|
|       10.0.0.1|           1|
|185.199.108.153|           1|
|   203.0.113.45|           1|
+---------------+------------+



In [52]:

# Detección de ráfagas de intentos de acceso fallidos
def detect_potential_attacks(df):
    time_window = Window.partitionBy("user").orderBy("timestamp").rowsBetween(-5, 0)
    df = df.withColumn("recent_events", count("event_type").over(time_window))
    return df.filter((col("event_type") == "LOGIN_FAILURE") & (col("recent_events") > 3))

detect_potential_attacks(df).show()

+---------+----------+----+---+-------+-------------+
|timestamp|event_type|user| ip|message|recent_events|
+---------+----------+----+---+-------+-------------+
+---------+----------+----+---+-------+-------------+

