<a href="https://colab.research.google.com/github/Ignacioelamo/LLMs4Phishing/blob/main/SpamAssesin_Original.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spamcheck pandas tqdm scikit-learn

Collecting spamcheck
  Downloading spamcheck-1.0.4-py3-none-any.whl.metadata (1.3 kB)
Downloading spamcheck-1.0.4-py3-none-any.whl (2.8 kB)
Installing collected packages: spamcheck
Successfully installed spamcheck-1.0.4


In [None]:
# Local filename under which the downloaded dataset CSV is stored.
NOMBRE_ARCHIVO = 'emails.csv'

# Download 01_combined_cleaned_email_data.csv from the project's GitHub repository.
# IPython expands $NOMBRE_ARCHIVO to the Python variable defined above, so wget
# writes the file to that path (-O overwrites any existing copy).
!wget https://raw.githubusercontent.com/Ignacioelamo/LLMs4Phishing/main/data/01_combined_cleaned_email_data.csv -O $NOMBRE_ARCHIVO

--2025-05-28 11:40:50--  https://raw.githubusercontent.com/Ignacioelamo/LLMs4Phishing/main/data/01_combined_cleaned_email_data.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10064045 (9.6M) [text/plain]
Saving to: ‘emails.csv’


2025-05-28 11:40:51 (109 MB/s) - ‘emails.csv’ saved [10064045/10064045]



In [None]:
import pandas as pd
import spamcheck
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import concurrent.futures
from sklearn.model_selection import train_test_split
import os
import multiprocessing

In [None]:
# Load the downloaded CSV into a DataFrame. Later cells read its 'subject', 'body'
# and 'label' columns, so the file is expected to provide them.
df_original = pd.read_csv(NOMBRE_ARCHIVO)

In [None]:
# Split parameters, named so they can be tuned in one place.
TEST_SIZE = 0.1      # fraction of rows held out as the test set
RANDOM_STATE = 42    # fixed seed so the split is reproducible

# Stratify on 'label' when that column exists so the split preserves the class
# proportions; otherwise fall back to a plain random split.
stratify_labels = df_original['label'] if 'label' in df_original.columns else None

train_val_df, test_df = train_test_split(
    df_original,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=stratify_labels,
)

print(f"Tamaño del DataFrame original: {len(df_original)}")
print(f"Tamaño del conjunto de test (test_df): {len(test_df)} ({len(test_df)/len(df_original)*100:.2f}%)")
print(f"Tamaño del conjunto de entrenamiento y validación (train_val_df): {len(train_val_df)} ({len(train_val_df)/len(df_original)*100:.2f}%)")

# Only the test split is sent to the spamcheck API.
# .copy() yields an independent frame, so adding columns to it later does not
# trigger pandas' SettingWithCopyWarning.
df_to_process = test_df.copy()

Tamaño del DataFrame original: 10000
Tamaño del conjunto de test (test_df): 1000 (10.00%)
Tamaño del conjunto de entrenamiento y validación (train_val_df): 9000 (90.00%)


# --- 2. Preparar listas para los resultados ---

In [None]:
# Empty, independent accumulators for the per-email spamcheck results:
# predicted label, numeric score and report.
predicted_labels, spamcheck_scores, spamcheck_reports = [], [], []

# --- 3. Procesar cada correo en el DataFrame con spamcheck ---


In [None]:
# Score at or above which an email is labelled as spam (1).
# NOTE(review): 5 looks like SpamAssassin's conventional default threshold — confirm.
DEFAULT_SPAM_THRESHOLD = 5.0


def _text_or_empty(value):
    """Return ``value`` as text, mapping missing values (None or float NaN) to ''."""
    # `value != value` is true only for NaN, which is how pandas marks empty CSV cells.
    if value is None or (isinstance(value, float) and value != value):
        return ""
    return str(value)


def process_email(row, threshold=DEFAULT_SPAM_THRESHOLD):
    """Score a single email with spamcheck and derive a binary label.

    Args:
        row: Mapping with 'subject' and 'body' entries (e.g. one record of
            ``DataFrame.to_dict('records')``). Missing values (None / NaN) are
            sent as empty text instead of the literal strings "None" / "nan".
        threshold: Minimum score for the email to be labelled 1. Defaults to
            ``DEFAULT_SPAM_THRESHOLD``.

    Returns:
        A ``(predicted_label, score, report)`` tuple. ``predicted_label`` is 1
        when ``score >= threshold`` and 0 otherwise. If the spamcheck call (or
        reading its result) fails, returns ``(0, 0.0, {"error": <message>})`` so
        callers can tell failures apart from genuine low scores.

    Raises:
        KeyError: If ``row`` has no 'subject' or 'body' entry.
    """
    subject = _text_or_empty(row['subject'])
    body = _text_or_empty(row['body'])
    email_content = f"Subject: {subject}\n\n{body}"

    try:
        result = spamcheck.check(email_content, report=True)
        score = result['score']
        report = result['report']

        try:
            score = float(score)
        except ValueError:
            # Non-numeric score text: treated as 0 (best effort). No message is
            # printed, to avoid flooding the console when run in parallel.
            score = 0.0

        predicted_label = 1 if score >= threshold else 0

        return predicted_label, score, report

    except Exception as e:
        # Deliberate best effort: a failed request must not abort the whole
        # batch. The error text is kept in the report slot for later inspection.
        return 0, 0.0, {"error": str(e)}

In [None]:
print("Procesando correos con spamcheck en paralelo...")
# Number of worker threads for the spamcheck calls. Keep it moderate: every
# thread consumes resources and the spamcheck servers may enforce limits.
# NOTE(review): the original guidance suggested 5-10 threads for network
# requests, yet the value used is 16 — confirm which is intended.
NUM_THREADS = 16

# One plain dict per email; process_email reads its 'subject' and 'body' entries.
email_records = df_to_process.to_dict('records')

with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor:
    # executor.map yields results in input order, so results[i] belongs to row i.
    # tqdm wraps the iterator to show progress as tasks complete.
    results = list(tqdm(executor.map(process_email, email_records),
                        total=len(email_records),
                        desc="Analizando correos en paralelo"))

# Rebuild the result lists from scratch instead of appending to lists created
# in another cell: re-running this cell then cannot double their length and
# break the later column assignment.
predicted_labels = [label for label, _, _ in results]
spamcheck_scores = [score for _, score, _ in results]
spamcheck_reports = [report for _, _, report in results]

# process_email reports a failure as label 0 with an {"error": ...} report.
# Surface how many there were, since they would otherwise count silently as
# "not spam" predictions in the metrics.
failed_count = sum(
    1 for report in spamcheck_reports
    if isinstance(report, dict) and "error" in report
)
if failed_count:
    print(f"Advertencia: {failed_count} de {len(results)} correos fallaron en spamcheck y se etiquetaron como 0.")

Procesando correos con spamcheck en paralelo...


Analizando correos en paralelo:   0%|          | 0/1000 [00:00<?, ?it/s]

# --- 4. Añadir resultados al DataFrame ---


In [None]:
# Attach the spamcheck outputs to the test frame as new columns, in this order.
# Each list is assigned positionally, so it must have one entry per row.
for column_name, column_values in (
    ('predicted_label', predicted_labels),
    ('spamcheck_score', spamcheck_scores),
    ('spamcheck_report', spamcheck_reports),
):
    df_to_process[column_name] = column_values

# --- 5. Calcular métricas de rendimiento ---


In [None]:
# Ground-truth and predicted labels as plain lists, in the same row order.
true_labels = df_to_process['label'].tolist()
predicted_labels = df_to_process['predicted_label'].tolist()

print("\n--- Métricas de Rendimiento ---")

# Confusion matrix. Passing labels=[0, 1] forces a 2x2 matrix even when only
# one class occurs in both lists; without it the matrix collapses to 1x1 and
# the 4-way unpacking below raises ValueError.
tn, fp, fn, tp = confusion_matrix(true_labels, predicted_labels, labels=[0, 1]).ravel()
print(f"Verdaderos Positivos (TP): {tp}")
print(f"Verdaderos Negativos (TN): {tn}")
print(f"Falsos Positivos (FP): {fp}")
print(f"Falsos Negativos (FN): {fn}")

# Accuracy: share of emails whose predicted label matches the true label.
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy:.4f}")

# Precision: TP / (TP + FP). zero_division=0 reports 0 instead of warning
# when nothing was predicted positive.
precision = precision_score(true_labels, predicted_labels, zero_division=0)
print(f"Precision: {precision:.4f}")

# Recall: TP / (TP + FN). zero_division=0 reports 0 when there are no true positives
# or false negatives to divide by.
recall = recall_score(true_labels, predicted_labels, zero_division=0)
print(f"Recall: {recall:.4f}")

# F1 score: harmonic mean of precision and recall.
f1 = f1_score(true_labels, predicted_labels, zero_division=0)
print(f"F1 Score: {f1:.4f}")


--- Métricas de Rendimiento ---
Verdaderos Positivos (TP): 126
Verdaderos Negativos (TN): 498
Falsos Positivos (FP): 2
Falsos Negativos (FN): 374
Accuracy: 0.6240
Precision: 0.9844
Recall: 0.2520
F1 Score: 0.4013
