### Proyecto de Aplicaciones Basadas en el conocimiento
### Autores: Dilan Moya, Duval Muñoz, Angeli Tello y Mateo Gutierrez
### Sexto Semestre de Ingenieria de Software


In [1]:
#importe de librerias necesarias
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Importe de modelos de sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve)

import joblib
import warnings 
warnings.filterwarnings('ignore')

In [2]:
#Carga de datos del dataset
print("="*60)
df=pd.read_csv("phishing.csv")
df_filtrado=df



In [3]:
#Separacion de caracteristicas y distribuciones de clase
x=df_filtrado.drop(columns=['Index'])
y=df_filtrado['class']

print(f"\nDistribucion de clases:")
print(y.value_counts())
#print(f"Legítimo (-1): {(y == -1).sum()} ({(y == -1).sum()/len(y)*100:.1f}%)")
#print(f"Phishing (1): {(y == 1).sum()} ({(y == 1).sum()/len(y)*100:.1f}%)")


Distribucion de clases:
class
 1    6157
-1    4897
Name: count, dtype: int64


In [4]:
x_train,x_test,y_train,y_test=train_test_split(
    x,y,test_size=0.25,random_state=42,stratify=y
)

print(f"Datos de entrenamiento: {x_train.shape[0]} muestras (75%)")
print(f"Datos de prueba: {x_test.shape[0]} muestras (25%)")

#Guardar datos en otros archivos
df_train = pd.concat([x_train, y_train], axis=1)
df_test = pd.concat([x_test, y_test], axis=1)
df_train.to_csv("phishing_train_75.csv", index=False)
df_test.to_csv("phishing_test_25.csv", index=False)
print("\nDatasets guardados:")
print("phishing_train_75.csv")
print("phishing_test_25.csv")


Datos de entrenamiento: 8290 muestras (75%)
Datos de prueba: 2764 muestras (25%)

Datasets guardados:
phishing_train_75.csv
phishing_test_25.csv


In [5]:
#Debemos normalizar los datos separados
scaler=StandardScaler()
x_train_scaled=scaler.fit_transform(x_train)
x_test_scaled=scaler.transform(x_test)

print("="*60)
print("Modelo Random Forest")
rf_model=RandomForestClassifier(
    n_estimators=200,
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1
)
rf_model.fit(x_train,y_train)

Modelo Random Forest


0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,15
,min_samples_split,10
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [6]:
#Predicciones hechas
y_pred_rf=rf_model.predict(x_test)
y_proba_rf=rf_model.predict_proba(x_test)[:, 1]
#metricas de evaluacion
acc_rf=accuracy_score(y_test,y_pred_rf)
#auc_rf=roc_auc_score(y_test,y_proba_rf)
print(f"Accuracy: {acc_rf:.4f} ({acc_rf*100:.2f}%)")
#print(f"AUC-ROC: {auc_rf:.4f}")
print("\nMatriz de confusion")
cm_rf=confusion_matrix(y_test,y_pred_rf)
print(cm_rf)
print("\nReporte de clasificacion")
print(classification_report(y_test,y_pred_rf,target_names=['Legitimo','Phishing'],labels=[0,1,-1]))

Accuracy: 1.0000 (100.00%)

Matriz de confusion
[[1224    0]
 [   0 1540]]

Reporte de clasificacion
              precision    recall  f1-score   support

    Legitimo       0.00      0.00      0.00         0
    Phishing       1.00      1.00      1.00      1540

    accuracy                           1.00      2764
   macro avg       0.67      0.67      0.67      2764
weighted avg       1.00      1.00      1.00      2764



In [10]:
#Genero el modelo
joblib.dump(rf_model,"modelo_random_forest_proyecto.pkl")
print("Modelo generado: modelo_random_forest.pkl")

Modelo generado: modelo_random_forest.pkl


In [7]:
print("\n"+"="*60)
print("Modelo 2: Red Neuronal (Aprendizaje continuo)")
print("="*60)

# Cargar los datasets
df_train_75 = pd.read_csv("phishing_train_75.csv")
df_test_25 = pd.read_csv("phishing_test_25.csv")

print("\nDatasets cargados desde archivos CSV:")
print(f" phishing_train_75.csv: {df_train_75.shape[0]} muestras")
print(f" phishing_test_25.csv: {df_test_25.shape[0]} muestras")

# Separar características y target
X_train_nn = df_train_75.drop(columns=['class'])
y_train_nn = df_train_75['class']
X_test_nn = df_test_25.drop(columns=['class'])
y_test_nn = df_test_25['class']

# Normalizar
scaler_nn = StandardScaler()
X_train_nn_scaled = scaler_nn.fit_transform(X_train_nn)
X_test_nn_scaled = scaler_nn.transform(X_test_nn)

# Modelo con hiperparámetros optimizados
model_rnn = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=0.00001,
    l1_ratio=0.15,
    learning_rate='adaptive',
    eta0=0.01,
    max_iter=2000,
    tol=1e-4,
    random_state=42,
    warm_start=True,
    early_stopping=False
)

# Entrenar múltiples épocas con el 75%
clases = np.unique(y)
print("\nEntrenamiento inicial con 75% (5 épocas)")
n_epochs = 5
for epoch in range(n_epochs):
    model_rnn.partial_fit(X_train_nn_scaled, y_train_nn, classes=clases)
    y_pred_epoch = model_rnn.predict(X_test_nn_scaled)
    acc_epoch = accuracy_score(y_test_nn, y_pred_epoch)
    print(f"  Época {epoch+1}/{n_epochs}: Accuracy = {acc_epoch:.4f}")

# Evaluar ANTES del aprendizaje continuo
y_pred_rn = model_rnn.predict(X_test_nn_scaled)
acc_nn_init = accuracy_score(y_test_nn, y_pred_rn)
print(f"\n✓ Accuracy inicial (antes de aprendizaje continuo): {acc_nn_init:.4f} ({acc_nn_init*100:.2f}%)")
print("\nMatriz de Confusión Inicial:")
cm_nn_init = confusion_matrix(y_test_nn, y_pred_rn)
print(cm_nn_init)

# Aprendizaje continuo con lotes pequeños
print("\n"+"="*60)
print("Aprendizaje continuo con lotes del 25%")
batch_size = 20  # Lotes más pequeños
n = len(X_test_nn_scaled) // batch_size
accuracies = [acc_nn_init]
lote_numeros = [0]

for i in range(n):
    start = i * batch_size
    end = (i + 1) * batch_size
    lote_x = X_test_nn.iloc[start:end]
    lote_y = y_test_nn.iloc[start:end]
    
    # Aprendizaje continuo
    model_rnn.partial_fit(scaler_nn.transform(lote_x), lote_y)
    
    # Evaluar
    y_pred_rn = model_rnn.predict(X_test_nn_scaled)
    acc = accuracy_score(y_test_nn, y_pred_rn)
    accuracies.append(acc)
    lote_numeros.append(i+1)
    print(f"Lote {i+1}/{n}: Accuracy={acc:.4f}")


Modelo 2: Red Neuronal (Aprendizaje continuo)

Datasets cargados desde archivos CSV:
 phishing_train_75.csv: 8290 muestras
 phishing_test_25.csv: 2764 muestras

Entrenamiento inicial con 75% (5 épocas)
  Época 1/5: Accuracy = 1.0000
  Época 2/5: Accuracy = 1.0000
  Época 3/5: Accuracy = 1.0000
  Época 4/5: Accuracy = 1.0000
  Época 5/5: Accuracy = 1.0000

✓ Accuracy inicial (antes de aprendizaje continuo): 1.0000 (100.00%)

Matriz de Confusión Inicial:
[[1224    0]
 [   0 1540]]

Aprendizaje continuo con lotes del 25%
Lote 1/138: Accuracy=1.0000
Lote 2/138: Accuracy=1.0000
Lote 3/138: Accuracy=1.0000
Lote 4/138: Accuracy=1.0000
Lote 5/138: Accuracy=1.0000
Lote 6/138: Accuracy=1.0000
Lote 7/138: Accuracy=1.0000
Lote 8/138: Accuracy=1.0000
Lote 9/138: Accuracy=1.0000
Lote 10/138: Accuracy=1.0000
Lote 11/138: Accuracy=1.0000
Lote 12/138: Accuracy=1.0000
Lote 13/138: Accuracy=1.0000
Lote 14/138: Accuracy=1.0000
Lote 15/138: Accuracy=1.0000
Lote 16/138: Accuracy=1.0000
Lote 17/138: Accurac

In [8]:
# Evaluación final
y_pred_rn_final = model_rnn.predict(X_test_nn_scaled)
acc_rn_final = accuracy_score(y_test_nn, y_pred_rn_final)
print(f"\nAccuracy final después del aprendizaje: {acc_rn_final:.4f}")
print("\nMatriz de Confusión")
cm = confusion_matrix(y_test_nn, y_pred_rn_final)
print(cm)
print("\nReporte de Clasificación")
print(classification_report(y_test_nn, y_pred_rn_final, 
                          target_names=['Legitimo','Phishing']))


Accuracy final después del aprendizaje: 1.0000

Matriz de Confusión
[[1224    0]
 [   0 1540]]

Reporte de Clasificación
              precision    recall  f1-score   support

    Legitimo       1.00      1.00      1.00      1224
    Phishing       1.00      1.00      1.00      1540

    accuracy                           1.00      2764
   macro avg       1.00      1.00      1.00      2764
weighted avg       1.00      1.00      1.00      2764



In [9]:
joblib.dump(model_rnn,"phishing_rn_model_proyecto.pkl")
joblib.dump(scaler,"phishing_scaler_proyecto.pkl")
print("Modelos nuevos guardados: phishing_rn_model_proyecto.pkl, phishing_scaler_proyecto.pkl")

Modelos nuevos guardados: phishing_rn_model_proyecto.pkl, phishing_scaler_proyecto.pkl
