# Preprocess data for deep learning

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import set_random_seed
from tensorflow.keras.regularizers import L2
from sklearn.utils import class_weight
from keras.layers import Bidirectional
from keras.callbacks import EarlyStopping

In [2]:
df = pd.read_csv("../dataset/3_features_phishing_enhanced.csv")
print(df.shape)
df = df.drop(columns=['num_links', 'num_special_chars', 'has_bank_word'])
df_filtered = df[df['source'] != 'Nazario'] # provo a togliere Nazario
df = df_filtered
df.head()

(82486, 17)


Unnamed: 0,subject,body,label,source,subject_len,body_len,subject_density,body_density,num_exclamations,body_entropy,body_entropy_per_char,percent_digits,percent_punct,text
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,Assassin,24,1538,4.8,6.835556,0,4.9731,0.003233,0.067,0.1268,"Re: New Sequences Window Date: Wed, 21 ..."
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,Assassin,25,894,6.25,7.982143,2,4.6876,0.005243,0.0134,0.2069,[zzzzteana] RE: Alexander Martin A posted:\nTa...
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,Assassin,25,1746,6.25,6.901186,2,4.785,0.002741,0.0074,0.1042,[zzzzteana] Moscow bomber Man Threatens Explos...
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,Assassin,37,1125,4.625,6.818182,0,4.7567,0.004228,0.024,0.0818,[IRR] Klez: The Virus That Won't Die Klez: Th...
4,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,Assassin,46,1047,5.111111,7.270833,2,4.7307,0.004518,0.0038,0.1691,Re: [zzzzteana] Nothing like mama used to make...


In [None]:
set_random_seed(42)     # per riproducibilità

# Settings
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits'
]

results_lstm = {}

for source_name in df['source'].unique():
    print(f"\n🔄 KERAS LOSO: Validating on source: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Tokenization
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train[text_col])
    
    X_train_text = pad_sequences(tokenizer.texts_to_sequences(train[text_col]), maxlen=MAX_SEQUENCE_LENGTH)
    X_test_text = pad_sequences(tokenizer.texts_to_sequences(test[text_col]), maxlen=MAX_SEQUENCE_LENGTH)

    # Numeric preprocessing
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train[num_cols])
    X_test_num = scaler.transform(test[num_cols])
    
    y_train = train['label'].values
    y_test = test['label'].values

    # Model
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH,), name='text_input')
    x_text = Embedding(MAX_NUM_WORDS, 128)(input_text)
    x_text = LSTM(64)(x_text)

    input_num = Input(shape=(X_train_num.shape[1],), name='num_input')
    x_num = Dense(32, activation='relu')(input_num)

    x = Concatenate()([x_text, x_num])
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.3)(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text, input_num], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(
        [X_train_text, X_train_num], y_train,
        epochs=5, batch_size=32,
        verbose=0, validation_split=0.1
    )

    y_pred = (model.predict([X_test_text, X_test_num]) > 0.5).astype(int).flatten()
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results_lstm[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_lstm = pd.DataFrame(results_lstm).T
print("\n📊 KERAS LSTM LOSO Results:\n", df_results_lstm.round(3))



🔄 KERAS LOSO: Validating on source: Assassin
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 48ms/step

🔄 KERAS LOSO: Validating on source: CEAS-08
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 47ms/step

🔄 KERAS LOSO: Validating on source: Nigerian_Fraud
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step

🔄 KERAS LOSO: Validating on source: Nazario
[1m49/49[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 48ms/step

🔄 KERAS LOSO: Validating on source: Enron
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 50ms/step

🔄 KERAS LOSO: Validating on source: Ling
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step

📊 KERAS LSTM LOSO Results:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.786      0.587   0.933     0.721
CEAS-08         39154.0     0.818      0.844   0.825     0.835
Nigerian_Fraud   3332.0     0.954      1.000   0.954 

### Migliorato

In [3]:
set_random_seed(42)     # per riproducibilità

# Settings
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits'
]

results_lstm = {}

for source_name in df['source'].unique():
    print(f"\n🔄 KERAS LOSO: Validating on source: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Tokenization
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train[text_col])
    
    X_train_text = pad_sequences(tokenizer.texts_to_sequences(train[text_col]), maxlen=MAX_SEQUENCE_LENGTH)
    X_test_text = pad_sequences(tokenizer.texts_to_sequences(test[text_col]), maxlen=MAX_SEQUENCE_LENGTH)

    # Numeric preprocessing
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train[num_cols])
    X_test_num = scaler.transform(test[num_cols])
    
    y_train = train['label'].values
    y_test = test['label'].values

    # Model
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH,), name='text_input')
    x_text = Embedding(MAX_NUM_WORDS, 128)(input_text)
    # x_text = LSTM(64)(x_text)
    x_text = Bidirectional(LSTM(64))(x_text)

    input_num = Input(shape=(X_train_num.shape[1],), name='num_input')
    x_num = Dense(32, activation='relu')(input_num)

    x = Concatenate()([x_text, x_num])
    x = Dense(64, activation='relu', kernel_regularizer=L2(0.001))(x)
    x = Dropout(0.5)(x)
    output = Dense(1, activation='sigmoid')(x)

    class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(class_weights))

    model = Model(inputs=[input_text, input_num], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        [X_train_text, X_train_num], y_train,
        epochs=5, batch_size=32, verbose=0, validation_split=0.1,
        class_weight=class_weights, callbacks=[early_stop]
    )

    y_pred = (model.predict([X_test_text, X_test_num]) > 0.5).astype(int).flatten()
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results_lstm[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_lstm = pd.DataFrame(results_lstm).T
print("\n📊 KERAS LSTM LOSO Results:\n", df_results_lstm.round(3))



🔄 KERAS LOSO: Validating on source: Assassin
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 96ms/step

🔄 KERAS LOSO: Validating on source: CEAS-08
[1m1224/1224[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 59ms/step

🔄 KERAS LOSO: Validating on source: Nigerian_Fraud
[1m105/105[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 58ms/step

🔄 KERAS LOSO: Validating on source: Enron
[1m931/931[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 60ms/step

🔄 KERAS LOSO: Validating on source: Ling
[1m90/90[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 56ms/step

📊 KERAS LSTM LOSO Results:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.880      0.767   0.855     0.808
CEAS-08         39154.0     0.834      0.852   0.850     0.851
Nigerian_Fraud   3332.0     0.917      1.000   0.917     0.957
Enron           29767.0     0.783      0.765   0.775     0.770
Ling             2859.0     0.876      0.568 

### LSTM x2

In [4]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
import numpy as np
import pandas as pd

set_random_seed(42)

# Settings
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300
EMBED_DIM = 128  # puoi passare a 100 se usi GloVe 100d
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits'
]

results_lstm = {}

for source_name in df['source'].unique():
    print(f"\n🔄 KERAS LOSO: Validating on source: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Tokenization
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train[text_col])
    
    X_train_text = pad_sequences(tokenizer.texts_to_sequences(train[text_col]), maxlen=MAX_SEQUENCE_LENGTH)
    X_test_text = pad_sequences(tokenizer.texts_to_sequences(test[text_col]), maxlen=MAX_SEQUENCE_LENGTH)

    # Numeric preprocessing
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train[num_cols])
    X_test_num = scaler.transform(test[num_cols])
    
    y_train = train['label'].values
    y_test = test['label'].values

    # Class balancing
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y_train),
                                                      y=y_train)
    class_weights = dict(enumerate(class_weights))

    # TEXT INPUT
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH,), name='text_input')
    
    # Optional: use pre-trained GloVe embeddings here
    x_text = Embedding(MAX_NUM_WORDS, EMBED_DIM)(input_text)
    x_text = Bidirectional(LSTM(64))(x_text)  # Bidirectional for better context

    # NUMERIC INPUT
    input_num = Input(shape=(X_train_num.shape[1],), name='num_input')
    x_num = Dense(32, activation='relu', kernel_regularizer=l2(0.001))(input_num)

    # MERGE
    x = Concatenate()([x_text, x_num])
    x = Dense(64, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = Dropout(0.5)(x)  # more dropout for regularization
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_text, input_num], outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Early stopping
    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        [X_train_text, X_train_num], y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.1,
        verbose=0,
        class_weight=class_weights,
        callbacks=[early_stop]
    )

    y_pred = (model.predict([X_test_text, X_test_num], verbose=0) > 0.5).astype(int).flatten()
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results_lstm[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_lstm = pd.DataFrame(results_lstm).T
print("\n📊 KERAS LSTM LOSO Results:\n", df_results_lstm.round(3))



🔄 KERAS LOSO: Validating on source: Assassin

🔄 KERAS LOSO: Validating on source: CEAS-08

🔄 KERAS LOSO: Validating on source: Nigerian_Fraud

🔄 KERAS LOSO: Validating on source: Enron

🔄 KERAS LOSO: Validating on source: Ling

📊 KERAS LSTM LOSO Results:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.891      0.777   0.885     0.828
CEAS-08         39154.0     0.836      0.913   0.782     0.842
Nigerian_Fraud   3332.0     0.871      1.000   0.871     0.931
Enron           29767.0     0.794      0.765   0.809     0.786
Ling             2859.0     0.883      0.583   0.954     0.724


### LSTM senza feature numeriche

In [5]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
import numpy as np
import pandas as pd

set_random_seed(42)

# Settings
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 300
EMBED_DIM = 128
text_col = 'text'

results_lstm_text_only = {}

for source_name in df['source'].unique():
    print(f"\n🔄 KERAS LOSO (Text-only): Validating on source: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Tokenization
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
    tokenizer.fit_on_texts(train[text_col])
    
    X_train_text = pad_sequences(tokenizer.texts_to_sequences(train[text_col]), maxlen=MAX_SEQUENCE_LENGTH)
    X_test_text = pad_sequences(tokenizer.texts_to_sequences(test[text_col]), maxlen=MAX_SEQUENCE_LENGTH)

    y_train = train['label'].values
    y_test = test['label'].values

    # Class balancing
    class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                      classes=np.unique(y_train),
                                                      y=y_train)
    class_weights = dict(enumerate(class_weights))

    # TEXT MODEL
    input_text = Input(shape=(MAX_SEQUENCE_LENGTH,), name='text_input')
    x = Embedding(MAX_NUM_WORDS, EMBED_DIM)(input_text)
    x = Bidirectional(LSTM(64))(x)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    output = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=input_text, outputs=output)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

    model.fit(
        X_train_text, y_train,
        epochs=20,
        batch_size=32,
        validation_split=0.1,
        verbose=0,
        class_weight=class_weights,
        callbacks=[early_stop]
    )

    y_pred = (model.predict(X_test_text, verbose=0) > 0.5).astype(int).flatten()
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results_lstm_text_only[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_lstm_text_only = pd.DataFrame(results_lstm_text_only).T
print("\n📊 KERAS LSTM (Text-only) LOSO Results:\n", df_results_lstm_text_only.round(3))



🔄 KERAS LOSO (Text-only): Validating on source: Assassin

🔄 KERAS LOSO (Text-only): Validating on source: CEAS-08

🔄 KERAS LOSO (Text-only): Validating on source: Nigerian_Fraud

🔄 KERAS LOSO (Text-only): Validating on source: Enron

🔄 KERAS LOSO (Text-only): Validating on source: Ling

📊 KERAS LSTM (Text-only) LOSO Results:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.747      0.543   0.913     0.681
CEAS-08         39154.0     0.802      0.835   0.805     0.819
Nigerian_Fraud   3332.0     0.941      1.000   0.941     0.970
Enron           29767.0     0.797      0.789   0.776     0.782
Ling             2859.0     0.902      0.633   0.924     0.751


# Bert

In [None]:
import torch
import os
import numpy as np
from tqdm import tqdm # For a nice progress bar
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [12]:
device = torch.device('cpu')
bert = BertModel.from_pretrained('bert-base-uncased').to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert.eval()

def encode_bert_batched(texts, batch_size=16): # Adjust batch_size as needed
    all_embeddings = []
    # Convert texts to a list if not already to enable easy slicing
    texts_list = list(texts)
    for i in tqdm(range(0, len(texts_list), batch_size), desc="Encoding BERT"):
        batch_texts = texts_list[i:i + batch_size]
        tokens = tokenizer(
            batch_texts, padding=True, truncation=True, max_length=256,
            return_tensors='pt'
        )
        tokens = {k: v.to(device) for k, v in tokens.items()}
        with torch.no_grad():
            outputs = bert(**tokens)
        # Take the [CLS] token embedding for sentence representation
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)
    return np.vstack(all_embeddings)

results_bert = {}

for source_name in df['source'].unique():
    print(f"\n🔄 BERT LOSO: Validating on source: {source_name}")

    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Encode BERT
    X_train_bert = encode_bert_batched(train[text_col])
    X_test_bert = encode_bert_batched(test[text_col])
    
    # Numeric
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train[num_cols])
    X_test_num = scaler.transform(test[num_cols])

    # Combine features
    X_train = np.hstack([X_train_bert, X_train_num])
    X_test = np.hstack([X_test_bert, X_test_num])

    y_train = train['label'].values
    y_test = test['label'].values

    # Classifier
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results_bert[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_bert = pd.DataFrame(results_bert).T
print("\n📊 BERT + Numeric LOSO Results:\n", df_results_bert.round(3))


🔄 BERT LOSO: Validating on source: Assassin


Encoding BERT:  19%|█▊        | 894/4793 [1:16:45<5:34:46,  5.15s/it]


KeyboardInterrupt: 

In [None]:

# 1. Save BERT Model and Tokenizer
# Hugging Face models have a convenient .save_pretrained() method
# This saves the model weights and configuration.
# The tokenizer saves its vocabulary and configuration.
model_dir = "./saved_phishing_model"
os.makedirs(model_dir, exist_ok=True) # Create directory if it doesn't exist

bert.save_pretrained(model_dir)
tokenizer.save_pretrained(model_dir)
print(f"BERT model and tokenizer saved to: {model_dir}")

# 2. Save StandardScaler
# Use joblib for scikit-learn objects (scalers, classifiers)
scaler_path = os.path.join(model_dir, "scaler.joblib")
joblib.dump(scaler, scaler_path)
print(f"Scaler saved to: {scaler_path}")

# 3. Save Logistic Regression Classifier
clf_path = os.path.join(model_dir, "logistic_regression_clf.joblib")
joblib.dump(clf, clf_path)
print(f"Logistic Regression classifier saved to: {clf_path}")

# 4. Save feature column names (important for consistent processing)
feature_config_path = os.path.join(model_dir, "feature_config.json")
import json
feature_config = {
    "text_column": text_col,
    "numeric_columns": num_cols
}
with open(feature_config_path, 'w') as f:
    json.dump(feature_config, f)
print(f"Feature configuration saved to: {feature_config_path}")

print("All necessary components saved successfully!")

# BERT senza feature numeriche
- prima opzione con solo il testo
- fine-tune bert-base-uncased con BertForSequenceClassification

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
import torch
import pandas as pd
import numpy as np

In [None]:
# Parameters
text_col = "text"
label_col = "label"
num_labels = 2

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

class EmailDataset(Dataset):
    def __init__(self, texts, labels):
        self.encodings = tokenizer(
            texts.tolist(), truncation=True, padding=True, max_length=256, return_tensors="pt"
        )
        self.labels = torch.tensor(labels.tolist())

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


In [None]:
results_finetune = {}

for source_name in df["source"].unique():
    print(f"\n🔁 Fine-tuning BERT — Validating on source: {source_name}")

    train = df[df["source"] != source_name]
    test = df[df["source"] == source_name]

    train_dataset = EmailDataset(train[text_col], train[label_col])
    test_dataset = EmailDataset(test[text_col], test[label_col])

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels
    )

    training_args = TrainingArguments(
        output_dir="./results",
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=16,
        evaluation_strategy="epoch",
        logging_dir="./logs",
        logging_steps=10,
        save_strategy="no",
        report_to="none",
        load_best_model_at_end=False,
        disable_tqdm=True
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
    )

    trainer.train()

    # Prediction
    preds_output = trainer.predict(test_dataset)
    preds = np.argmax(preds_output.predictions, axis=1)
    y_true = test[label_col].values

    report = classification_report(y_true, preds, output_dict=True, zero_division=0)
    results_finetune[source_name] = {
        'support': len(y_true),
        'accuracy': (preds == y_true).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_finetune = pd.DataFrame(results_finetune).T
print("\n📊 Fine-Tuned BERT LOSO Results:\n", df_results_finetune.round(3))


# BERT con feature numeriche
- Usare BERT (bert-base-uncased) per estrarre l'embedding dal testo (subject + body)
- concatenare le feature numeriche
- passare il vettore combinato a un MLP (Multilayer Perceptron) per la calssficiazione binaria
- allenare tutto end-to-end, incluso il fine tuning di BERT

In [None]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertModel
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report


In [None]:
class EmailDatasetWithFeatures(Dataset):
    def __init__(self, texts, num_feats, labels, tokenizer, max_len=256):
        self.encodings = tokenizer(
            texts.tolist(), truncation=True, padding=True, max_length=max_len, return_tensors="pt"
        )
        self.num_feats = torch.tensor(num_feats, dtype=torch.float32)
        self.labels = torch.tensor(labels.tolist(), dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {k: v[idx] for k, v in self.encodings.items()}
        item["num_feats"] = self.num_feats[idx]
        item["labels"] = self.labels[idx]
        return item


In [None]:
class BertWithNumeric(nn.Module):
    def __init__(self, num_numeric_features, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(dropout)

        # Combiner: CLS embedding (768) + numeric
        self.classifier = nn.Sequential(
            nn.Linear(768 + num_numeric_features, 128),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(128, 1),
            nn.Sigmoid()
        )

    def forward(self, input_ids, attention_mask, num_feats):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]  # [CLS]
        combined = torch.cat((cls_output, num_feats), dim=1)
        return self.classifier(combined).squeeze()


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
device = torch.device("cpu")

text_col = "text"
label_col = "label"
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits'
]

results_finetune = {}

for source_name in df["source"].unique():
    print(f"\n🚀 Fine-tuning BERT+Numeric — Validating on: {source_name}")

    train = df[df["source"] != source_name]
    test = df[df["source"] == source_name]

    # Scale numeric features
    scaler = StandardScaler()
    X_train_num = scaler.fit_transform(train[num_cols])
    X_test_num = scaler.transform(test[num_cols])

    y_train = train[label_col].values
    y_test = test[label_col].values

    # Datasets & Loaders
    train_dataset = EmailDatasetWithFeatures(train[text_col], X_train_num, y_train, tokenizer)
    test_dataset = EmailDatasetWithFeatures(test[text_col], X_test_num, y_test, tokenizer)

    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Model
    model = BertWithNumeric(num_numeric_features=X_train_num.shape[1]).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
    loss_fn = nn.BCELoss()

    # Training loop
    model.train()
    for epoch in range(3):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            num_feats = batch["num_feats"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask, num_feats)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

    # Evaluation
    model.eval()
    all_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            num_feats = batch["num_feats"].to(device)

            outputs = model(input_ids, attention_mask, num_feats)
            preds = (outputs > 0.5).long().cpu().numpy()
            all_preds.extend(preds)

    report = classification_report(y_test, all_preds, output_dict=True, zero_division=0)

    results_finetune[source_name] = {
        'support': len(y_test),
        'accuracy': np.mean(np.array(all_preds) == y_test),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results_finetune = pd.DataFrame(results_finetune).T
print("\n📊 Fine-tuned BERT + Numeric LOSO Results:\n", df_results_finetune.round(3))
