### Libraries

In [1]:
import pandas as pd
import joblib
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier  # o altri
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier

# Training

In [11]:
df = pd.read_csv("../dataset/3_features_phishing_enhanced.csv")
print(df.shape)
df = df.drop(columns=['num_links', 'num_special_chars', 'has_bank_word'])
df_filtered = df[df['source'] != 'Nazario'] # provo a togliere Nazario
df = df_filtered
df.head()

(82486, 17)


Unnamed: 0,subject,body,label,source,subject_len,body_len,subject_density,body_density,num_exclamations,body_entropy,body_entropy_per_char,percent_digits,percent_punct,text
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,Assassin,24,1538,4.8,6.835556,0,4.9731,0.003233,0.067,0.1268,"Re: New Sequences Window Date: Wed, 21 ..."
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,Assassin,25,894,6.25,7.982143,2,4.6876,0.005243,0.0134,0.2069,[zzzzteana] RE: Alexander Martin A posted:\nTa...
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,Assassin,25,1746,6.25,6.901186,2,4.785,0.002741,0.0074,0.1042,[zzzzteana] Moscow bomber Man Threatens Explos...
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,Assassin,37,1125,4.625,6.818182,0,4.7567,0.004228,0.024,0.0818,[IRR] Klez: The Virus That Won't Die Klez: Th...
4,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,Assassin,46,1047,5.111111,7.270833,2,4.7307,0.004518,0.0038,0.1691,Re: [zzzzteana] Nothing like mama used to make...


In [12]:
print(set(df['source'].values))

{'Ling', 'Enron', 'CEAS-08', 'Nigerian_Fraud', 'Assassin'}


## Random Forest

In [None]:
df.columns

In [None]:
# text_col = ['text']  # body, subject, text (subject + body)
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density','num_links', 'num_special_chars',
    'num_exclamations', 'body_entropy', 'body_entropy_per_char', 'percent_digits','has_bank_word', 'percent_punct'
]

text_col = 'text'

# Split
X = df[[text_col]+ num_cols]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Preprocessing: text + numeriche
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), text_col),
    ('num', StandardScaler(), num_cols)
])

# Modello di partenza (cambia qui con altri modelli)
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Addestramento
pipeline.fit(X_train, y_train)

# Valutazione
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# ottengo valori alti, il modello generalizza troppo forse.


              precision    recall  f1-score   support

           0       0.98      0.98      0.98      7919
           1       0.98      0.98      0.98      8579

    accuracy                           0.98     16498
   macro avg       0.98      0.98      0.98     16498
weighted avg       0.98      0.98      0.98     16498

[[7789  130]
 [ 134 8445]]


### Leave-one-out Cross Validation

In [None]:
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_links', 'num_special_chars', 'num_exclamations',
    'body_entropy', 'body_entropy_per_char', 'percent_digits',
    'has_bank_word', 'percent_punct'
]

# Output complessivo
results = {}

# Loop Leave-One-Source-Out
for source_name in df['source'].unique():
    print(f"\n🔍 Valutazione su sorgente esclusa: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    # Features e label
    X_train = train[[text_col] + num_cols]
    y_train = train['label']
    X_test = test[[text_col] + num_cols]
    y_test = test['label']

    # Pipeline
    preprocessor = ColumnTransformer([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), text_col),
        ('num', StandardScaler(), num_cols)
    ])

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', RandomForestClassifier(random_state=42))
    ])

    # Train
    pipeline.fit(X_train, y_train)

    # Test
    y_pred = pipeline.predict(X_test)
    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
    
    results[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

# Output riepilogativo
df_results = pd.DataFrame(results).T
print("\n📊 Risultati Leave-One-Source-Out:\n")
print(df_results.round(3))



🔍 Valutazione su sorgente esclusa: Assassin

🔍 Valutazione su sorgente esclusa: CEAS-08

🔍 Valutazione su sorgente esclusa: Nigerian_Fraud

🔍 Valutazione su sorgente esclusa: Nazario

🔍 Valutazione su sorgente esclusa: Enron

🔍 Valutazione su sorgente esclusa: Ling

📊 Risultati Leave-One-Source-Out:

                support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.878      0.801   0.782     0.792
CEAS-08         39154.0     0.851      0.886   0.841     0.863
Nigerian_Fraud   3332.0     0.957      1.000   0.957     0.978
Nazario          1565.0     0.668      1.000   0.668     0.801
Enron           29767.0     0.781      0.744   0.816     0.778
Ling             2859.0     0.942      0.798   0.856     0.826


In [30]:
# Salvataggio su file
import pickle 

with open("dati.pkl", "wb") as file:  # "wb" significa scrittura in modalità binaria
    pickle.dump(df_results, file)

## XGBoost

In [13]:
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits',
    #'has_bank_word', 'num_links',  'num_special_chars'
]

# Preprocessing da riutilizzare
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 3),  min_df=5, max_df=0.8), text_col),
    ('num', StandardScaler(), num_cols)
])

results = {}

# Leave-One-Source-Out Evaluation
for source_name in df['source'].unique():
    print(f"\n🔍 Validazione su: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    X_train = train[[text_col] + num_cols]
    y_train = train['label']
    X_test = test[[text_col] + num_cols]
    y_test = test['label']
    neg, pos = np.bincount(y_train)
    scale = neg / pos
    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', XGBClassifier(eval_metric='logloss', scale_pos_weight=scale))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results = pd.DataFrame(results).T
print("\n📊 Risultati LOSO:\n", df_results.round(3))


🔍 Validazione su: Assassin

🔍 Validazione su: CEAS-08

🔍 Validazione su: Nigerian_Fraud

🔍 Validazione su: Enron

🔍 Validazione su: Ling

📊 Risultati LOSO:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.865      0.799   0.724     0.760
CEAS-08         39154.0     0.851      0.884   0.843     0.863
Nigerian_Fraud   3332.0     0.964      1.000   0.964     0.982
Enron           29767.0     0.819      0.777   0.862     0.817
Ling             2859.0     0.917      0.671   0.943     0.784


In [None]:
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_exclamations', 'percent_punct',
    'body_entropy', 'body_entropy_per_char', 'percent_digits',
    #'has_bank_word', 'num_links',  'num_special_chars'
]

text_col = 'text'

# Preprocessing da riutilizzare
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=10000, ngram_range=(1, 3),  min_df=5, max_df=0.8), text_col),
    ('num', StandardScaler(), num_cols)
])

# Split
X = df[[text_col]+ num_cols]
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


# Modello di partenza (cambia qui con altri modelli)
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

# Addestramento
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# ottengo valori alti, il modello generalizza troppo forse.

results = {}

results['all'] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
}

df_results = pd.DataFrame(results).T
print("\n📊 Risultati:\n", df_results.round(3))

In [None]:
# Final training on full dataset
X_final = df[[text_col] + num_cols]
y_final = df['label']

final_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

final_pipeline.fit(X_final, y_final)

joblib.dump(final_pipeline, 'phishing_model.pkl')


## ET

In [7]:
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_links', 'num_special_chars', 'num_exclamations',
    'body_entropy', 'body_entropy_per_char', 'percent_digits',
    'has_bank_word', 'percent_punct'
]

# Preprocessing da riutilizzare
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5, max_df=0.8), text_col),
    ('num', StandardScaler(), num_cols)
])

results = {}

# Leave-One-Source-Out Evaluation
for source_name in df['source'].unique():
    print(f"\n🔍 Validazione su: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    X_train = train[[text_col] + num_cols]
    y_train = train['label']
    X_test = test[[text_col] + num_cols]
    y_test = test['label']

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', ExtraTreesClassifier(n_estimators=100, max_depth=10, random_state=42))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results = pd.DataFrame(results).T
print("\n📊 Risultati LOSO:\n", df_results.round(3))


🔍 Validazione su: Assassin

🔍 Validazione su: CEAS-08

🔍 Validazione su: Nigerian_Fraud

🔍 Validazione su: Nazario

🔍 Validazione su: Enron

🔍 Validazione su: Ling

📊 Risultati LOSO:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.764      0.562   0.919     0.697
CEAS-08         39154.0     0.646      0.837   0.454     0.589
Nigerian_Fraud   3332.0     0.522      1.000   0.522     0.686
Nazario          1565.0     0.788      1.000   0.788     0.881
Enron           29767.0     0.609      0.553   0.877     0.678
Ling             2859.0     0.765      0.395   0.878     0.544


## ADB - Ada Boost

In [6]:
text_col = 'text'
num_cols = [
    'subject_len', 'body_len', 'subject_density', 'body_density',
    'num_links', 'num_special_chars', 'num_exclamations',
    'body_entropy', 'body_entropy_per_char', 'percent_digits',
    'has_bank_word', 'percent_punct'
]

# Preprocessing da riutilizzare
preprocessor = ColumnTransformer([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2), min_df=5, max_df=0.8), text_col),
    ('num', StandardScaler(), num_cols)
])

results = {}

# Leave-One-Source-Out Evaluation
for source_name in df['source'].unique():
    print(f"\n🔍 Validazione su: {source_name}")
    
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]

    X_train = train[[text_col] + num_cols]
    y_train = train['label']
    X_test = test[[text_col] + num_cols]
    y_test = test['label']

    pipeline = Pipeline([
        ('preprocess', preprocessor),
        ('clf', AdaBoostClassifier(n_estimators=100, learning_rate=0.5))
    ])

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)

    results[source_name] = {
        'support': len(y_test),
        'accuracy': (y_pred == y_test).mean(),
        'precision': report['1']['precision'],
        'recall': report['1']['recall'],
        'f1-score': report['1']['f1-score']
    }

df_results = pd.DataFrame(results).T
print("\n📊 Risultati LOSO:\n", df_results.round(3))


🔍 Validazione su: Assassin

🔍 Validazione su: CEAS-08

🔍 Validazione su: Nigerian_Fraud

🔍 Validazione su: Nazario

🔍 Validazione su: Enron

🔍 Validazione su: Ling

📊 Risultati LOSO:
                 support  accuracy  precision  recall  f1-score
Assassin         5809.0     0.777      0.594   0.776     0.673
CEAS-08         39154.0     0.718      0.821   0.631     0.714
Nigerian_Fraud   3332.0     0.779      1.000   0.779     0.876
Nazario          1565.0     0.781      1.000   0.781     0.877
Enron           29767.0     0.671      0.599   0.903     0.720
Ling             2859.0     0.750      0.375   0.836     0.518
