### Libraries

In [10]:
import pandas as pd
import numpy as np
import re
import os
import pickle
import joblib
import requests
import tarfile
import mailparser
import re
import base64
import hashlib
import mailbox
import math
import tempfile # Importato per la gestione sicura dei file temporanei
from collections import Counter
from email import policy
from oletools.olevba import VBA_Parser
from scipy.stats import entropy as scipy_entropy
from tqdm import tqdm
from mailparser import parse_from_file
from concurrent.futures import ProcessPoolExecutor, as_completed
import string

# Dataset 3 features

In [3]:
# Load the 3-feature phishing dataset, then show its shape and first rows.
df = pd.read_csv("3_features_phishing.csv")
print(df.shape)
df.head()

(82486, 4)


Unnamed: 0,subject,body,label,source
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,Assassin
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,Assassin
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,Assassin
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,Assassin
4,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,Assassin


In [None]:
# Class balance of the label column: absolute counts first, then proportions.
print(df['label'].value_counts())
print(df['label'].value_counts(normalize=True))  # relative frequencies (proportions)

label
1    42891
0    39595
Name: count, dtype: int64
label
1    0.519979
0    0.480021
Name: proportion, dtype: float64


In [None]:
print(df.isnull().sum())  # NaN values per column
print((df == '').sum())   # empty-string cells per column

subject    0
body       0
label      0
source     0
dtype: int64
subject    347
body         1
label        0
source       0
dtype: int64


In [None]:
# Replace missing values in the text columns with empty strings.
df['subject'] = df['subject'].fillna('')
df['body'] = df['body'].fillna('')

In [None]:
# Rows per source corpus, then the label counts within each source.
print(df['source'].value_counts())
print(df.groupby('source')['label'].value_counts())

source
CEAS-08           39154
Enron             29767
Assassin           5809
Nigerian_Fraud     3332
Ling               2859
Nazario            1565
Name: count, dtype: int64
source          label
Assassin        0         4091
                1         1718
CEAS-08         1        21842
                0        17312
Enron           0        15791
                1        13976
Ling            0         2401
                1          458
Nazario         1         1565
Nigerian_Fraud  1         3332
Name: count, dtype: int64


In [None]:
'''
# Approccio A – Split casuale, ma stratificato per label:

from sklearn.model_selection import train_test_split

train_df, test_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
# Buono per modelli generici, ma può portare a overfitting su CEAS-08 se non fai attenzione.



#  Approccio B – Leave-one-dataset-out cross-validation
# Esempio: Alleni su tutto tranne Enron, testi su Enron.

for source_name in df['source'].unique():
    train = df[df['source'] != source_name]
    test = df[df['source'] == source_name]
    # Allena e valuta il modello qui
'''

### Enhance

In [45]:
# Reload the 3-feature dataset from disk and replace missing subject/body
# values with empty strings.
df = pd.read_csv("3_features_phishing.csv")
df['subject'] = df['subject'].fillna('')
df['body'] = df['body'].fillna('')

In [None]:
def shannon_entropy(text):  # currently unused
    """Return the Shannon entropy (base 2) of the characters in ``text``.

    Falsy input (``""`` or ``None``) yields ``0.0``.
    """
    if text:
        n_chars = len(text)
        # Relative frequency of every distinct character.
        probabilities = [occurrences / n_chars for occurrences in Counter(text).values()]
        return -sum(p * math.log2(p) for p in probabilities)
    return 0.0

def extract_entropy_details(text):
    """Compute character-level statistics for a piece of text.

    Args:
        text: The string to analyse. Anything that is not a non-empty ``str``
            (``""``, ``None``, or the float ``NaN`` pandas uses for missing
            cells) is treated as empty.

    Returns:
        A 5-tuple ``(entropy, entropy_per_char, percent_non_ascii,
        percent_digits, percent_punct)``:

        * ``entropy`` -- base-2 Shannon entropy of the character distribution.
        * ``entropy_per_char`` -- ``entropy`` divided by the text length.
        * ``percent_non_ascii`` -- fraction of characters with code point > 127.
        * ``percent_digits`` -- fraction of characters for which
          ``str.isdigit()`` is true.
        * ``percent_punct`` -- fraction of characters in ``string.punctuation``.

        Despite the ``percent_`` names, the last three are fractions in
        ``[0, 1]``, not values scaled to 100. Empty input yields all zeros.
    """
    # NaN is truthy, so a plain ``not text`` check would let it through and
    # ``len()`` would raise; guard on the type as well.
    if not isinstance(text, str) or not text:
        return 0.0, 0.0, 0.0, 0.0, 0.0

    total_len = len(text)
    freq = Counter(text)
    entropy = -sum((count / total_len) * math.log2(count / total_len) for count in freq.values())

    # Classify each distinct character once and weight it by its count,
    # instead of rescanning the whole text for every character class.
    punctuation = frozenset(string.punctuation)
    non_ascii_count = 0
    digit_count = 0
    punct_count = 0
    for char, count in freq.items():
        if ord(char) > 127:
            non_ascii_count += count
        if char.isdigit():
            digit_count += count
        if char in punctuation:
            punct_count += count

    # total_len is always > 0 here because empty input returned early.
    entropy_per_char = entropy / total_len
    percent_non_ascii = non_ascii_count / total_len
    percent_digits = digit_count / total_len
    percent_punct = punct_count / total_len

    return entropy, entropy_per_char, percent_non_ascii, percent_digits, percent_punct

def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hand-crafted text features.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Frame with at least the text columns ``subject`` and ``body``.
            Missing values in those two columns are replaced with ``''``.

    Returns:
        A new frame holding the original columns plus ``subject_len``,
        ``body_len``, ``subject_density``, ``body_density``, ``num_links``,
        ``num_special_chars``, ``num_exclamations``, ``has_bank_word``,
        ``body_entropy``, ``body_entropy_per_char``, ``percent_digits``,
        ``percent_punct`` and ``text`` (subject and body joined by a space).
    """
    # Copy first so the caller's frame does not silently gain the new columns
    # (including the intermediate ones that are dropped again below).
    df = df.copy()

    # NaN in the text columns would break the per-row entropy computation and
    # the int cast of the boolean flags, so normalise them to ''.
    df['subject'] = df['subject'].fillna('')
    df['body'] = df['body'].fillna('')

    subject = df['subject']
    body = df['body']

    # Length and word count are highly correlated with each other.
    df['subject_len'] = subject.str.len()
    df['num_words_subject'] = subject.str.split().str.len()

    # Same here: these two are strongly correlated as well.
    df['body_len'] = body.str.len()
    df['num_words_body'] = body.str.split().str.len()

    # Characters per word; the +1 avoids a division by zero for empty text.
    df['subject_density'] = df['subject_len'] / (df['num_words_subject'] + 1)
    df['body_density'] = df['body_len'] / (df['num_words_body'] + 1)

    df['num_links'] = body.str.count(r'http[s]?://')
    df['num_special_chars'] = body.str.count(r'[\$\@\!\#\%]')
    df['num_exclamations'] = body.str.count(r'!')

    df['has_ip_link'] = body.str.contains(r'http[s]?://\d{1,3}(?:\.\d{1,3}){3}').astype(int)
    df['has_bank_word'] = body.str.contains(r'\b(?:bank|account|verify|login|password)\b', flags=re.IGNORECASE).astype(int)

    # Entropy and derived metrics: one 5-tuple per row, unpacked column by column.
    entropy_details = body.apply(extract_entropy_details)
    entropy_columns = (
        ('body_entropy', 0, 4),
        ('body_entropy_per_char', 1, 6),
        ('percent_non_ascii', 2, 4),
        ('percent_digits', 3, 4),
        ('percent_punct', 4, 4),
    )
    for column, position, ndigits in entropy_columns:
        # Bind the loop variables as defaults so each lambda keeps its own values.
        df[column] = entropy_details.apply(
            lambda stats, i=position, nd=ndigits: round(stats[i], nd)
        )

    df['text'] = df['subject'] + ' ' + df['body']

    # Drop the features excluded from the final set. They are still computed
    # above so that re-enabling one only requires editing this list.
    df = df.drop(columns=['num_words_body', 'num_words_subject', 'has_ip_link', 'percent_non_ascii'])

    return df

In [47]:
# Compute the engineered features for the 3-feature dataset.
df2 = extract_features(df)

In [48]:
# Inspect the column names of the feature frame.
df2.columns

Index(['subject', 'body', 'label', 'source', 'subject_len', 'body_len',
       'subject_density', 'body_density', 'num_links', 'num_special_chars',
       'num_exclamations', 'has_bank_word', 'body_entropy',
       'body_entropy_per_char', 'percent_digits', 'percent_punct', 'text'],
      dtype='object')

In [49]:
# Write the feature frame to CSV; index=False keeps the row index out of the file.
df2.to_csv("3_features_phishing_enhanced.csv", index=False)

# Second Dataset

In [3]:
# Load the 7-feature phishing dataset, then show its shape and first rows.
df = pd.read_csv("7_features_phishing.csv")
print(df.shape)
df.head()

(49860, 8)


Unnamed: 0,subject,body,label,source,sender,receiver,date,urls
0,Re: New Sequences Window,"Date: Wed, 21 Aug 2002 10:54:46 -0500 ...",0,Assassin,Robert Elz <kre@munnari.OZ.AU>,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,"Thu, 22 Aug 2002 18:26:25 +0700",1
1,[zzzzteana] RE: Alexander,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,Assassin,Steve Burt <Steve_Burt@cursor-system.com>,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...","Thu, 22 Aug 2002 12:46:18 +0100",1
2,[zzzzteana] Moscow bomber,Man Threatens Explosion In Moscow \n\nThursday...,0,Assassin,"""Tim Chapman"" <timc@2ubh.com>",zzzzteana <zzzzteana@yahoogroups.com>,"Thu, 22 Aug 2002 13:52:38 +0100",1
3,[IRR] Klez: The Virus That Won't Die,Klez: The Virus That Won't Die\n \nAlready the...,0,Assassin,Monty Solomon <monty@roscom.com>,undisclosed-recipient: ;,"Thu, 22 Aug 2002 09:15:25 -0400",1
4,Re: [zzzzteana] Nothing like mama used to make,"> in adding cream to spaghetti carbonara, whi...",0,Assassin,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,zzzzteana@yahoogroups.com,"Thu, 22 Aug 2002 14:38:22 +0100",1


In [4]:
# Class balance of the label column: absolute counts first, then proportions.
print(df['label'].value_counts())
print(df['label'].value_counts(normalize=True))  # relative frequencies (proportions)


label
1    28457
0    21403
Name: count, dtype: int64
label
1    0.570738
0    0.429262
Name: proportion, dtype: float64


In [7]:
print(df.isnull().sum())  # NaN values per column
print((df == '').sum())   # empty-string cells per column

subject     0
body        0
label       0
source      0
sender      0
receiver    0
date        0
urls        0
dtype: int64


subject       87
body           1
label          0
source         0
sender       331
receiver    2092
date         483
urls           0
dtype: int64


In [6]:
# Replace missing values in the text and header columns with empty strings.
df['subject'] = df['subject'].fillna('')
df['body'] = df['body'].fillna('')
df['sender'] = df['sender'].fillna('')
df['receiver'] = df['receiver'].fillna('')
df['date'] = df['date'].fillna('')

In [11]:
# Rows per source corpus, then the label counts within each source.
print(df['source'].value_counts())
print(df.groupby('source')['label'].value_counts())

source
CEAS-08           39154
Assassin           5809
Nigerian_Fraud     3332
Nazario            1565
Name: count, dtype: int64
source          label
Assassin        0         4091
                1         1718
CEAS-08         1        21842
                0        17312
Nazario         1         1565
Nigerian_Fraud  1         3332
Name: count, dtype: int64


In [None]:
def shannon_entropy(text):
    """Return the Shannon entropy (base 2) of the characters in ``text``.

    Falsy input (``""`` or ``None``) yields ``0.0``.
    """
    if text:
        n_chars = len(text)
        # Relative frequency of every distinct character.
        probabilities = [occurrences / n_chars for occurrences in Counter(text).values()]
        return -sum(p * math.log2(p) for p in probabilities)
    return 0.0

def extract_entropy_details(text):
    """Compute character-level statistics for a piece of text.

    Args:
        text: The string to analyse. Anything that is not a non-empty ``str``
            (``""``, ``None``, or the float ``NaN`` pandas uses for missing
            cells) is treated as empty.

    Returns:
        A 5-tuple ``(entropy, entropy_per_char, percent_non_ascii,
        percent_digits, percent_punct)``:

        * ``entropy`` -- base-2 Shannon entropy of the character distribution.
        * ``entropy_per_char`` -- ``entropy`` divided by the text length.
        * ``percent_non_ascii`` -- fraction of characters with code point > 127.
        * ``percent_digits`` -- fraction of characters for which
          ``str.isdigit()`` is true.
        * ``percent_punct`` -- fraction of characters in ``string.punctuation``.

        Despite the ``percent_`` names, the last three are fractions in
        ``[0, 1]``, not values scaled to 100. Empty input yields all zeros.
    """
    # NaN is truthy, so a plain ``not text`` check would let it through and
    # ``len()`` would raise; guard on the type as well.
    if not isinstance(text, str) or not text:
        return 0.0, 0.0, 0.0, 0.0, 0.0

    total_len = len(text)
    freq = Counter(text)
    entropy = -sum((count / total_len) * math.log2(count / total_len) for count in freq.values())

    # Classify each distinct character once and weight it by its count,
    # instead of rescanning the whole text for every character class.
    punctuation = frozenset(string.punctuation)
    non_ascii_count = 0
    digit_count = 0
    punct_count = 0
    for char, count in freq.items():
        if ord(char) > 127:
            non_ascii_count += count
        if char.isdigit():
            digit_count += count
        if char in punctuation:
            punct_count += count

    # total_len is always > 0 here because empty input returned early.
    entropy_per_char = entropy / total_len
    percent_non_ascii = non_ascii_count / total_len
    percent_digits = digit_count / total_len
    percent_punct = punct_count / total_len

    return entropy, entropy_per_char, percent_non_ascii, percent_digits, percent_punct

def extract_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with hand-crafted text features.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Frame with at least the text columns ``subject`` and ``body``.
            Missing values in those two columns are replaced with ``''``.

    Returns:
        A new frame holding the original columns plus ``subject_len``,
        ``body_len``, ``subject_density``, ``body_density``, ``num_links``,
        ``num_special_chars``, ``num_exclamations``, ``has_bank_word``,
        ``body_entropy``, ``body_entropy_per_char``, ``percent_digits``,
        ``percent_punct`` and ``text`` (subject and body joined by a space).
    """
    # Copy first so the caller's frame does not silently gain the new columns
    # (including the intermediate ones that are dropped again below).
    df = df.copy()

    # NaN in the text columns would break the per-row entropy computation and
    # the int cast of the boolean flags, so normalise them to ''.
    df['subject'] = df['subject'].fillna('')
    df['body'] = df['body'].fillna('')

    subject = df['subject']
    body = df['body']

    # Length and word count are highly correlated with each other.
    df['subject_len'] = subject.str.len()
    df['num_words_subject'] = subject.str.split().str.len()

    # Same here: these two are strongly correlated as well.
    df['body_len'] = body.str.len()
    df['num_words_body'] = body.str.split().str.len()

    # Characters per word; the +1 avoids a division by zero for empty text.
    df['subject_density'] = df['subject_len'] / (df['num_words_subject'] + 1)
    df['body_density'] = df['body_len'] / (df['num_words_body'] + 1)

    df['num_links'] = body.str.count(r'http[s]?://')
    df['num_special_chars'] = body.str.count(r'[\$\@\!\#\%]')
    df['num_exclamations'] = body.str.count(r'!')

    df['has_ip_link'] = body.str.contains(r'http[s]?://\d{1,3}(?:\.\d{1,3}){3}').astype(int)
    df['has_bank_word'] = body.str.contains(r'\b(?:bank|account|verify|login|password)\b', flags=re.IGNORECASE).astype(int)

    # Entropy and derived metrics: one 5-tuple per row, unpacked column by column.
    entropy_details = body.apply(extract_entropy_details)
    entropy_columns = (
        ('body_entropy', 0, 4),
        ('body_entropy_per_char', 1, 6),
        ('percent_non_ascii', 2, 4),
        ('percent_digits', 3, 4),
        ('percent_punct', 4, 4),
    )
    for column, position, ndigits in entropy_columns:
        # Bind the loop variables as defaults so each lambda keeps its own values.
        df[column] = entropy_details.apply(
            lambda stats, i=position, nd=ndigits: round(stats[i], nd)
        )

    df['text'] = df['subject'] + ' ' + df['body']

    # Drop the features excluded from the final set. They are still computed
    # above so that re-enabling one only requires editing this list.
    df = df.drop(columns=['num_words_body', 'num_words_subject', 'has_ip_link', 'percent_non_ascii'])

    return df