In [1]:

# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import os
import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display, Markdown
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import clone
import numpy as np
from scipy import sparse


In [2]:
# Step 1 â€” Data intake & Basic EDA (tailored approach)

# Set style for visualizations
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# Load the dataset
print("Loading the spam dataset...")
df = pd.read_csv('spam_dataset.csv')
print(f"Dataset loaded successfully with {df.shape[0]} rows and {df.shape[1]} columns.")

# 1) Snapshot & Data Dictionary
print("\n=== DATASET SNAPSHOT ===")
print(df.info())
print(df.head())


print("\n=== DATA DICTIONARY ===")
# Display data types and null counts
data_dict = pd.DataFrame({
    'Type': df.dtypes,
    'Non-Null Count': df.count(),
    'Null Count': df.isnull().sum(),
    'Null %': (df.isnull().sum() / len(df) * 100).round(2),
    'Unique Values': [df[col].nunique() for col in df.columns]
})
print(data_dict)

# Check for 'Unnamed: 0' column (index to drop)
if 'Unnamed: 0' in df.columns:
    print("\nDetected technical index column 'Unnamed: 0' - will be dropped")
    df = df.drop('Unnamed: 0', axis=1)
    print("Column dropped successfully")

# Verify label values
print("\n=== LABEL VERIFICATION ===")
print("'label' column values:")
print(df['label'].value_counts())

# Verify label_num values and consistency with label
print("\n'label_num' column values:")
print(df['label_num'].value_counts())

# Check consistency between label and label_num
if 'label' in df.columns and 'label_num' in df.columns:
    # Create a crosstab to verify alignment
    label_consistency = pd.crosstab(df['label'], df['label_num'], 
                                    rownames=['label'], 
                                    colnames=['label_num'])
    print("\nConsistency check (label vs label_num):")
    print(label_consistency)
    
    # Verify if spam == 1
    spam_is_one = (df[df['label'] == 'spam']['label_num'] == 1).all()
    print(f"Spam corresponds to label_num=1: {spam_is_one}")
    
    if not spam_is_one:
        print("WARNING: Inconsistency between 'label' and 'label_num'!")

# Check for empty or very short texts
print("\n=== TEXT CONTENT VERIFICATION ===")
# Check text length statistics
df['text_length'] = df['text'].str.len()
print("Text length statistics:")
print(df['text_length'].describe())

# Identify very short texts (potential empty content)
short_threshold = 10  # Define threshold for "very short" texts
short_texts = df[df['text_length'] < short_threshold]
print(f"\nFound {len(short_texts)} very short texts (less than {short_threshold} characters):")
if not short_texts.empty:
    print(short_texts[['label', 'text']])

# Check for whitespace-only texts
whitespace_texts = df[df['text'].str.strip() == '']
print(f"\nFound {len(whitespace_texts)} whitespace-only texts:")
if not whitespace_texts.empty:
    print(whitespace_texts[['label', 'text']])

# Verify if texts contain "Subject:" pattern
has_subject = df['text'].str.contains('Subject:', case=False, regex=True)
print(f"\nTexts containing 'Subject:': {has_subject.sum()} out of {len(df)} ({has_subject.mean()*100:.2f}%)")

# Display a few examples of texts with and without "Subject:"
if has_subject.any():
    print("\nExample with 'Subject:':")
    print(df[has_subject].iloc[0]['text'][:200] + "...")
    
if (~has_subject).any():
    print("\nExample without 'Subject:':")
    print(df[~has_subject].iloc[0]['text'][:200] + "...")

print("\n=== PRELIMINARY FINDINGS ===")
print(f"- Total records: {len(df)}")
print(f"- Missing values: {'None' if df.isnull().sum().sum() == 0 else df.isnull().sum().sum()}")
print(f"- Very short texts: {len(short_texts)}")
print(f"- Whitespace-only texts: {len(whitespace_texts)}")
print(f"- Class distribution: {df['label'].value_counts().to_dict()}")
print(f"- Spam corresponds to label_num=1: {spam_is_one if 'label' in df.columns and 'label_num' in df.columns else 'N/A'}")

Loading the spam dataset...
Dataset loaded successfully with 5171 rows and 4 columns.

=== DATASET SNAPSHOT ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB
None
   Unnamed: 0 label                                               text  \
0         605   ham  Subject: enron methanol ; meter # : 988291\nth...   
1        2349   ham  Subject: hpl nom for january 9 , 2001\n( see a...   
2        3624   ham  Subject: neon retreat\nho ho ho , we ' re arou...   
3        4685  spam  Subject: photoshop , windows , office . cheap ...   
4        2030   ham  Subject: re : indian springs\nthis deal is to ...   

   label_num  
0          0  
1         

In [3]:
# Cell 1 â€” Duplicates analysis (without removal)

# Flag exact duplicates
df['is_exact_duplicate'] = df.duplicated('text', keep=False)

# Calculate metrics
n_rows_dup_marked = df['is_exact_duplicate'].sum()
n_groups_dup = df['text'].value_counts().gt(1).sum()

# Count duplicates by class
dup_by_class = df[df['is_exact_duplicate']].groupby('label').size()
dup_pct_by_class = df[df['is_exact_duplicate']].groupby('label').size() / df.groupby('label').size() * 100

# Print summary
print("=== DUPLICATES ANALYSIS ===")
print(f"Total rows marked as duplicates: {n_rows_dup_marked} ({n_rows_dup_marked/len(df)*100:.2f}% of dataset)")
print(f"Number of duplicate groups: {n_groups_dup}")
print("\nDuplicates by class:")
for label, count in dup_by_class.items():
    print(f"- {label}: {count} rows ({dup_pct_by_class[label]:.2f}% of {label} class)")

# Show top 3 duplicate groups
print("\nTop 3 duplicate groups:")
top_dups = df['text'].value_counts().nlargest(3)
for text, count in top_dups.items():
    # Extract subject or first 80 chars for display
    subject_match = re.search(r'(?im)^\s*subject\s*:\s*(.*)$', text)
    display_text = subject_match.group(1) if subject_match else text[:80]
    print(f"- '{display_text}...' ({count} occurrences)")


=== DUPLICATES ANALYSIS ===
Total rows marked as duplicates: 321 (6.21% of dataset)
Number of duplicate groups: 143

Duplicates by class:
- ham: 264 rows (7.19% of ham class)
- spam: 57 rows (3.80% of spam class)

Top 3 duplicate groups:
- 'calpine daily gas nomination...' (20 occurrences)
- '...' (16 occurrences)
- 'you can be smart !...' (3 occurrences)


In [4]:
# Cell 2 â€” Parsing subject / body

# Define robust extraction functions
def extract_subject(text):
    text = str(text).strip() if pd.notna(text) else ''
    match = re.search(r'(?im)^\s*subject\s*:\s*(.*)$', text)
    return match.group(1).strip() if match else ''

def extract_body(text):
    text = str(text).strip() if pd.notna(text) else ''
    match = re.search(r'(?im)^\s*subject\s*:\s*(.*)$', text)
    if match:
        # Find the position after the subject line
        subject_end = match.end()
        # Find the next line break after subject
        next_line = text.find('\n', subject_end)
        if next_line != -1:
            return text[next_line+1:].strip()
        else:
            return ''  # No body after subject
    return text  # If no subject found, return full text

# Add columns
print("Extracting subject and body...")
df['subject'] = df['text'].apply(extract_subject)
df['body'] = df['text'].apply(extract_body)

# Calculate length and token metrics
df['subject_length'] = df['subject'].fillna('').str.len()
df['body_length'] = df['body'].fillna('').str.len()
df['subject_token_count'] = df['subject'].fillna('').str.split().str.len()
df['body_token_count'] = df['body'].fillna('').str.split().str.len()
df['subject_body_ratio'] = df['subject_length'] / df['body_length'].apply(lambda x: max(1, x))

# Print statistics by class
print("\n=== SUBJECT/BODY STATISTICS BY CLASS ===")

metrics = ['subject_length', 'body_length', 'subject_token_count', 'body_token_count', 'subject_body_ratio']

# Create a results dataframe for better display
results = []

for label_num in [1, 0]:  # SPAM=1, HAM=0
    class_label = 'SPAM' if label_num == 1 else 'HAM'
    class_df = df[df['label_num'] == label_num]
    
    for metric in metrics:
        median = class_df[metric].median()
        q1 = class_df[metric].quantile(0.25)
        q3 = class_df[metric].quantile(0.75)
        
        results.append({
            'Class': class_label,
            'Metric': metric,
            'Median': median,
            'Q1': q1,
            'Q3': q3,
            'IQR': f"{median:.2f} [{q1:.2f}-{q3:.2f}]"
        })

# Convert to DataFrame and display
stats_df = pd.DataFrame(results)
print(stats_df[['Class', 'Metric', 'IQR']])

Extracting subject and body...

=== SUBJECT/BODY STATISTICS BY CLASS ===
  Class               Metric                      IQR
0  SPAM       subject_length      32.00 [20.00-46.00]
1  SPAM          body_length  507.00 [227.50-1171.00]
2  SPAM  subject_token_count        7.00 [4.00-10.00]
3  SPAM     body_token_count    105.00 [46.00-232.00]
4  SPAM   subject_body_ratio         0.06 [0.02-0.15]
5   HAM       subject_length      28.00 [19.00-40.00]
6   HAM          body_length  473.00 [190.00-1170.25]
7   HAM  subject_token_count         6.00 [4.00-9.00]
8   HAM     body_token_count    118.00 [44.00-284.00]
9   HAM   subject_body_ratio         0.05 [0.02-0.15]


In [5]:
# Cell 3 â€” Subject indicators by class

# Calculate subject indicators
# --- PATCH: subject indicators ---

# Pattern piÃ¹ robusti per "Re:" e "Fwd:" con spazi opzionali
RE_PAT  = r'(?i)\bre\s*:'
FWD_PAT = r'(?i)\b(?:fw|fwd)\s*:'

def calculate_subject_indicators(subject: str):
    subject = (subject or "").strip()
    # Conta solo lettere per la % maiuscole
    letters = [c for c in subject if c.isalpha()]
    n_letters = max(1, len(letters))
    uppercase_letters = sum(1 for c in letters if c.isupper())

    return {
        "subject_has_re": 1 if re.search(RE_PAT, subject) else 0,
        "subject_has_fwd": 1 if re.search(FWD_PAT, subject) else 0,
        "subject_exclamation_count": subject.count("!"),
        "subject_digit_count": sum(c.isdigit() for c in subject),
        "subject_has_currency": 1 if re.search(r"[$â‚¬Â£Â¥]", subject) else 0,
        "subject_percent_uppercase_letters": (uppercase_letters / n_letters) * 100.0,
    }

print("Calculating subject indicators...")
subject_indicators = df["subject"].apply(calculate_subject_indicators)

cols = [
    "subject_has_re", "subject_has_fwd", "subject_exclamation_count",
    "subject_digit_count", "subject_has_currency",
    "subject_percent_uppercase_letters"
]
for c in cols:
    df[c] = subject_indicators.apply(lambda x: x[c])

# Tabella mediana [Q1â€“Q3] (come giÃ  fai)
results = []
for label_num, class_label in [(1,"SPAM"), (0,"HAM")]:
    class_df = df[df["label_num"] == label_num]
    for c in cols:
        med = class_df[c].median()
        q1 = class_df[c].quantile(0.25)
        q3 = class_df[c].quantile(0.75)
        results.append({
            "Class": class_label,
            "Indicator": c,
            "Median [Q1-Q3]": f"{med:.2f} [{q1:.2f}-{q3:.2f}]"
        })
indicators_df = pd.DataFrame(results)
print("\n=== SUBJECT INDICATORS BY CLASS (mediana [IQR]) ===")
print(indicators_df[["Class","Indicator","Median [Q1-Q3]"]])

# ðŸš© Aggiungi ANCHE la prevalenza (%) per i flag binari: piÃ¹ informativa della mediana
flag_cols = ["subject_has_re","subject_has_fwd","subject_has_currency"]
pct_rows = []
for label_num, class_label in [(1,"SPAM"), (0,"HAM")]:
    class_df = df[df["label_num"] == label_num]
    for c in flag_cols:
        pct = class_df[c].mean() * 100.0
        pct_rows.append({"Class": class_label, "Flag": c, "Prevalence %": f"{pct:.2f}"})
pct_df = pd.DataFrame(pct_rows)
print("\n=== SUBJECT FLAGS PREVALENCE (percentuale di True) ===")
print(pct_df.pivot(index="Flag", columns="Class", values="Prevalence %"))


# Convert to DataFrame and display
indicators_df = pd.DataFrame(results)
print(indicators_df[['Class', 'Indicator', 'Median [Q1-Q3]']])

Calculating subject indicators...

=== SUBJECT INDICATORS BY CLASS (mediana [IQR]) ===
   Class                          Indicator    Median [Q1-Q3]
0   SPAM                     subject_has_re  0.00 [0.00-0.00]
1   SPAM                    subject_has_fwd  0.00 [0.00-0.00]
2   SPAM          subject_exclamation_count  0.00 [0.00-0.00]
3   SPAM                subject_digit_count  0.00 [0.00-0.00]
4   SPAM               subject_has_currency  0.00 [0.00-0.00]
5   SPAM  subject_percent_uppercase_letters  0.00 [0.00-0.00]
6    HAM                     subject_has_re  0.00 [0.00-0.00]
7    HAM                    subject_has_fwd  0.00 [0.00-0.00]
8    HAM          subject_exclamation_count  0.00 [0.00-0.00]
9    HAM                subject_digit_count  1.00 [0.00-5.00]
10   HAM               subject_has_currency  0.00 [0.00-0.00]
11   HAM  subject_percent_uppercase_letters  0.00 [0.00-0.00]

=== SUBJECT FLAGS PREVALENCE (percentuale di True) ===
Class                   HAM  SPAM
Flag             

In [6]:
# Cell 4 â€” Simple signals on body and text

# Define function to calculate content indicators
def calculate_content_indicators(text):
    text = str(text).strip() if pd.notna(text) else ''
    
    # Count letters for uppercase percentage calculation
    letters = [c for c in text if c.isalpha()]
    letter_count = max(1, len(letters))
    uppercase_letters = sum(1 for c in letters if c.isupper())
    
    # Define spam words list
    spam_words = ['free', 'offer', 'money', 'click', 'win', 'cash', 'prize', 'discount']
    
    # Calculate indicators
    url_count = len(re.findall(r'https?://\S+|www\.\S+', text))
    email_count = len(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text))
    phone_count = len(re.findall(r'\+?\d[\d\s\-\(\)]{7,}', text))
    digit_count = sum(c.isdigit() for c in text)
    exclamation_count = text.count('!')
    percent_uppercase = (uppercase_letters / letter_count) * 100
    
    # Count spam words (case insensitive)
    spam_word_count = sum(len(re.findall(r'\b' + word + r'\b', text, re.IGNORECASE)) for word in spam_words)
    
    return {
        'url_count': url_count,
        'email_count': email_count,
        'phone_count': phone_count,
        'digit_count': digit_count,
        'exclamation_count': exclamation_count,
        'percent_uppercase_letters': percent_uppercase,
        'spam_word_count': spam_word_count
    }

# Apply to body and text
print("Calculating content indicators for body and full text...")

# Process body
body_indicators = df['body'].apply(calculate_content_indicators)
for indicator, values in zip(
    ['body_url_count', 'body_email_count', 'body_phone_count', 'body_digit_count',
     'body_exclamation_count', 'body_percent_uppercase_letters', 'body_spam_word_count'],
    ['url_count', 'email_count', 'phone_count', 'digit_count',
     'exclamation_count', 'percent_uppercase_letters', 'spam_word_count']
):
    df[indicator] = body_indicators.apply(lambda x: x[values])

# Process full text
text_indicators = df['text'].apply(calculate_content_indicators)
for indicator, values in zip(
    ['text_url_count', 'text_email_count', 'text_phone_count', 'text_digit_count',
     'text_exclamation_count', 'text_percent_uppercase_letters', 'text_spam_word_count'],
    ['url_count', 'email_count', 'phone_count', 'digit_count',
     'exclamation_count', 'percent_uppercase_letters', 'spam_word_count']
):
    df[indicator] = text_indicators.apply(lambda x: x[values])

# Print comparative table for body metrics
print("\n=== BODY CONTENT INDICATORS: SPAM vs HAM ===")

body_metrics = ['body_url_count', 'body_email_count', 'body_phone_count', 
               'body_digit_count', 'body_exclamation_count', 
               'body_percent_uppercase_letters', 'body_spam_word_count']

# Create comparison table
comparison_data = []

for metric in body_metrics:
    # SPAM statistics
    spam_median = df[df['label_num'] == 1][metric].median()
    spam_q1 = df[df['label_num'] == 1][metric].quantile(0.25)
    spam_q3 = df[df['label_num'] == 1][metric].quantile(0.75)
    spam_iqr = f"{spam_median:.2f} [{spam_q1:.2f}-{spam_q3:.2f}]"
    
    # HAM statistics
    ham_median = df[df['label_num'] == 0][metric].median()
    ham_q1 = df[df['label_num'] == 0][metric].quantile(0.25)
    ham_q3 = df[df['label_num'] == 0][metric].quantile(0.75)
    ham_iqr = f"{ham_median:.2f} [{ham_q1:.2f}-{ham_q3:.2f}]"
    
    # Calculate ratio (protect against division by zero)
    ratio = spam_median / max(0.001, ham_median)
    
    comparison_data.append({
        'Metric': metric.replace('body_', ''),
        'SPAM (median [IQR])': spam_iqr,
        'HAM (median [IQR])': ham_iqr,
        'SPAM/HAM ratio': f"{ratio:.2f}x"
    })

# Convert to DataFrame and display
comparison_df = pd.DataFrame(comparison_data)
print(comparison_df)

Calculating content indicators for body and full text...



=== BODY CONTENT INDICATORS: SPAM vs HAM ===
                      Metric SPAM (median [IQR])  HAM (median [IQR])  \
0                  url_count    0.00 [0.00-0.00]    0.00 [0.00-0.00]   
1                email_count    0.00 [0.00-0.00]    0.00 [0.00-0.00]   
2                phone_count    0.00 [0.00-0.00]    0.00 [0.00-2.00]   
3                digit_count   5.00 [0.00-20.00]  24.00 [6.00-53.00]   
4          exclamation_count    0.00 [0.00-2.00]    0.00 [0.00-0.00]   
5  percent_uppercase_letters    0.00 [0.00-0.00]    0.00 [0.00-0.00]   
6            spam_word_count    0.00 [0.00-1.00]    0.00 [0.00-0.00]   

  SPAM/HAM ratio  
0          0.00x  
1          0.00x  
2          0.00x  
3          0.21x  
4          0.00x  
5          0.00x  
6          0.00x  


In [7]:
# Cell 5 â€” Formatting & "noise" flags

# Define functions to detect formatting and noise patterns
def has_html(text):
    text = str(text).strip() if pd.notna(text) else ''
    return bool(re.search(r'<[^>]+>', text))

def has_obfuscated_links(text):
    text = str(text).strip() if pd.notna(text) else ''
    
    # Find all anchor tags
    anchor_tags = re.findall(r'<a\s+[^>]*href\s*=\s*["\']([^"\']+)["\'][^>]*>(.*?)</a>', text, re.IGNORECASE | re.DOTALL)
    
    for href, anchor_text in anchor_tags:
        # Extract domains
        href_domain = re.search(r'https?://([^/]+)', href)
        anchor_domain = re.search(r'https?://([^/]+)', anchor_text)
        
        # Check if anchor text doesn't contain a domain (generic text like "click here")
        if not anchor_domain:
            return True
            
        # Check if domains don't match
        if href_domain and anchor_domain and href_domain.group(1) != anchor_domain.group(1):
            return True
            
    return False

def has_signature(text):
    text = str(text).strip() if pd.notna(text) else ''
    return bool(re.search(r'--\s*\n', text))

def has_disclaimer(text):
    text = str(text).strip() if pd.notna(text) else ''
    return bool(re.search(r'(?i)\b(disclaimer|legal notice|confidential)\b', text))

def has_original_message(text):
    text = str(text).strip() if pd.notna(text) else ''
    pattern1 = r'(?im)^-{2,}\s*Original Message\s*-{2,}'
    pattern2 = r'(?ims)^\s*From: .+\n\s*Sent: .+\n\s*To: .+\n\s*Subject: .+'
    return bool(re.search(pattern1, text) or re.search(pattern2, text))

# Apply functions to add flags
print("Detecting formatting and noise patterns...")
df['has_html'] = df['text'].apply(has_html)
df['has_obfuscated_links'] = df['text'].apply(has_obfuscated_links)
df['has_signature'] = df['text'].apply(has_signature)
df['has_disclaimer'] = df['text'].apply(has_disclaimer)
df['has_original_message'] = df['text'].apply(has_original_message)

# Print percentages by class
print("\n=== FORMATTING & NOISE PATTERNS BY CLASS ===")

flags = ['has_html', 'has_obfuscated_links', 'has_signature', 'has_disclaimer', 'has_original_message']
results = []

for flag in flags:
    # Calculate percentages
    spam_pct = df[df['label_num'] == 1][flag].mean() * 100
    ham_pct = df[df['label_num'] == 0][flag].mean() * 100
    
    results.append({
        'Flag': flag,
        'SPAM %': f"{spam_pct:.2f}%",
        'HAM %': f"{ham_pct:.2f}%"
    })

# Convert to DataFrame and display
flags_df = pd.DataFrame(results)
print(flags_df)

print("\nDECISION: Will propose to extract meta-features from HTML/formatting before cleaning, then remove HTML tags and normalize text.")

Detecting formatting and noise patterns...

=== FORMATTING & NOISE PATTERNS BY CLASS ===
                   Flag SPAM %  HAM %
0              has_html  0.00%  0.00%
1  has_obfuscated_links  0.00%  0.00%
2         has_signature  0.00%  0.00%
3        has_disclaimer  2.47%  0.95%
4  has_original_message  0.00%  0.00%

DECISION: Will propose to extract meta-features from HTML/formatting before cleaning, then remove HTML tags and normalize text.


In [8]:
# Cell 6 â€” Leakage (anti-spam headers)

# Define function to check for leakage
def check_leakage(text):
    text = str(text).strip() if pd.notna(text) else ''
    leakage_patterns = [
        r'X-Spam-Flag:',
        r'Spam-Score:',
        r'SpamAssassin',
        r'X-Spam-Status:',
        r'X-Spam-Level:'
    ]
    
    for pattern in leakage_patterns:
        if re.search(f'(?i){pattern}', text):
            return True
    return False

# Apply function to add flag
print("Checking for anti-spam headers (potential leakage)...")
df['has_leakage'] = df['text'].apply(check_leakage)

# Calculate statistics
leakage_count = df['has_leakage'].sum()
leakage_percent = leakage_count / len(df) * 100

# Print results
print("\n=== LEAKAGE DETECTION (ANTI-SPAM HEADERS) ===")
print(f"Emails with potential leakage: {leakage_count} ({leakage_percent:.2f}% of dataset)")

if leakage_count > 0:
    print("\nWARNING: Found anti-spam headers that could cause data leakage!")
    print("DECISION: These headers should be excluded during preprocessing in Step 3.")
else:
    print("\nNo anti-spam headers detected. No special handling required.")

Checking for anti-spam headers (potential leakage)...

=== LEAKAGE DETECTION (ANTI-SPAM HEADERS) ===
Emails with potential leakage: 0 (0.00% of dataset)

No anti-spam headers detected. No special handling required.


In [9]:
# Cell 7 â€” Distinctive lexicon (top n-grams)

# Define preprocessing function for n-grams
def preprocess_for_ngrams(text):
    text = str(text).strip() if pd.notna(text) else ''
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Function to get top n-grams
def get_top_ngrams(corpus, n, top_k=20):
    corpus = [preprocess_for_ngrams(text) for text in corpus]
    
    # Skip empty texts
    corpus = [text for text in corpus if text.strip()]
    
    if not corpus:
        return pd.DataFrame({'ngram': [], 'frequency': []})
    
    # Create vectorizer and fit
    vectorizer = CountVectorizer(ngram_range=(n, n), stop_words='english')
    X = vectorizer.fit_transform(corpus)
    
    # Get feature names and frequencies
    feature_names = vectorizer.get_feature_names_out()
    frequencies = X.sum(axis=0).A1
    
    # Create DataFrame and sort
    ngram_df = pd.DataFrame({'ngram': feature_names, 'frequency': frequencies})
    ngram_df = ngram_df.sort_values('frequency', ascending=False).head(top_k).reset_index(drop=True)
    
    return ngram_df

# Set top K parameter
TOP_K = 20
print(f"Generating top {TOP_K} n-grams for each corpus...")

# Split data by class
spam_subjects = df[df['label_num'] == 1]['subject']
ham_subjects = df[df['label_num'] == 0]['subject']
spam_bodies = df[df['label_num'] == 1]['body']
ham_bodies = df[df['label_num'] == 0]['body']

# Generate top n-grams
print("\n=== TOP UNIGRAMS IN SUBJECT ===")
print("SPAM:")
display(get_top_ngrams(spam_subjects, 1, TOP_K))
print("\nHAM:")
display(get_top_ngrams(ham_subjects, 1, TOP_K))

print("\n=== TOP BIGRAMS IN SUBJECT ===")
print("SPAM:")
display(get_top_ngrams(spam_subjects, 2, TOP_K))
print("\nHAM:")
display(get_top_ngrams(ham_subjects, 2, TOP_K))

print("\n=== TOP UNIGRAMS IN BODY ===")
print("SPAM:")
display(get_top_ngrams(spam_bodies, 1, TOP_K))
print("\nHAM:")
display(get_top_ngrams(ham_bodies, 1, TOP_K))

print("\n=== TOP BIGRAMS IN BODY ===")
print("SPAM:")
display(get_top_ngrams(spam_bodies, 2, TOP_K))
print("\nHAM:")
display(get_top_ngrams(ham_bodies, 2, TOP_K))

Generating top 20 n-grams for each corpus...

=== TOP UNIGRAMS IN SUBJECT ===
SPAM:


Unnamed: 0,ngram,frequency
0,software,44
1,online,42
2,meds,41
3,new,41
4,paliourg,36
5,cheap,33
6,want,31
7,free,31
8,best,30
9,prices,27



HAM:


Unnamed: 0,ngram,frequency
0,hpl,582
1,nom,338
2,meter,333
3,enron,276
4,gas,232
5,actuals,214
6,nomination,209
7,fw,177
8,noms,148
9,deal,134



=== TOP BIGRAMS IN SUBJECT ===
SPAM:


Unnamed: 0,ngram,frequency
0,instant download,13
1,soft tabs,12
2,charset ascii,11
3,windows xp,10
4,office xp,10
5,hi paliourg,9
6,prescription needed,7
7,penis growth,7
8,fix penis,7
9,cialis soft,7



HAM:


Unnamed: 0,ngram,frequency
0,hpl nom,234
1,enron hpl,185
2,hpl actuals,168
3,tenaska iv,80
4,gas nomination,70
5,calpine daily,54
6,eastrans nomination,53
7,daily gas,49
8,noms actual,45
9,actual flow,44



=== TOP UNIGRAMS IN BODY ===
SPAM:


Unnamed: 0,ngram,frequency
0,com,989
1,http,983
2,company,723
3,www,587
4,information,513
5,font,511
6,td,504
7,statements,476
8,email,471
9,price,446



HAM:


Unnamed: 0,ngram,frequency
0,ect,13893
1,hou,7281
2,enron,6279
3,subject,2728
4,com,2693
5,deal,2655
6,gas,2629
7,cc,2357
8,pm,2325
9,meter,2126



=== TOP BIGRAMS IN BODY ===
SPAM:


Unnamed: 0,ngram,frequency
0,http www,409
1,nbsp nbsp,296
2,computron com,195
3,href http,175
4,looking statements,172
5,pills pills,169
6,width height,164
7,src http,157
8,www computron,152
9,forward looking,142



HAM:


Unnamed: 0,ngram,frequency
0,hou ect,7226
1,ect ect,6339
2,enron enron,1440
3,ect cc,1391
4,corp enron,1210
5,cc subject,1099
6,let know,980
7,daren farmer,933
8,enron com,824
9,ect subject,730


In [10]:
# Cell 8 â€” "Definition of Done" for Step 2 (con analisi aggiuntive)

print("=== STEP 2: DEFINITION OF DONE ===")

# 1. Duplicates summary
print("\n1. DUPLICATES:")
print(f"â€¢ Rows marked as duplicates: {n_rows_dup_marked} ({n_rows_dup_marked/len(df)*100:.2f}% of dataset)")
print(f"â€¢ Number of duplicate groups: {n_groups_dup}")
print("â€¢ Breakdown by class:")
for label, count in dup_by_class.items():
    print(f"  - {label}: {count} ({dup_pct_by_class[label]:.2f}% of class)")

# 2. Subject/Body statistics
print("\n2. SUBJECT/BODY STATISTICS (median [IQR]):")
for metric in metrics:
    spam_data = df[df['label_num'] == 1][metric]
    ham_data = df[df['label_num'] == 0][metric]
    
    spam_median = spam_data.median()
    spam_q1 = spam_data.quantile(0.25)
    spam_q3 = spam_data.quantile(0.75)
    
    ham_median = ham_data.median()
    ham_q1 = ham_data.quantile(0.25)
    ham_q3 = ham_data.quantile(0.75)
    
    print(f"â€¢ {metric}:")
    print(f"  - SPAM: {spam_median:.2f} [{spam_q1:.2f}-{spam_q3:.2f}]")
    print(f"  - HAM: {ham_median:.2f} [{ham_q1:.2f}-{ham_q3:.2f}]")

# 2.1 Subject flags prevalence
print("\n2.1. SUBJECT FLAGS PREVALENCE (%):")
flag_cols = ["subject_has_re","subject_has_fwd","subject_has_currency"]
rows = []
for lbl, name in [(1,"SPAM"), (0,"HAM")]:
    sub = df.loc[df.label_num==lbl, flag_cols]
    for c in flag_cols:
        rows.append({"Class": name, "Flag": c, "Prevalence %": round(sub[c].mean()*100, 2)})
prevalence_df = pd.DataFrame(rows).pivot(index="Flag", columns="Class", values="Prevalence %")
print(prevalence_df)

# 3. Body signals comparison
print("\n3. BODY SIGNALS COMPARISON (median [IQR]):")
print(comparison_df.to_string(index=False))

# 3.1 Body signals: detailed stats
print("\n3.1. BODY SIGNALS: PREVALENCE, CONDITIONAL MEAN, AND P95:")
count_cols = ["body_url_count","body_email_count","body_phone_count",
              "body_digit_count","body_exclamation_count","body_spam_word_count"]
out_rows = []
for lbl, name in [(1,"SPAM"), (0,"HAM")]:
    sub = df.loc[df.label_num==lbl, count_cols]
    gt0 = (sub > 0).mean()*100
    cond_mean = sub.where(sub>0).mean()
    p95 = sub.quantile(0.95)
    out = {"Class": name}
    out.update({f"% {c.split('_',1)[1]}>0": f"{gt0[c]:.2f}%" for c in count_cols})
    out.update({f"mean {c.split('_',1)[1]}|>0": f"{cond_mean[c]:.2f}" for c in count_cols})
    out.update({f"p95 {c.split('_',1)[1]}": f"{p95[c]:.0f}" for c in count_cols})
    out_rows.append(out)
detailed_stats_df = pd.DataFrame(out_rows)
print(detailed_stats_df)

# 4. Formatting percentages
print("\n4. FORMATTING & NOISE:")
for flag in flags:
    spam_pct = df[df['label_num'] == 1][flag].mean() * 100
    ham_pct = df[df['label_num'] == 0][flag].mean() * 100
    
    print(f"â€¢ {flag}:")
    print(f"  - SPAM: {spam_pct:.2f}%")
    print(f"  - HAM: {ham_pct:.2f}%")

# 5. Leakage
print("\n5. LEAKAGE:")
print(f"â€¢ Anti-spam headers: {leakage_count} emails ({leakage_percent:.2f}% of dataset)")
if leakage_count > 0:
    print("â€¢ DECISION: Exclude anti-spam headers during preprocessing in Step 3")
else:
    print("â€¢ No leakage detected")

# 6. Decisions for Step 3
print("\n6. DECISIONS FOR STEP 3:")
print("â€¢ Remove exact duplicates in training, keeping one instance per group")
print("â€¢ Extract meta-features from HTML/URL/email/phone before cleanup")
print("â€¢ Remove HTML tags and normalize whitespace/case")
print("â€¢ Apply PII placeholders: [URL] [EMAIL] [PHONE] [NUMBER]")
print("â€¢ Use weighted combination of subject + body")
print("â€¢ Truncate content after signatures and 'Original Message' blocks")
print("â€¢ Usa class_weight=\"balanced\" nel modello (gestisce automaticamente lo sbilanciamento).")
print("â€¢ Escludi eventuali header antispam (X-Spam-Flag, Spam-Score, X-Spam-Status, X-Spam-Level, SpamAssassin) dal testo per il modello (qui non rilevati, ma la regola resta).")

=== STEP 2: DEFINITION OF DONE ===

1. DUPLICATES:
â€¢ Rows marked as duplicates: 321 (6.21% of dataset)
â€¢ Number of duplicate groups: 143
â€¢ Breakdown by class:
  - ham: 264 (7.19% of class)
  - spam: 57 (3.80% of class)

2. SUBJECT/BODY STATISTICS (median [IQR]):
â€¢ subject_length:
  - SPAM: 32.00 [20.00-46.00]
  - HAM: 28.00 [19.00-40.00]
â€¢ body_length:
  - SPAM: 507.00 [227.50-1171.00]
  - HAM: 473.00 [190.00-1170.25]
â€¢ subject_token_count:
  - SPAM: 7.00 [4.00-10.00]
  - HAM: 6.00 [4.00-9.00]
â€¢ body_token_count:
  - SPAM: 105.00 [46.00-232.00]
  - HAM: 118.00 [44.00-284.00]
â€¢ subject_body_ratio:
  - SPAM: 0.06 [0.02-0.15]
  - HAM: 0.05 [0.02-0.15]

2.1. SUBJECT FLAGS PREVALENCE (%):
Class                   HAM  SPAM
Flag                             
subject_has_currency   0.19  3.80
subject_has_fwd        4.52  2.67
subject_has_re        19.20  5.34

3. BODY SIGNALS COMPARISON (median [IQR]):
                   Metric SPAM (median [IQR]) HAM (median [IQR]) SPAM/HAM rat

In [11]:
# Cell 11 â€” Preprocessing Freeze & Setup

# Set key parameters
RANDOM_STATE = 42
N_FOLDS = 5
SUBJECT_WEIGHT = 2

print("=== PREPROCESSING SETUP ===")
print(f"â€¢ Random Seed: {RANDOM_STATE}")
print(f"â€¢ Cross-validation folds: {N_FOLDS}")
print(f"â€¢ Subject weight: {SUBJECT_WEIGHT}x")

# Define target variable
print("\n=== TARGET VARIABLE ===")
print("â€¢ Using 'label_num' as target (0=HAM, 1=SPAM)")
print(f"â€¢ Class distribution: {df['label_num'].value_counts().to_dict()}")

# Define meta-features list based on analysis
subject_meta_features = [
  'subject_has_re', 'subject_has_fwd', 'subject_has_currency',
  'subject_digit_count', 'subject_exclamation_count',
  'subject_length', 'subject_body_ratio'
]
body_meta_features = [
  'body_phone_count', 'body_digit_count',
  'body_exclamation_count', 'body_spam_word_count',
  'body_length'
]
all_meta_features = subject_meta_features + body_meta_features


# Create a copy of the dataframe for training with meta-features
# Note: Deduplication will be done in the training phase, not here
meta_X = df[all_meta_features].copy()
y = df['label_num'].copy()

# Display example of meta-features for one sample
print("\n=== EXAMPLE META-FEATURES (first sample) ===")
example = meta_X.iloc[0].to_dict()
for feature, value in example.items():
    print(f"â€¢ {feature}: {value}")

print("\n=== PREPROCESSING DECISIONS ===")
print("â€¢ Training data will use deduplication (one instance per group)")
print("â€¢ Text preprocessing will:")
print("  - Extract meta-features (shown above) before cleaning")
print("  - Remove HTML tags and normalize whitespace/case")
print("  - Apply PII placeholders: [URL] [EMAIL] [PHONE] [NUMBER]")
print("  - Weight subject higher than body (ratio 2:1)")
print("  - Truncate content after signatures and 'Original Message' blocks")
print("  - Use balanced class weights in the model")
print("  - Exclude anti-spam headers that could cause leakage")

=== PREPROCESSING SETUP ===
â€¢ Random Seed: 42
â€¢ Cross-validation folds: 5
â€¢ Subject weight: 2x

=== TARGET VARIABLE ===
â€¢ Using 'label_num' as target (0=HAM, 1=SPAM)
â€¢ Class distribution: {0: 3672, 1: 1499}

=== EXAMPLE META-FEATURES (first sample) ===
â€¢ subject_has_re: 0.0
â€¢ subject_has_fwd: 0.0
â€¢ subject_has_currency: 0.0
â€¢ subject_digit_count: 6.0
â€¢ subject_exclamation_count: 0.0
â€¢ subject_length: 33.0
â€¢ subject_body_ratio: 0.11827956989247312
â€¢ body_phone_count: 0.0
â€¢ body_digit_count: 4.0
â€¢ body_exclamation_count: 0.0
â€¢ body_spam_word_count: 0.0
â€¢ body_length: 279.0

=== PREPROCESSING DECISIONS ===
â€¢ Training data will use deduplication (one instance per group)
â€¢ Text preprocessing will:
  - Extract meta-features (shown above) before cleaning
  - Remove HTML tags and normalize whitespace/case
  - Apply PII placeholders: [URL] [EMAIL] [PHONE] [NUMBER]
  - Weight subject higher than body (ratio 2:1)
  - Truncate content after signatures and 'Ori

In [12]:
# Cell 1 â€” Preprocessing functions (correct order)
print("=== PREPROCESSING FUNCTIONS ===")

# 1. Meta-features first (already calculated in df)
print("â€¢ Using pre-calculated meta-features from previous steps")

# 2. Text cleaning functions
def clean_text(text, remove_leakage=True, truncate=True, replace_pii=True, normalize=True):
    """
    Complete text cleaning pipeline:
    - Remove HTML
    - Remove anti-spam headers (optional)
    - Truncate after signatures and 'Original Message' (optional)
    - Replace PII with placeholders (optional)
    - Normalize whitespace and lowercase (optional)
    """
    if pd.isna(text) or text == '':
        return ''
    
    text = str(text)
    
    # Remove HTML
    if '<' in text and '>' in text:  # Quick check before applying expensive regex
        text = re.sub(r'<[^>]+>', ' ', text)
    
    # Remove anti-spam headers (potential data leakage)
    if remove_leakage:
        leakage_pattern = r'(?im)^(X-Spam-Flag|Spam-Score|X-Spam-Status|X-Spam-Level|SpamAssassin):.*?$\n?'
        text = re.sub(leakage_pattern, '', text)
    
    # Truncate after signatures and 'Original Message'
    if truncate:
        # Signatures (common pattern: --\n)
        signature_match = re.search(r'--\s*\n', text)
        if signature_match:
            text = text[:signature_match.start()].strip()
        
        # 'Original Message' block
        orig_msg_patterns = [
            r'(?im)^-{2,}\s*Original Message\s*-{2,}',
            r'(?ims)^\s*From: .+\n\s*Sent: .+\n\s*To: .+\n\s*Subject: .+'
        ]
        
        for pattern in orig_msg_patterns:
            match = re.search(pattern, text)
            if match:
                text = text[:match.start()].strip()
    
    # Replace PII with placeholders
    if replace_pii:
        # URLs
        text = re.sub(r'https?://\S+|www\.\S+', '[URL]', text)
        
        # Email addresses
        text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', '[EMAIL]', text)
        
        # Phone numbers (various formats)
        text = re.sub(r'\+?\d[\d\s\-\(\)]{7,}\d', '[PHONE]', text)
        
        # Number sequences (4+ consecutive digits)
        text = re.sub(r'\b\d{4,}\b', '[NUMBER]', text)
    
    # Normalization
    if normalize:
        # Lowercase
        text = text.lower()
        
        # Normalize whitespace
        text = re.sub(r'\s+', ' ', text).strip()
    
    return text

# 3. Function for subject weighting
def create_weighted_text(row, subject_weight):
    """
    Create text_weighted = (subject + " ")*SUBJECT_WEIGHT + body
    """
    subject = str(row['subject']) if pd.notna(row['subject']) else ''
    body = str(row['body']) if pd.notna(row['body']) else ''
    
    # If subject is empty but body isn't, use only body
    if subject.strip() == '' and body.strip() != '':
        return body
    
    # If body is empty but subject isn't, use only subject
    if body.strip() == '' and subject.strip() != '':
        return subject
    
    # Otherwise apply weighting
    return (subject + " ") * subject_weight + body

# 4. Function to create model-ready DataFrame
def create_model_ready_df(df, subject_weight, meta_features, deduplicate=True):
    """
    Create model-ready dataframe:
    - Add text_weighted and text_clean
    - Optionally remove duplicates
    - Keep only necessary columns (meta-features + text_clean + label)
    """
    # Create a copy to avoid modifying the original
    df_ready = df.copy()
    
    # Create text_weighted
    print("\nâ€¢ Creating weighted text (subjectÃ—{}):".format(subject_weight))
    df_ready['text_weighted'] = df_ready.apply(
        lambda row: create_weighted_text(row, subject_weight), axis=1
    )
    
    # Apply clean_text to text_weighted
    print("â€¢ Cleaning text with full pipeline")
    df_ready['text_clean'] = df_ready['text_weighted'].apply(clean_text)
    
    # Remove duplicates (optional)
    if deduplicate:
        before_count = len(df_ready)
        df_ready = df_ready.drop_duplicates('text')
        after_count = len(df_ready)
        print(f"â€¢ Removed {before_count - after_count} duplicates ({(before_count - after_count)/before_count*100:.2f}% of data)")
    
    # Select columns to keep
    keep_cols = meta_features + ['text_clean', 'label_num']
    df_ready = df_ready[keep_cols]
    
    # Check: no NaN/inf in meta-features
    nan_meta = df_ready[meta_features].isna().sum().sum()
    inf_meta = np.isinf(df_ready[meta_features].values).sum()
    
    if nan_meta > 0 or inf_meta > 0:
        print(f"WARNING: Found {nan_meta} NaN and {inf_meta} Inf values in meta-features")
        # Fill NaN with 0 and replace Inf with very large but finite values
        df_ready[meta_features] = df_ready[meta_features].fillna(0)
        df_ready[meta_features] = df_ready[meta_features].replace([np.inf, -np.inf], [1e9, -1e9])
        print("  â†’ Replaced NaN with 0 and Inf with Â±1e9")
    
    # Check: text_clean not empty
    empty_texts = (df_ready['text_clean'] == '').sum()
    if empty_texts > 0:
        print(f"WARNING: Found {empty_texts} empty text_clean values")
        # Insert placeholder for empty texts
        df_ready.loc[df_ready['text_clean'] == '', 'text_clean'] = '[EMPTY_TEXT]'
        print("  â†’ Replaced empty texts with '[EMPTY_TEXT]' placeholder")
    
    return df_ready


print(f"\nTotal meta-features: {len(all_meta_features)}")

# Create model-ready DataFrame (without deduplication yet)
print("\nâ€¢ Creating model-ready DataFrame...")
df_ready = create_model_ready_df(
    df, 
    subject_weight=SUBJECT_WEIGHT,
    meta_features=all_meta_features,
    deduplicate=False  # Don't deduplicate here, keep both train and test complete
)

# Show example of text before and after cleaning
print("\n=== EXAMPLE: BEFORE vs AFTER CLEANING ===")
sample_idx = 0
print(f"ORIGINAL:\n{df.iloc[sample_idx]['text'][:500]}...")
print(f"\nCLEANED:\n{df_ready.iloc[sample_idx]['text_clean'][:500]}...")

# Statistics on text_clean
print("\n=== TEXT_CLEAN STATISTICS ===")
df_ready['clean_length'] = df_ready['text_clean'].str.len()
print(f"â€¢ Mean length: {df_ready['clean_length'].mean():.2f} chars")
print(f"â€¢ Median length: {df_ready['clean_length'].median():.2f} chars")
print(f"â€¢ Min length: {df_ready['clean_length'].min()} chars")
print(f"â€¢ Max length: {df_ready['clean_length'].max()} chars")

# Report length distribution by class
print("\nâ€¢ Length distribution by class:")
for label, name in [(1, "SPAM"), (0, "HAM")]:
    subset = df_ready[df_ready['label_num'] == label]['clean_length']
    print(f"  - {name}: {subset.median():.2f} chars [IQR: {subset.quantile(0.25):.2f}-{subset.quantile(0.75):.2f}]")

print("\n=== PREPROCESSING COMPLETE ===")
print(f"â€¢ Final DataFrame shape: {df_ready.shape}")
print(f"â€¢ Features: {len(all_meta_features)} meta-features + text_clean")

=== PREPROCESSING FUNCTIONS ===
â€¢ Using pre-calculated meta-features from previous steps

Total meta-features: 12

â€¢ Creating model-ready DataFrame...

â€¢ Creating weighted text (subjectÃ—2):
â€¢ Cleaning text with full pipeline
  â†’ Replaced empty texts with '[EMPTY_TEXT]' placeholder

=== EXAMPLE: BEFORE vs AFTER CLEANING ===
ORIGINAL:
Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes ....

CLEANED:
enron methanol ; meter # : [number] enron methanol ; meter # : [number] this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary flow data provided by daren } . please override pop ' s daily volume { presently zero } to reflect daily activity you can obtain from gas control . this change is nee

In [22]:
# Cell 2 â€” Feature Representation Definition
print("=== FEATURE REPRESENTATION SETUP ===")

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer as FT

# 1. TF-IDF for text features
print("\nâ€¢ Text Representation:")
print("  - Using TF-IDF Vectorizer with 1-2 grams")
print("  - Minimum document frequency: 2")
print("  - Sublinear TF scaling: Yes")
print("  - English stopwords: Yes")
print("  - Max features: 150,000")

# --- Template TF-IDF (word) ---
TFIDF_WORD_TEMPLATE = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    sublinear_tf=True,
    max_features=150_000,
    stop_words='english',
    strip_accents='unicode',
    dtype=np.float32
)

# --- Template TF-IDF (char) ---
TFIDF_CHAR_TEMPLATE = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3, 5),
    min_df=2,
    sublinear_tf=True,
    max_features=50_000,
    strip_accents='unicode',
    dtype=np.float32
)

# --- Ramo numerico: SPARSE -> SCALER (resta sparse end-to-end) ---
def build_num_pipeline():
    return Pipeline([
        ('to_sparse', FunctionTransformer(lambda X: csr_matrix(X), validate=False)),
        ('scaler', StandardScaler(with_mean=False))
    ])

# --- Selettore 1D del testo per sicurezza ---
text_selector = FT(lambda X: X['text_clean'], validate=False)

# --- Factory per costruire il ColumnTransformer ---
def build_features_transformer(all_meta_features, mode='word'):
    num_pipeline = build_num_pipeline()
    if mode == 'word':
        text_branch = Pipeline([
            ('select_text', text_selector),
            ('tfidf', clone(TFIDF_WORD_TEMPLATE))
        ])
    elif mode == 'word_char':
        text_branch = Pipeline([
            ('select_text', text_selector),
            ('union', FeatureUnion([
                ('word', clone(TFIDF_WORD_TEMPLATE)),
                ('char', clone(TFIDF_CHAR_TEMPLATE))
            ]))
        ])
    else:
        raise ValueError("mode must be 'word' or 'word_char'")

    return ColumnTransformer(
        transformers=[
            ('text_features', text_branch, ['text_clean']),
            ('meta_features', num_pipeline, all_meta_features)
        ],
        remainder='drop',
        sparse_threshold=1.0
    )


# --- Stima coerente del vocabolario (solo info di servizio) ---
def estimate_vocab_size(texts):
    probe = clone(TFIDF_WORD_TEMPLATE)
    probe.fit(texts)
    return len(probe.vocabulary_)

# Print information about the meta-features
print("\n=== META-FEATURES INCLUDED ===")
print("Subject-related:")
for feature in [f for f in all_meta_features if f.startswith('subject_')]:
    print(f"  - {feature}")
print("\nBody-related:")
for feature in [f for f in all_meta_features if f.startswith('body_')]:
    print(f"  - {feature}")

# Create model-ready DataFrame without 
df_ready_model = create_model_ready_df(
    df,
    subject_weight=SUBJECT_WEIGHT,
    meta_features=all_meta_features,
    deduplicate=True 
)

print("df_ready_model shape:", df_ready_model.shape)

# Estimate vocabulary size
vocab_size = estimate_vocab_size(df_ready_model['text_clean'])
print(f"\nEstimated vocabulary size (TF-IDF params): {vocab_size:,} n-grams")
print(f"Limited to max_features: {min(vocab_size, TFIDF_WORD_TEMPLATE.max_features):,} n-grams")

feature_dim = min(vocab_size, TFIDF_WORD_TEMPLATE.max_features or vocab_size) + len(all_meta_features)
print(f"Total feature dimensionality (approx): {feature_dim:,}")
print(f"Deduplicated rows: {len(df) - len(df_ready_model)} "
      f"({(len(df) - len(df_ready_model)) / len(df) * 100:.2f}%)")

print("\n=== FEATURE REPRESENTATION COMPLETE ===")
print("Templates & factory ready. Using df_ready_model for training.")

=== FEATURE REPRESENTATION SETUP ===

â€¢ Text Representation:
  - Using TF-IDF Vectorizer with 1-2 grams
  - Minimum document frequency: 2
  - Sublinear TF scaling: Yes
  - English stopwords: Yes
  - Max features: 150,000

=== META-FEATURES INCLUDED ===
Subject-related:
  - subject_has_re
  - subject_has_fwd
  - subject_has_currency
  - subject_digit_count
  - subject_exclamation_count
  - subject_length
  - subject_body_ratio

Body-related:
  - body_phone_count
  - body_digit_count
  - body_exclamation_count
  - body_spam_word_count
  - body_length

â€¢ Creating weighted text (subjectÃ—2):
â€¢ Cleaning text with full pipeline
â€¢ Removed 178 duplicates (3.44% of data)
  â†’ Replaced empty texts with '[EMPTY_TEXT]' placeholder
df_ready_model shape: (4993, 14)

Estimated vocabulary size (TF-IDF params): 71,878 n-grams
Limited to max_features: 71,878 n-grams
Total feature dimensionality (approx): 71,890
Deduplicated rows: 178 (3.44%)

=== FEATURE REPRESENTATION COMPLETE ===
Templates & 

In [24]:
print("=== BASELINE MODEL SETUP ===")
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from scipy.sparse import issparse

baseline_model = LogisticRegression(
    class_weight="balanced",
    max_iter=2000,
    solver="liblinear",
    random_state=RANDOM_STATE,
    n_jobs=1
)

# Riusa la factory della Cella 2
features_transformer_A = build_features_transformer(all_meta_features, mode='word')
features_transformer_B = build_features_transformer(all_meta_features, mode='word_char')

pipeline_A = Pipeline([('features', features_transformer_A),
                       ('classifier', baseline_model)])
pipeline_B = Pipeline([('features', features_transformer_B),
                       ('classifier', baseline_model)])

# Test rapido memoria/densitÃ  su un campione
sample_size = min(1000, len(df_ready_model))
X_sample = df_ready_model.iloc[:sample_size]
XA = features_transformer_A.fit_transform(X_sample)
XB = features_transformer_B.fit_transform(X_sample)
print("A sparse?", issparse(XA), XA.shape)
print("B sparse?", issparse(XB), XB.shape)

print("\n=== BASELINE MODEL READY ===")
print("Default: Version A (Word n-grams); Version B per confronto.")


=== BASELINE MODEL SETUP ===
A sparse? True (1000, 13850)
B sparse? True (1000, 63850)

=== BASELINE MODEL READY ===
Default: Version A (Word n-grams); Version B per confronto.


In [None]:
# Cell 4 â€” Cross-validation + metrics

print("=== CROSS-VALIDATION & METRICS ===")

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    average_precision_score, roc_auc_score, f1_score,
    precision_recall_curve, confusion_matrix
)
import numpy as np
import pandas as pd

# --- 1. Stratified 80/20 split ---
print("\nâ€¢ Stratified 80/20 split (holdout test set)")
X = df_ready_model.drop(columns=['label_num'])
y = df_ready_model['label_num'].values

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)
print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
print(f"Spam % in train: {np.mean(y_train)*100:.2f}%, test: {np.mean(y_test)*100:.2f}%")

# --- 2. CV setup ---
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# --- 3. CV loop ---
pr_aucs, roc_aucs, f1s_05 = [], [], []
f1_max_thresholds, highrec_thresholds = [], []
oof_pred = np.zeros(len(X_train))
oof_true = np.zeros(len(X_train))
fold_indices = np.zeros(len(X_train), dtype=int)

print("\nâ€¢ Starting cross-validation (pipeline_A)...")
for fold, (tr_idx, val_idx) in enumerate(cv.split(X_train, y_train), 1):
    print(f"\n--- Fold {fold} ---")
    X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train[tr_idx], y_train[val_idx]

    # Fit pipeline
    pipeline_A.fit(X_tr, y_tr)

    # Predict probabilities
    y_val_proba = pipeline_A.predict_proba(X_val)[:, 1]

    # Save OOF predictions
    oof_pred[val_idx] = y_val_proba
    oof_true[val_idx] = y_val
    fold_indices[val_idx] = fold

    # Metrics
    pr_auc = average_precision_score(y_val, y_val_proba)
    roc_auc = roc_auc_score(y_val, y_val_proba)
    y_val_pred_05 = (y_val_proba >= 0.5).astype(int)
    f1_05 = f1_score(y_val, y_val_pred_05, pos_label=1)

    pr_aucs.append(pr_auc)
    roc_aucs.append(roc_auc)
    f1s_05.append(f1_05)
               # <-- niente -1

    # Threshold tuning
    precisions, recalls, thresholds = precision_recall_curve(y_val, y_val_proba)
    prec_t, rec_t = precisions[:-1], recalls[:-1]
    f1_scores = 2 * (prec_t * rec_t) / (prec_t + rec_t + 1e-8)
    best_idx = int(np.nanargmax(f1_scores))
    f1_max_thr = thresholds[best_idx]
    f1_max_thresholds.append(f1_max_thr)

    # High-recall threshold (Recall >= 0.95)
    highrec_thr = None
    mask = rec_t >= 0.95
    if np.any(mask):  # exclude last point (always recall=1)
        highrec_thr = thresholds[np.where(mask)[0][-1]]
    else:
        highrec_thr = thresholds[0]  # fallback: lowest threshold
    highrec_thresholds.append(highrec_thr)

    print(f"PR-AUC: {pr_auc:.4f} | ROC-AUC: {roc_auc:.4f} | F1@0.5: {f1_05:.4f}")
    print(f"Best F1 threshold: {f1_max_thr:.4f} | High-recall (recallâ‰¥0.95) threshold: {highrec_thr:.4f}")

# --- 4. Aggregate metrics report ---
def mean_std(arr):
    return f"{np.mean(arr):.4f} Â± {np.std(arr):.4f}"

print("\n=== CV RESULTS (mean Â± std) ===")
print(f"PR-AUC:    {mean_std(pr_aucs)}")
print(f"ROC-AUC:   {mean_std(roc_aucs)}")
print(f"F1@0.5:    {mean_std(f1s_05)}")
print(f"F1-max threshold (mean):      {np.mean(f1_max_thresholds):.4f}")
print(f"High-recall threshold (mean): {np.mean(highrec_thresholds):.4f}")

# --- 5. Aggregated OOF confusion matrix ---
# Chosen threshold: mean of F1-max thresholds
chosen_thr = np.mean(f1_max_thresholds)
oof_pred_label = (oof_pred >= chosen_thr).astype(int)
cm = confusion_matrix(oof_true, oof_pred_label)
cm_df = pd.DataFrame(cm, index=["HAM (true)", "SPAM (true)"], columns=["HAM (pred)", "SPAM (pred)"])

print(f"\n=== OOF CONFUSION MATRIX (mean F1-max threshold = {chosen_thr:.4f}) ===")
print(cm_df)

# Global OOF metrics
oof_pr_auc  = average_precision_score(oof_true, oof_pred)
oof_roc_auc = roc_auc_score(oof_true, oof_pred)
print(f"\n=== OOF METRICS (global) ===")
print(f"PR-AUC (OOF):  {oof_pr_auc:.4f}")
print(f"ROC-AUC (OOF): {oof_roc_auc:.4f}")


# --- 6. Save results for later use ---
cv_results = {
    "pr_aucs": pr_aucs,
    "roc_aucs": roc_aucs,
    "f1s_05": f1s_05,
    "f1_max_thresholds": f1_max_thresholds,
    "highrec_thresholds": highrec_thresholds,
    "oof_pred": oof_pred,
    "oof_true": oof_true,
    "fold_indices": fold_indices,
    "chosen_thr": chosen_thr,
    "confusion_matrix": cm,
    "oof_pr_auc": oof_pr_auc,
    "oof_roc_auc": oof_roc_auc
}
print("\n=== CV COMPLETE ===")