In [5]:
# Import libraries
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import warnings
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm
# Install a process-wide "ignore" filter: every warning category from every
# module is silenced from here on (this also hides pandas/networkx
# deprecation and chained-assignment warnings).
warnings.filterwarnings('ignore')
# Plot styling: apply matplotlib's 'seaborn-v0_8-darkgrid' style sheet first,
# then set seaborn's "husl" colour palette on top of it.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")


ModuleNotFoundError: No module named 'networkx'

In [None]:
# Load data
# Reads the merged e-mail table and prints a short summary. The mean of the
# `label` column is reported as the spam rate, i.e. label is treated as a
# 0/1 indicator with 1 = spam.
df = pd.read_csv("../data/processed/graph_merge.csv")
print(f"Dataset loaded: {len(df):,} emails")
# "\n" starts each section on a fresh line; the previous "\ " was an invalid
# escape sequence that printed a literal backslash.
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nLabel distribution:")
print(df['label'].value_counts())
print(f"\nSpam rate: {df['label'].mean()*100:.1f}%")

Dataset loaded: 49,529 emails
\ Columns: ['sender', 'receiver', 'date', 'subject', 'body', 'label', 'urls']
\ Label distribution:
label
1    28126
0    21403
Name: count, dtype: int64

Spam rate: 56.8%


In [37]:
# Clean data
# Drop rows without a sender or receiver, then normalise both address columns
# (string type, trimmed, lower-cased) so that one address maps to one node.
# .copy() gives an independent frame, so the column assignments below are not
# chained assignments on a slice of the loaded frame.
df = df.dropna(subset=['sender', 'receiver']).copy()
df['sender'] = df['sender'].astype(str).str.strip().str.lower()
df['receiver'] = df['receiver'].astype(str).str.strip().str.lower()
# Discard rows whose address is the literal string 'nan'.
df = df[(df['sender'] != 'nan') & (df['receiver'] != 'nan')]

print(f"After cleaning: {len(df):,} emails")

# Build directed graph
# One edge per (sender, receiver) pair carrying:
#   weight     - number of e-mails on that pair
#   spam_count - sum of `label` over those e-mails
#   ham_count  - weight - spam_count
# The per-pair totals are aggregated in a single groupby instead of a Python
# loop over every row. sort=False keeps pairs in order of first appearance,
# so nodes and edges are inserted in the same order as a row-by-row build.
edge_stats = (
    df.groupby(['sender', 'receiver'], sort=False)['label']
      .agg(['size', 'sum'])
      .reset_index()
)

G = nx.DiGraph()
for sender, receiver, n_emails, n_spam in edge_stats.itertuples(index=False, name=None):
    G.add_edge(sender, receiver,
               weight=n_emails,
               spam_count=n_spam,
               ham_count=n_emails - n_spam)

# Guard the average-degree division so an empty graph prints 0.00 instead of
# raising ZeroDivisionError.
n_nodes = G.number_of_nodes()
avg_degree = sum(dict(G.degree()).values()) / n_nodes if n_nodes else 0.0

print(f"\n{'='*60}")
print(f" GRAPH STATISTICS:")
print(f"{'='*60}")
print(f"  Nodes (unique emails):     {n_nodes:,}")
print(f"  Edges (connections):       {G.number_of_edges():,}")
print(f"  Network density:           {nx.density(G):.6f}")
print(f"  Avg degree:                {avg_degree:.2f}")

After cleaning: 49,529 emails

 GRAPH STATISTICS:
  Nodes (unique emails):     37,670
  Edges (connections):       37,171
  Network density:           0.000026
  Avg degree:                1.97


In [38]:
# Whole-graph metrics, computed once and looked up per sender below.
pagerank = nx.pagerank(G, max_iter=50)
clustering = nx.clustering(G.to_undirected())
out_degrees = dict(G.out_degree())
in_degrees = dict(G.in_degree())

# One record per sender that has at least one outgoing edge in G.
sender_features = []
for sender, group in df.groupby('sender'):
    out_deg = out_degrees.get(sender, 0)
    # Skip senders that are absent from the graph or never send to anyone.
    if sender not in G or out_deg == 0:
        continue

    # E-mail counts for this sender, taken from the row-level data.
    total_sent = len(group)
    spam_ratio = group['label'].sum() / total_sent

    # Edge-based statistics over the sender's distinct receivers.
    receivers = list(G.successors(sender))
    if receivers:
        # Share of receivers that also have an edge back to the sender.
        replied = [r for r in receivers if G.has_edge(r, sender)]
        reciprocity = len(replied) / len(receivers)
        # Mean number of e-mails per outgoing edge.
        avg_weight = np.mean([G[sender][r]['weight'] for r in receivers])
    else:
        reciprocity = 0
        avg_weight = 0

    record = {
        'sender': sender,
        'out_degree': out_deg,
        'in_degree': in_degrees.get(sender, 0),
        'total_sent': total_sent,
        'spam_ratio': spam_ratio,
        'reciprocity': reciprocity,

        # Graph-derived features
        'pagerank': pagerank.get(sender, 0),
        'clustering': clustering.get(sender, 0),
        'degree_centrality': out_deg / (G.number_of_nodes() - 1),
        'avg_weight': avg_weight,

        # Label: a sender counts as a spammer when more than 80% of their
        # e-mails are labelled spam.
        'is_spammer': int(spam_ratio > 0.8),
    }
    sender_features.append(record)

features_df = pd.DataFrame(sender_features)
print(features_df.head())


                                 sender  out_degree  in_degree  total_sent  \
0                    "  coinbase   " <>           1          0           1   
1  " " <netflix_mails@skybluefoods.net>           1          0           1   
2   " " <wellsfargo_online@knology.net>           1          0           1   
3  " " <wellsfargo_online@pflagscv.net>           1          0           1   
4       " amex " <priemna@leluk.org.ua>           1          0           1   

   spam_ratio  reciprocity  pagerank  clustering  degree_centrality  \
0         1.0          0.0  0.000011         0.0           0.000027   
1         1.0          0.0  0.000011         0.0           0.000027   
2         1.0          0.0  0.000011         0.0           0.000027   
3         1.0          0.0  0.000011         0.0           0.000027   
4         1.0          0.0  0.000011         0.0           0.000027   

   avg_weight  is_spammer  
0         1.0           1  
1         1.0           1  
2         1.0       

In [39]:
# Compare spammer vs. legitimate senders feature by feature: group means,
# group medians, the ratio of means, and which group is higher.
comparison_features = ['out_degree', 'in_degree', 'reciprocity', 'total_sent',
                       'pagerank', 'clustering', 'avg_weight']

# Split once instead of re-filtering the frame for every feature.
spam_rows = features_df[features_df['is_spammer'] == 1]
legit_rows = features_df[features_df['is_spammer'] == 0]

rows = []
for feature in comparison_features:
    spam_vals = spam_rows[feature]
    legit_vals = legit_rows[feature]

    spam_mean = spam_vals.mean()
    legit_mean = legit_vals.mean()
    spam_med = spam_vals.median()
    legit_med = legit_vals.median()

    # Ratio of means is only meaningful against a positive baseline.
    #   legit_mean > 0                  -> ordinary ratio
    #   legit_mean == 0, spam_mean > 0  -> inf (spam-only signal)
    #   anything else (both zero, negative baseline, empty group) -> NaN
    if legit_mean > 0:
        ratio = spam_mean / legit_mean
    elif legit_mean == 0 and spam_mean > 0:
        ratio = float('inf')
    else:
        ratio = float('nan')

    # Decide the direction from the means themselves, not from the ratio:
    # the ratio's fallback value used to label "0 vs 0" as "Spam higher".
    if spam_mean > legit_mean:
        interpretation = "Spam higher"
    elif spam_mean < legit_mean:
        interpretation = "Legit higher"
    elif spam_mean == legit_mean:
        interpretation = "No difference"
    else:
        # At least one mean is NaN (e.g. one of the groups is empty).
        interpretation = "n/a"

    rows.append([
        feature,
        round(spam_mean, 4),
        round(legit_mean, 4),
        round(spam_med, 4),
        round(legit_med, 4),
        round(ratio, 2),
        interpretation
    ])

compare_table = pd.DataFrame(rows, columns=[
    "Feature", "Spam Mean", "Legit Mean", "Spam Median", "Legit Median", "Ratio", "Interpretation"
])

print(compare_table.to_string(index=False))


    Feature  Spam Mean  Legit Mean  Spam Median  Legit Median  Ratio Interpretation
 out_degree     1.0414      1.9879          1.0           1.0   0.52   Legit higher
  in_degree     0.2702      0.9860          0.0           0.0   0.27   Legit higher
reciprocity     0.0082      0.0361          0.0           0.0   0.23   Legit higher
 total_sent     1.0543      4.5291          1.0           1.0   0.23   Legit higher
   pagerank     0.0000      0.0000          0.0           0.0   1.12    Spam higher
 clustering     0.0002      0.0633          0.0           0.0   0.00   Legit higher
 avg_weight     1.0088      2.4716          1.0           1.0   0.41   Legit higher


In [None]:


print("\n" + "="*70)
print(" COMPUTING ADVANCED GRAPH FEATURES (FAST VERSION)")
print("="*70)

# Basic graph metrics. These are computed on G exactly as it was built, so
# any self-loop (sender == receiver) is still counted here.
print("\n[1/4] Computing basic centrality measures...")
pagerank = nx.pagerank(G, max_iter=50)
clustering = nx.clustering(G.to_undirected())
out_degrees = dict(G.out_degree())
in_degrees = dict(G.in_degree())

print("[2/4] Computing closeness and eigenvector centrality...")
closeness = nx.closeness_centrality(G)
try:
    eigenvector = nx.eigenvector_centrality(G, max_iter=100)
except nx.NetworkXException:
    # Only NetworkX failures (such as no convergence within max_iter) fall
    # back to zeros; KeyboardInterrupt and programming errors propagate.
    print("   Warning: Eigenvector centrality failed, using zeros")
    eigenvector = {node: 0 for node in G.nodes()}

# HITS algorithm (Hub and Authority scores)
print("[3/4] Computing HITS (hub/authority scores)...")
hits_h, hits_a = nx.hits(G, max_iter=100)

# Additional structural features
print("[4/4] Computing additional structural features...")
# core_number() rejects graphs with self-loops. Strip them from a COPY so the
# shared graph G stays untouched: removing them from G in place made this
# cell (and every earlier cell that reads G) give different numbers on a
# second run.
G_simple = G.copy()
G_simple.remove_edges_from(list(nx.selfloop_edges(G_simple)))
core_number = nx.core_number(G_simple)  # k-core decomposition
triangles = nx.triangles(G_simple.to_undirected())  # number of triangles

# Average neighbor degree
avg_neighbor_deg = nx.average_neighbor_degree(G_simple)

# Harmonic centrality (works better for disconnected graphs)
print("   Computing harmonic centrality...")
harmonic_cent = nx.harmonic_centrality(G_simple)

# Eccentricity, restricted to the largest strongly connected component;
# every node outside that component gets 0.
print("   Computing eccentricity...")
try:
    largest_scc = max(nx.strongly_connected_components(G_simple), key=len)
    G_scc = G_simple.subgraph(largest_scc)
    eccentricity_scc = nx.eccentricity(G_scc)
    eccentricity = {node: eccentricity_scc.get(node, 0) for node in G_simple.nodes()}
except (nx.NetworkXException, ValueError):
    # ValueError: max() of an empty sequence (graph without nodes).
    eccentricity = {node: 0 for node in G_simple.nodes()}

print("\n✓ All graph features computed!\n")

# Build feature dataframe: one record per sender with at least one outgoing
# edge. Neighbourhood features below are read from G_simple (self-loops
# removed), while the degree dictionaries above were taken from G.
# NOTE(review): a sender that e-mailed itself therefore has out_degree one
# higher than len(receivers) - confirm whether that is intended.
sender_features = []
print("Extracting features for each sender...")

# Denominator of the degree-centrality feature; constant for the whole loop.
max_possible_degree = G_simple.number_of_nodes() - 1

for sender, group in tqdm(df.groupby('sender'), desc="Processing senders"):
    if sender not in G_simple:
        continue

    out_deg = out_degrees.get(sender, 0)
    if out_deg == 0:
        continue

    in_deg = in_degrees.get(sender, 0)

    # Email statistics
    total_sent = len(group)
    spam_sent = group['label'].sum()
    spam_ratio = spam_sent / total_sent

    # Direct neighbourhood, looked up once and reused below.
    receivers = list(G_simple.successors(sender))
    predecessors = list(G_simple.predecessors(sender))
    # A node that is both receiver and predecessor appears twice on purpose,
    # so it weighs double in the neighbour averages.
    neighbors = receivers + predecessors

    # Mutual connections (both send to each other) and reciprocity
    mutual_connections = sum(1 for r in receivers if G_simple.has_edge(r, sender))
    reciprocity = mutual_connections / len(receivers) if receivers else 0
    mutual_ratio = mutual_connections / out_deg if out_deg > 0 else 0

    # Weights of the outgoing edges, gathered once for all weight features.
    out_weights = [G_simple[sender][r]['weight'] for r in receivers]

    # Average weight of outgoing edges
    avg_weight = np.mean(out_weights) if receivers else 0

    # Max weight (strongest connection)
    max_weight = max(out_weights) if receivers else 0

    # Weight variance (consistency of connections)
    weight_variance = np.var(out_weights) if len(receivers) > 1 else 0

    # Connection strength concentration (Gini coefficient of weights)
    if len(receivers) > 1:
        weights = sorted(out_weights)
        n = len(weights)
        index = np.arange(1, n + 1)
        gini = (2 * np.sum(index * weights)) / (n * np.sum(weights)) - (n + 1) / n
    else:
        gini = 0

    # Ego network features (radius 1 around the sender)
    ego_graph = nx.ego_graph(G_simple, sender, radius=1)
    ego_density = nx.density(ego_graph) if ego_graph.number_of_nodes() > 1 else 0
    ego_nodes = ego_graph.number_of_nodes() - 1  # exclude ego itself
    ego_edges = ego_graph.number_of_edges()

    # In/Out degree ratio (epsilon keeps the division finite)
    in_out_ratio = in_deg / (out_deg + 1e-10)

    # Degree difference
    degree_diff = out_deg - in_deg

    # Local clustering of neighbors
    neighbor_clustering = (
        np.mean([clustering.get(n, 0) for n in neighbors])
        if neighbors else 0
    )

    # Average PageRank of neighbors (do they connect to important people?)
    neighbor_pagerank = (
        np.mean([pagerank.get(n, 0) for n in neighbors])
        if neighbors else 0
    )

    # Receiver diversity
    receiver_diversity = len(set(receivers)) / (out_deg + 1e-10)

    # Unique receivers ratio
    unique_receiver_ratio = len(set(receivers)) / total_sent if total_sent > 0 else 0

    # Incoming email statistics
    num_senders_to_me = len(predecessors)

    sender_features.append({
        # Basic info
        'sender': sender,
        'total_sent': total_sent,
        'spam_ratio': spam_ratio,
        # Label: spammer when more than 80% of the sender's e-mails are spam.
        'is_spammer': 1 if spam_ratio > 0.8 else 0,

        # Degree features
        'out_degree': out_deg,
        'in_degree': in_deg,
        'total_degree': out_deg + in_deg,
        'in_out_ratio': in_out_ratio,
        'degree_diff': degree_diff,

        # Centrality features (FAST ONLY)
        'pagerank': pagerank.get(sender, 0),
        'closeness': closeness.get(sender, 0),
        'eigenvector': eigenvector.get(sender, 0),
        'harmonic_centrality': harmonic_cent.get(sender, 0),
        'eccentricity': eccentricity.get(sender, 0),
        'degree_centrality': out_deg / max_possible_degree,

        # HITS scores
        'hub_score': hits_h.get(sender, 0),
        'authority_score': hits_a.get(sender, 0),

        # Clustering and community
        'clustering': clustering.get(sender, 0),
        'triangles': triangles.get(sender, 0),
        'core_number': core_number.get(sender, 0),
        'neighbor_clustering': neighbor_clustering,
        'neighbor_pagerank': neighbor_pagerank,

        # Reciprocity and interaction
        'reciprocity': reciprocity,
        'mutual_connections': mutual_connections,
        'mutual_ratio': mutual_ratio,
        'num_senders_to_me': num_senders_to_me,

        # Weight features
        'avg_weight': avg_weight,
        'max_weight': max_weight,
        'weight_variance': weight_variance,
        'weight_gini': gini,

        # Diversity features
        'avg_neighbor_degree': avg_neighbor_deg.get(sender, 0),
        'receiver_diversity': receiver_diversity,
        'unique_receiver_ratio': unique_receiver_ratio,

        # Ego network features
        'ego_density': ego_density,
        'ego_nodes': ego_nodes,
        'ego_edges': ego_edges,
    })

features_df = pd.DataFrame(sender_features)

print("\n" + "="*70)
print(" FEATURE EXTRACTION COMPLETE")
print("="*70)
print(f"Total features: {len(features_df.columns)}")
print(f"Senders analyzed: {len(features_df):,}")
print(f"Spammers: {features_df['is_spammer'].sum():,}")
print(f"Legitimate: {(features_df['is_spammer'] == 0).sum():,}")

# Display sample
print("\n" + "="*70)
print(" SAMPLE OF EXTRACTED FEATURES")
print("="*70)
print(features_df.head())

# ============================================================================
# ENHANCED COMPARISON: SPAMMER vs LEGITIMATE
# ============================================================================
# For every feature: group means, group medians, ratio of means, and which
# group is higher.

comparison_features = [
    # Degree
    'out_degree', 'in_degree', 'in_out_ratio', 'degree_diff',
    # Centrality
    'pagerank', 'closeness', 'eigenvector', 'harmonic_centrality', 'eccentricity',
    'hub_score', 'authority_score',
    # Community
    'clustering', 'triangles', 'core_number', 'neighbor_clustering', 'neighbor_pagerank',
    # Reciprocity
    'reciprocity', 'mutual_ratio', 'num_senders_to_me',
    # Weights
    'avg_weight', 'max_weight', 'weight_variance', 'weight_gini',
    # Diversity
    'receiver_diversity', 'unique_receiver_ratio', 'avg_neighbor_degree',
    # Ego
    'ego_density', 'ego_nodes'
]

# Split once instead of re-filtering the frame for every feature.
spam_rows = features_df[features_df['is_spammer'] == 1]
legit_rows = features_df[features_df['is_spammer'] == 0]

rows = []
for feature in comparison_features:
    spam_vals = spam_rows[feature]
    legit_vals = legit_rows[feature]

    spam_mean = spam_vals.mean()
    legit_mean = legit_vals.mean()
    spam_med = spam_vals.median()
    legit_med = legit_vals.median()

    # Ratio of means is only meaningful against a positive baseline.
    #   legit_mean > 0                  -> ordinary ratio
    #   legit_mean == 0, spam_mean > 0  -> inf (spam-only signal)
    #   anything else -> NaN. This covers a negative baseline (degree_diff
    #   can average below zero), "0 vs 0" (e.g. the all-zero eigenvector
    #   fallback) and empty groups, all of which used to be reported as
    #   ratio=inf / "Spam higher".
    if legit_mean > 0:
        ratio = spam_mean / legit_mean
    elif legit_mean == 0 and spam_mean > 0:
        ratio = float('inf')
    else:
        ratio = float('nan')

    # Direction comes from the means themselves, never from the ratio.
    if spam_mean > legit_mean:
        interpretation = "Spam higher"
    elif spam_mean < legit_mean:
        interpretation = "Legit higher"
    elif spam_mean == legit_mean:
        interpretation = "No difference"
    else:
        # At least one mean is NaN (e.g. one of the groups is empty).
        interpretation = "n/a"

    rows.append([
        feature,
        round(spam_mean, 6),
        round(legit_mean, 6),
        round(spam_med, 6),
        round(legit_med, 6),
        round(ratio, 3),
        interpretation
    ])

compare_table = pd.DataFrame(rows, columns=[
    "Feature", "Spam Mean", "Legit Mean", "Spam Median", "Legit Median",
    "Ratio", "Interpretation"
])

print("\n" + "="*70)
print(" COMPREHENSIVE FEATURE COMPARISON: SPAMMER vs LEGITIMATE")
print("="*70)
print(compare_table.to_string(index=False))

# Highlight top discriminative features
print("\n" + "="*70)
print(" TOP FEATURES THAT DISTINGUISH SPAMMERS")
print("="*70)

# Rank by |ln(ratio)| so "10x lower" and "10x higher" score the same.
# |ratio - 1| capped every "Legit higher" feature below 1 while "Spam higher"
# features were unbounded. A ratio of 0 or inf scores inf; a negative or NaN
# ratio scores NaN. The column name 'abs_diff' is kept for any later cell
# that reads it.
with np.errstate(divide='ignore', invalid='ignore'):
    compare_table['abs_diff'] = np.abs(np.log(compare_table['Ratio'].astype(float)))
top_features = compare_table.nlargest(10, 'abs_diff')

print("\nTop 10 most discriminative features:")
print(top_features[['Feature', 'Spam Mean', 'Legit Mean', 'Ratio']].to_string(index=False))

# Key insights
print("\n" + "="*70)
print(" KEY INSIGHTS")
print("="*70)

spam = features_df[features_df['is_spammer'] == 1]
legit = features_df[features_df['is_spammer'] == 0]


def _print_group_means(title, entries, label_width):
    """Print one "Spam=... vs Legit=..." line of group means per entry.

    title       -- section heading, printed first
    entries     -- iterable of (label, column, decimals): the display label,
                   the features_df column to average, and the number of
                   decimals to show
    label_width -- labels are left-aligned and padded to this width
    """
    print(title)
    for label, column, decimals in entries:
        spam_mean = spam[column].mean()
        legit_mean = legit[column].mean()
        print(f"   {label:<{label_width}}"
              f"Spam={spam_mean:.{decimals}f} vs Legit={legit_mean:.{decimals}f}")


_print_group_means("\n🎯 CENTRALITY DIFFERENCES:", [
    ("PageRank:", 'pagerank', 6),
    ("Closeness:", 'closeness', 6),
    ("Eigenvector:", 'eigenvector', 6),
    ("Hub score:", 'hub_score', 6),
    ("Authority:", 'authority_score', 6),
], label_width=14)

_print_group_means("\n📊 STRUCTURAL DIFFERENCES:", [
    ("Core number:", 'core_number', 2),
    ("Triangles:", 'triangles', 2),
    ("Ego density:", 'ego_density', 4),
    ("Clustering:", 'clustering', 4),
], label_width=14)

_print_group_means("\n🔗 INTERACTION PATTERNS:", [
    ("In/Out ratio:", 'in_out_ratio', 4),
    ("Reciprocity:", 'reciprocity', 4),
    ("Mutual ratio:", 'mutual_ratio', 4),
    ("Receiver div:", 'receiver_diversity', 4),
    ("Weight Gini:", 'weight_gini', 4),
    ("Neighbor PR:", 'neighbor_pagerank', 6),
], label_width=18)

print("\n" + "="*70)


 COMPUTING ADVANCED GRAPH FEATURES (FAST VERSION)

[1/4] Computing basic centrality measures...
[2/4] Computing closeness and eigenvector centrality...
[3/4] Computing HITS (hub/authority scores)...
[4/4] Computing additional structural features...
   Computing harmonic centrality...
   Computing eccentricity...

âœ“ All graph features computed!

Extracting features for each sender...


Processing senders: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 31398/31398 [00:01<00:00, 16262.59it/s]



 FEATURE EXTRACTION COMPLETE
Total features: 36
Senders analyzed: 31,398
Spammers: 26,671
Legitimate: 4,727

 SAMPLE OF EXTRACTED FEATURES
                                 sender  total_sent  spam_ratio  is_spammer  \
0                    "  coinbase   " <>           1         1.0           1   
1  " " <netflix_mails@skybluefoods.net>           1         1.0           1   
2   " " <wellsfargo_online@knology.net>           1         1.0           1   
3  " " <wellsfargo_online@pflagscv.net>           1         1.0           1   
4       " amex " <priemna@leluk.org.ua>           1         1.0           1   

   out_degree  in_degree  total_degree  in_out_ratio  degree_diff  pagerank  \
0           1          0             1           0.0            1  0.000011   
1           1          0             1           0.0            1  0.000011   
2           1          0             1           0.0            1  0.000011   
3           1          0             1           0.0            1  0.