In [25]:

#!/usr/bin/env python3
%pip install pandas numpy matplotlib scikit-learn seaborn tqdm

# Import Block
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_curve, auc, roc_auc_score, confusion_matrix, classification_report
)
from glob import glob
import os
from tqdm import tqdm



Note: you may need to restart the kernel to use updated packages.


In [37]:
import os
import pandas as pd
import glob

# Prediction files for every evaluation category.
# Each category name maps to two files: the multimodal model's CSV output and
# the original model's TXT output. Both follow the "<category>_predict.<ext>"
# naming scheme inside their respective results folder.
categories = {
    name: {
        "multimodal": f"/home/tiffanybao/PhishIntention/results/4.11/Multimodal/{name}_predict.csv",
        "original": f"/home/tiffanybao/PhishIntention/results/4.11/Original/{name}_predict.txt",
    }
    for name in (
        "benign_with_logo",
        "benign_without_logo",
        "Fresh_Logo_Phishing",
        "Learned_Logo_Phishing",
        "No_Logo_Phishing",
    )
}



def load_multimodal_file(file_path):
    """
    Load one multimodal prediction CSV and keep a single row per URL.

    The file is read with its header row. If the URL column is spelled
    ``URL`` (and no ``url`` column exists) it is renamed to ``url`` so that
    callers can always use the lowercase name.

    Parameters
    ----------
    file_path : str
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with duplicate ``url`` values removed (the first
        occurrence is kept). If the file cannot be read, or it has no URL
        column, the error is printed and an empty frame that still has a
        ``url`` column is returned.
    """
    try:
        df = pd.read_csv(file_path)
        # Standardize the URL column name if needed.
        if "url" not in df.columns and "URL" in df.columns:
            df = df.rename(columns={"URL": "url"})
        # Raises KeyError when no URL column exists; handled below.
        return df.drop_duplicates(subset=["url"])
    except Exception as e:
        # Best-effort loading: report the problem and keep going. The fallback
        # keeps the "url" column so that `df["url"]` in the calling cells does
        # not raise a second, unrelated KeyError after a failed load.
        print(f"Error loading multimodal file {file_path}: {e}")
        return pd.DataFrame(columns=["url"])

def load_original_file(file_path):
    """
    Load one prediction TXT file from the original (Phishintention) model and
    keep a single row per URL.

    The file is expected to be tab-separated, without a header row, encoded
    as ISO-8859-1, and to hold exactly these six columns in this order:
      folder, url, phish_category, pred_target, matched_domain, siamese_conf

    Parameters
    ----------
    file_path : str
        Path to the tab-separated TXT file.

    Returns
    -------
    pandas.DataFrame
        The rows of the file with duplicate ``url`` values removed (the first
        occurrence is kept). If the file cannot be read or does not have six
        columns, the error is printed and an empty frame that still carries
        the six expected column names is returned.
    """
    expected_columns = [
        "folder", "url", "phish_category", "pred_target", "matched_domain", "siamese_conf",
    ]
    try:
        df = pd.read_csv(file_path, sep="\t", header=None, encoding='ISO-8859-1')
        # Raises ValueError when the file does not have exactly six columns.
        df.columns = expected_columns
        return df.drop_duplicates(subset=["url"])
    except Exception as e:
        # Best-effort loading: report the problem and keep going. The fallback
        # keeps the expected columns so that `df["url"]` in the calling cells
        # does not raise a KeyError; the frame is still `.empty`.
        print(f"Error loading original file {file_path}: {e}")
        return pd.DataFrame(columns=expected_columns)

# Loop through each category, load both prediction files, and report how many
# predictions each model produced and how many URLs the two models share.
# The loaders already drop duplicate URLs, so the counts are post-deduplication.
for category, paths in categories.items():
    print(f"\nCategory: {category}")

    # Load multimodal predictions. A failed load can return a frame without a
    # "url" column, so fall back to an empty URL set instead of raising KeyError.
    mm_path = paths["multimodal"]
    mm_df = load_multimodal_file(mm_path)
    mm_count = len(mm_df)
    mm_urls = set(mm_df["url"].dropna().unique()) if "url" in mm_df.columns else set()
    mm_unique = len(mm_urls)
    print(f"  Multimodal predictions count: {mm_count} (Unique URLs: {mm_unique})")

    # Load original predictions if available
    orig_path = paths["original"]
    if orig_path is not None:
        orig_df = load_original_file(orig_path)
        orig_count = len(orig_df)
        # Same guard as above for a failed load of the original file.
        orig_urls = set(orig_df["url"].dropna().unique()) if "url" in orig_df.columns else set()
        orig_unique = len(orig_urls)
        print(f"  Original predictions count: {orig_count} (Unique URLs: {orig_unique})")

        # Common samples are the URLs present in both models' outputs.
        common_urls = mm_urls.intersection(orig_urls)
        print(f"  Common predicted samples count: {len(common_urls)}")
    else:
        print("  Original predictions not available for this category.")



Category: benign_with_logo
  Multimodal predictions count: 242 (Unique URLs: 242)
  Original predictions count: 249 (Unique URLs: 249)
  Common predicted samples count: 241

Category: benign_without_logo
  Multimodal predictions count: 250 (Unique URLs: 250)
  Original predictions count: 249 (Unique URLs: 249)
  Common predicted samples count: 249

Category: Fresh_Logo_Phishing
  Multimodal predictions count: 238 (Unique URLs: 238)
  Original predictions count: 225 (Unique URLs: 225)
  Common predicted samples count: 225

Category: Learned_Logo_Phishing
  Multimodal predictions count: 244 (Unique URLs: 244)
  Original predictions count: 239 (Unique URLs: 239)
  Common predicted samples count: 239

Category: No_Logo_Phishing
  Multimodal predictions count: 230 (Unique URLs: 230)
  Original predictions count: 228 (Unique URLs: 228)
  Common predicted samples count: 222


In [42]:
# Per-category DataFrames restricted to the URLs that BOTH models produced a
# prediction for, keyed by category name.
multi_common_dfs = {}
ori_common_dfs = {}

# Loop over each category, load the data and keep only rows whose URLs appear
# in both model outputs.
for category, paths in categories.items():
    print(f"\nCategory: {category}")

    # Load multimodal predictions (CSV)
    mm_df = load_multimodal_file(paths["multimodal"])
    # For original predictions, load the TXT file (if available)
    if paths["original"] is not None:
        orig_df = load_original_file(paths["original"])
    else:
        orig_df = pd.DataFrame()

    # A frame coming from a failed load may be empty and may lack the "url"
    # column; treat both sides the same way so neither lookup raises KeyError.
    mm_has_urls = "url" in mm_df.columns
    ori_has_urls = (not orig_df.empty) and "url" in orig_df.columns

    # Get the unique URL sets from each DataFrame
    mm_urls = set(mm_df["url"].dropna().unique()) if mm_has_urls else set()
    ori_urls = set(orig_df["url"].dropna().unique()) if ori_has_urls else set()

    # Compute the common URLs (intersection)
    common_urls = mm_urls.intersection(ori_urls)
    print(f"  Common predicted samples count: {len(common_urls)}")

    # Keep only the rows for common URLs; .copy() detaches the result from the
    # loaded frame.
    mm_common_df = mm_df[mm_df["url"].isin(common_urls)].copy() if mm_has_urls else pd.DataFrame()
    ori_common_df = orig_df[orig_df["url"].isin(common_urls)].copy() if ori_has_urls else pd.DataFrame()

    # Store the filtered DataFrames in the dictionaries
    multi_common_dfs[category] = mm_common_df
    ori_common_dfs[category] = ori_common_df

    # Also expose each frame as a module-level variable, because other cells
    # refer to these names directly. The category is lower-cased in the name:
    # "Fresh_Logo_Phishing" becomes "multi_common_fresh_logo_phishing" and
    # "ori_common_fresh_logo_phishing".
    global_var_multi = f"multi_common_{category.lower()}"
    global_var_ori   = f"ori_common_{category.lower()}"
    globals()[global_var_multi] = mm_common_df
    globals()[global_var_ori] = ori_common_df

    print(f"  Multimodal common DataFrame shape: {mm_common_df.shape}")
    if not ori_common_df.empty:
        print(f"  Original common DataFrame shape: {ori_common_df.shape}")
    else:
        print("  No original DataFrame available for this category.")

# Example of checking one of the created global variables:
# print("\nExample: multi_common_benign_with_logo shape:", multi_common_benign_with_logo.shape)



Category: benign_with_logo
  Common predicted samples count: 241
  Multimodal common DataFrame shape: (241, 10)
  Original common DataFrame shape: (241, 6)

Category: benign_without_logo
  Common predicted samples count: 249
  Multimodal common DataFrame shape: (249, 10)
  Original common DataFrame shape: (249, 6)

Category: Fresh_Logo_Phishing
  Common predicted samples count: 225
  Multimodal common DataFrame shape: (225, 10)
  Original common DataFrame shape: (225, 6)

Category: Learned_Logo_Phishing
  Common predicted samples count: 239
  Multimodal common DataFrame shape: (239, 10)
  Original common DataFrame shape: (239, 6)

Category: No_Logo_Phishing
  Common predicted samples count: 222
  Multimodal common DataFrame shape: (222, 10)
  Original common DataFrame shape: (222, 6)


In [46]:
# Destination folder for the exported common-URL tables; created on demand.
output_dir = "/home/tiffanybao/PhishIntention/results/4.12"
os.makedirs(output_dir, exist_ok=True)

# Export one CSV per category and model. All multimodal tables are written
# first, then all original-model tables. The row index is not written.
for prefix, label, frames in (
    ("multi_common", "multimodal", multi_common_dfs),
    ("ori_common", "original", ori_common_dfs),
):
    for category, df in frames.items():
        output_file = os.path.join(output_dir, f"{prefix}_{category}.csv")
        df.to_csv(output_file, index=False)
        print(f"Saved {label} common DataFrame for '{category}' to {output_file}")


Saved multimodal common DataFrame for 'benign_with_logo' to /home/tiffanybao/PhishIntention/results/4.12/multi_common_benign_with_logo.csv
Saved multimodal common DataFrame for 'benign_without_logo' to /home/tiffanybao/PhishIntention/results/4.12/multi_common_benign_without_logo.csv
Saved multimodal common DataFrame for 'Fresh_Logo_Phishing' to /home/tiffanybao/PhishIntention/results/4.12/multi_common_Fresh_Logo_Phishing.csv
Saved multimodal common DataFrame for 'Learned_Logo_Phishing' to /home/tiffanybao/PhishIntention/results/4.12/multi_common_Learned_Logo_Phishing.csv
Saved multimodal common DataFrame for 'No_Logo_Phishing' to /home/tiffanybao/PhishIntention/results/4.12/multi_common_No_Logo_Phishing.csv
Saved original common DataFrame for 'benign_with_logo' to /home/tiffanybao/PhishIntention/results/4.12/ori_common_benign_with_logo.csv
Saved original common DataFrame for 'benign_without_logo' to /home/tiffanybao/PhishIntention/results/4.12/ori_common_benign_without_logo.csv
Saved o

In [43]:
# Preview the first rows of the multimodal common-URL table for "benign_with_logo".
multi_common_dfs["benign_with_logo"].head()

Unnamed: 0,Site Folder,url,Multimodal_Decision,Image Phish Score,Image_Decision,Text Phish Score,Text_Decision,Image Features,Text Features,Fused Features
0,hatsandcaps.co.uk,https://hatsandcaps.co.uk,benign,0.0,0,0.2,0,1.00|0.00|0.66|0.00|0.00|0.00|1.00|0.50|0.02|0...,0.00|0.00|0.00|0.00|0.00|0.00|0.00|0.00|0.00|0...,0.3648|0.0000|0.2398|0.0000|0.0000|0.0000|0.36...
1,zzperformance.com,https://zzperformance.com,benign,0.0,0,0.24,0,1.00|1.00|0.85|0.00|0.00|0.00|1.00|0.70|0.01|0...,0.00|0.00|0.00|0.00|1.00|0.00|0.00|0.00|0.00|0...,0.2658|0.2658|0.2259|0.0000|0.0000|0.0000|0.26...
2,google.co.ke,https://google.co.ke,benign,0.0,0,0.37,0,1.00|1.00|0.96|0.00|1.00|0.00|1.00|0.80|0.00|0...,0.00|0.00|1.00|0.00|1.00|0.00|0.00|0.00|0.00|0...,0.2547|0.2547|0.2436|0.0000|0.2547|0.0000|0.25...
3,kannadashaadi.com,https://kannadashaadi.com,benign,0.0,0,0.33,0,1.00|0.00|0.75|0.00|0.00|0.00|1.00|0.50|0.01|0...,0.00|0.00|1.00|0.00|0.00|0.00|0.00|0.00|0.00|0...,0.3588|0.0000|0.2675|0.0000|0.0000|0.0000|0.35...
4,ilpiacenza.it,https://ilpiacenza.it,benign,0.0,0,0.2,0,1.00|0.00|0.73|0.00|0.00|0.00|1.00|0.50|0.01|0...,0.00|0.00|0.00|0.00|0.00|0.00|0.00|0.00|0.00|0...,0.3600|0.0000|0.2622|0.0000|0.0000|0.0000|0.36...
