# Phishing Website Detection using CountVectorizer

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

In [None]:
import psutil
import gc


def get_memory_usage():
    """Return the resident set size (RSS) of the current process in megabytes."""
    rss_bytes = psutil.Process().memory_info().rss
    # bytes -> kilobytes -> megabytes
    return rss_bytes / 1024 / 1024


def print_memory_usage(label=""):
    """Print the current process memory usage in MB, tagged with an optional label."""
    print(f"Memory usage {label}: {get_memory_usage():.2f} MB")


def cleanup_memory():
    """Run a garbage-collection pass and print a confirmation message."""
    # The number of collected objects returned by gc.collect() is not needed
    _ = gc.collect()
    print("Memory cleanup completed")


# Print the baseline memory usage before any dataset is loaded
print_memory_usage("(initial)")

In [None]:
import os
import joblib
import numpy as np
from scipy.sparse import save_npz, load_npz, vstack
import gc


dataset_path = "/kaggle/input/phishing-website-webcode-dataset/phishing_complete_dataset.csv"
ARTIFACT_DIR = "/kaggle/working/count/"

# Create artifacts directory if it doesn't exist
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Count the physical lines of the CSV without loading it into memory.
# The context manager guarantees the file handle is closed; undecodable bytes
# are replaced because only the number of lines matters here.
# NOTE(review): this counts text lines, not CSV records. If quoted fields
# (e.g. the page source) contain embedded newlines, the real row count is
# lower than this figure - confirm against the chunked reads below.
with open(dataset_path, encoding="utf-8", errors="replace") as dataset_file:
    total_rows = sum(1 for _ in dataset_file) - 1  # subtract 1 for header
print(f"Total rows in dataset: {total_rows}")

# Read a small sample first to understand the data structure
sample_df = pd.read_csv(dataset_path, nrows=5)
print(f"Dataset columns: {sample_df.columns.tolist()}")
print("Sample data:")
print(sample_df.head())

In [None]:
# Batch processing configuration
# Number of documents handed to CountVectorizer.transform() per batch
BATCH_SIZE = 2000
# Number of CSV rows pandas reads per chunk (passed as `chunksize` to pd.read_csv)
CHUNK_SIZE = 2000


def process_dataset_in_batches(max_rows=10000):
    """Stream the start of the dataset in chunks and print summary statistics.

    Reads the CSV at ``dataset_path`` in chunks of ``CHUNK_SIZE`` rows, tallies
    the values of the ``result`` column and collects the string lengths of the
    ``webpage_code`` column, then prints the class distribution and basic
    length statistics. Nothing is returned.

    Args:
        max_rows: Stop after at least this many rows have been read; the same
            limit caps how many code lengths are sampled. Because whole chunks
            are consumed, slightly more rows than ``max_rows`` may be analyzed.
    """

    # Running totals across chunks
    total_processed = 0
    class_counts = {}
    code_lengths = []

    print("Processing dataset in batches to analyze structure...")

    # The context manager closes the underlying file even though the loop
    # normally exits early via `break`.
    with pd.read_csv(dataset_path, chunksize=CHUNK_SIZE) as chunk_iter:
        for i, chunk in enumerate(chunk_iter):
            print(
                f"Processing chunk {i+1}, rows {total_processed+1} to {total_processed+len(chunk)}")

            # Update class distribution
            for class_label, count in chunk['result'].value_counts().items():
                class_counts[class_label] = class_counts.get(
                    class_label, 0) + count

            # Sample code lengths only until the cap is reached. Missing pages
            # yield NaN lengths, which would turn min/max into NaN, so they are
            # dropped before the values are collected.
            if len(code_lengths) < max_rows:
                chunk_lengths = chunk['webpage_code'].str.len().dropna().astype(int)
                code_lengths.extend(chunk_lengths.tolist())

            total_processed += len(chunk)

            if total_processed >= max_rows:  # Limit analysis for speed
                break

    print(f"\nDataset Analysis (first {total_processed} rows):")
    print(f"Total processed: {total_processed}")
    print(f"Class distribution: {class_counts}")

    if class_counts:
        total_samples = sum(class_counts.values())
        for class_label, count in class_counts.items():
            percentage = (count / total_samples) * 100
            print(f"Class {class_label}: {count} ({percentage:.2f}%)")

    if code_lengths:
        code_lengths_array = np.array(code_lengths)
        print("\nWebpage code length statistics:")
        print(f"Mean: {code_lengths_array.mean():.2f}")
        print(f"Std: {code_lengths_array.std():.2f}")
        print(f"Min: {code_lengths_array.min()}")
        print(f"Max: {code_lengths_array.max()}")
        print(f"Median: {np.median(code_lengths_array):.2f}")

# Run the exploratory analysis (prints class distribution and code-length statistics)
process_dataset_in_batches()

In [None]:
def create_train_test_split_batched():
    """Load the whole dataset chunk by chunk and return a stratified 80/20 split.

    The ``webpage_code`` and ``result`` columns are accumulated from every
    chunk of the CSV at ``dataset_path``; the complete set is then split with
    ``train_test_split`` (test_size=0.2, random_state=42, stratified on the
    labels).

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as pandas Series.
    """

    print("Creating train-test split with batched processing...")

    # Stratification needs every label, so all rows are gathered first
    pages, labels = [], []

    reader = pd.read_csv(dataset_path, chunksize=CHUNK_SIZE)
    for chunk_number, frame in enumerate(reader, start=1):
        print(f"Reading chunk {chunk_number} for train-test split...")
        pages += frame['webpage_code'].tolist()
        labels += frame['result'].tolist()
        del frame
        gc.collect()

    print(f"Total samples loaded: {len(pages)}")

    # train_test_split is given pandas Series built from the collected lists
    features = pd.Series(pages)
    targets = pd.Series(labels)

    X_train, X_test, y_train, y_test = train_test_split(
        features,
        targets,
        test_size=0.2,
        random_state=42,
        stratify=targets,
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Training target distribution:\n{y_train.value_counts()}")
    print(f"Test target distribution:\n{y_test.value_counts()}")

    # Drop the intermediate containers now that the split exists
    del pages, labels, features, targets
    gc.collect()

    return X_train, X_test, y_train, y_test


# Create the train-test split; the four results are kept as notebook-level variables
X_train, X_test, y_train, y_test = create_train_test_split_batched()

In [None]:
# Optional: load previously saved CountVectorizer artifacts instead of re-processing the raw data.
# load_count_processed_data() is defined in a later cell of this notebook, so run that cell first,
# then uncomment the lines below.
# loaded_count_data = load_count_processed_data()
# if loaded_count_data:
#     count_vectorizer = loaded_count_data['vectorizer']
#     X_train_count = loaded_count_data['X_train_count']
#     X_test_count = loaded_count_data['X_test_count']
#     y_train = loaded_count_data['y_train']
#     y_test = loaded_count_data['y_test']
#     print("‚úÖ Loaded previously processed CountVectorizer data")

In [None]:
def vectorize_data_count_in_batches(X_train, X_test, y_train, y_test,
                                    fit_sample_size=10000):
    """Fit a CountVectorizer and transform train/test text in batches.

    The vectorizer (max_features=5000, English stop words, uni- and bigrams)
    is fitted on at most the first ``fit_sample_size`` training documents,
    not on the whole training set. Both splits are then transformed in
    slices of ``BATCH_SIZE`` documents and each split is stacked once into a
    single sparse matrix.

    Earlier revisions wrote every batch to ``ARTIFACT_DIR`` and reloaded it
    while also keeping the batch in a list, so the disk round-trip saved no
    memory; batches are now only held in memory and stacked in one call.

    Args:
        X_train: Training documents (pandas Series or any iterable of str).
        X_test: Test documents (pandas Series or any iterable of str).
        y_train: Unused; kept so existing callers keep working.
        y_test: Unused; kept so existing callers keep working.
        fit_sample_size: Maximum number of leading training documents used to
            build the vocabulary.

    Returns:
        tuple: ``(count_vectorizer, X_train_count, X_test_count)``.
    """

    print("Starting CountVectorizer batch processing...")
    print(f"‚úì Count artifacts will be saved to: {ARTIFACT_DIR}")

    # Initialize Count vectorizer
    count_vectorizer = CountVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2)
    )

    # Plain lists make positional slicing independent of any pandas index
    X_train_list = X_train.tolist() if hasattr(
        X_train, 'tolist') else list(X_train)
    X_test_list = X_test.tolist() if hasattr(X_test, 'tolist') else list(X_test)

    print("Fitting CountVectorizer on training data...")
    train_batch_count = len(range(0, len(X_train_list), BATCH_SIZE))
    print(f"Processing {train_batch_count} training batches...")

    # The vocabulary comes from a leading sample only, which bounds fit time
    # and memory; terms that occur only in later documents are ignored.
    sample_size = min(fit_sample_size, len(X_train_list))
    count_vectorizer.fit(X_train_list[:sample_size])

    print("CountVectorizer fitted. Now transforming training data in batches...")
    train_parts = _transform_in_batches(
        count_vectorizer, X_train_list, "training")

    print("Training data CountVectorization completed. Now processing test data...")
    test_parts = _transform_in_batches(count_vectorizer, X_test_list, "test")

    # A single vstack per split avoids re-copying a growing matrix per batch
    print("Combining all training batches...")
    X_train_count = vstack(train_parts, format="csr")
    print("Combining all test batches...")
    X_test_count = vstack(test_parts, format="csr")

    print("CountVectorization completed!")
    print(f"Training data shape: {X_train_count.shape}")
    print(f"Test data shape: {X_test_count.shape}")

    # Release the text copies and per-batch matrices before returning
    del X_train_list, X_test_list, train_parts, test_parts
    gc.collect()

    return count_vectorizer, X_train_count, X_test_count


def _transform_in_batches(vectorizer, texts, label):
    """Transform *texts* in BATCH_SIZE slices and return the per-batch matrices."""
    starts = range(0, len(texts), BATCH_SIZE)
    if not starts:
        # No documents: one transform call is expected to give the empty
        # (0, n_features) matrix, so the caller can still stack and report a shape.
        return [vectorizer.transform(texts)]

    parts = []
    for number, start in enumerate(starts, start=1):
        print(f"Transforming {label} batch {number}/{len(starts)}")
        parts.append(vectorizer.transform(texts[start:start + BATCH_SIZE]))
    return parts

In [None]:
# Process data with CountVectorizer in batches
print("Starting CountVectorizer batch processing...")
print_memory_usage("(before vectorization)")

# Call the CountVectorizer batch processing function.
# These statements must sit at column 0: they are not inside any block, and
# the previous indentation made the cell fail with an IndentationError.
count_vectorizer, X_train_count, X_test_count = vectorize_data_count_in_batches(
    X_train, X_test, y_train, y_test)

# Clear original text data. After this point X_train / X_test no longer
# exist, so re-running the vectorization requires re-running the split cell.
del X_train, X_test
gc.collect()

print_memory_usage("(after vectorization and cleanup)")

In [None]:
def save_count_processed_data(count_vectorizer, X_train_count, X_test_count, y_train, y_test):
    """Write the fitted vectorizer, both feature matrices and both label arrays to ARTIFACT_DIR.

    The vectorizer is dumped with joblib, the sparse matrices with
    ``save_npz`` and the labels with ``np.save``. File sizes are printed
    afterwards as a sanity check.

    Returns:
        dict: paths of the written files under the keys ``vectorizer_path``,
        ``train_count_path``, ``test_count_path``, ``y_train_path`` and
        ``y_test_path``.
    """

    def _in_artifacts(filename):
        # Every artifact lives directly inside ARTIFACT_DIR
        return os.path.join(ARTIFACT_DIR, filename)

    def _to_array(labels):
        # pandas objects expose to_numpy(); anything else goes through np.asarray
        return labels.to_numpy() if hasattr(labels, "to_numpy") else np.asarray(labels)

    print("Saving CountVectorizer processed data to disk...")

    # Fitted vectorizer
    vectorizer_path = _in_artifacts("count_vectorizer.joblib")
    joblib.dump(count_vectorizer, vectorizer_path)
    print(f"‚úì Saved Count vectorizer to {vectorizer_path}")

    # Sparse training features
    train_count_path = _in_artifacts("X_train_count.npz")
    save_npz(train_count_path, X_train_count)
    print(f"‚úì Saved training Count data to {train_count_path}")

    # Sparse test features
    test_count_path = _in_artifacts("X_test_count.npz")
    save_npz(test_count_path, X_test_count)
    print(f"‚úì Saved test Count data to {test_count_path}")

    # Labels: convert both before writing either file
    train_labels = _to_array(y_train)
    test_labels = _to_array(y_test)

    y_train_count_path = _in_artifacts("y_train_count.npy")
    y_test_count_path = _in_artifacts("y_test_count.npy")

    np.save(y_train_count_path, train_labels)
    np.save(y_test_count_path, test_labels)
    print(f"‚úì Saved training targets to {y_train_count_path}")
    print(f"‚úì Saved test targets to {y_test_count_path}")

    # Report the size of each file that actually exists on disk
    print("\nCountVectorizer file sizes:")
    artifact_names = (
        "count_vectorizer.joblib",
        "X_train_count.npz",
        "X_test_count.npz",
        "y_train_count.npy",
        "y_test_count.npy",
    )
    for artifact_name in artifact_names:
        artifact_path = _in_artifacts(artifact_name)
        if not os.path.exists(artifact_path):
            continue
        size_mb = os.path.getsize(artifact_path) / (1024 * 1024)
        print(f"  {artifact_name}: {size_mb:.2f} MB")

    print(
        f"\n‚úÖ All CountVectorizer data saved successfully to '{ARTIFACT_DIR}'")

    return {
        'vectorizer_path': vectorizer_path,
        'train_count_path': train_count_path,
        'test_count_path': test_count_path,
        'y_train_path': y_train_count_path,
        'y_test_path': y_test_count_path
    }

In [None]:
# Save all processed CountVectorizer data; the returned dict of file paths is kept in saved_paths
saved_paths = save_count_processed_data(
    count_vectorizer, X_train_count, X_test_count, y_train, y_test)

In [None]:
def load_count_processed_data():
    """Load the saved vectorizer, feature matrices and labels from ARTIFACT_DIR.

    Returns:
        dict | None: a dict with the keys ``vectorizer``, ``X_train_count``,
        ``X_test_count``, ``y_train`` and ``y_test``; ``None`` (after printing
        the missing file names) when any of the five files is absent.
    """

    print("Loading CountVectorizer processed data from disk...")

    # Result key -> file name, in the order the files are checked
    artifact_names = {
        'vectorizer': "count_vectorizer.joblib",
        'X_train_count': "X_train_count.npz",
        'X_test_count': "X_test_count.npz",
        'y_train': "y_train_count.npy",
        'y_test': "y_test_count.npy",
    }
    artifact_paths = {
        key: os.path.join(ARTIFACT_DIR, filename)
        for key, filename in artifact_names.items()
    }

    # Refuse to load a partial set of artifacts
    missing_files = [
        filename
        for key, filename in artifact_names.items()
        if not os.path.exists(artifact_paths[key])
    ]
    if missing_files:
        print(f"‚ùå Missing files: {missing_files}")
        print("Please run the CountVectorizer data processing cells first.")
        return None

    # Fitted vectorizer
    loaded_vectorizer = joblib.load(artifact_paths['vectorizer'])
    print(f"‚úì Loaded Count vectorizer from {artifact_paths['vectorizer']}")

    # Sparse feature matrices
    loaded_X_train_count = load_npz(artifact_paths['X_train_count'])
    loaded_X_test_count = load_npz(artifact_paths['X_test_count'])
    print(f"‚úì Loaded training Count data: {loaded_X_train_count.shape}")
    print(f"‚úì Loaded test Count data: {loaded_X_test_count.shape}")

    # Label arrays
    loaded_y_train = np.load(artifact_paths['y_train'])
    loaded_y_test = np.load(artifact_paths['y_test'])
    print(f"‚úì Loaded training targets: {loaded_y_train.shape}")
    print(f"‚úì Loaded test targets: {loaded_y_test.shape}")

    print(f"\n‚úÖ All CountVectorizer data loaded successfully!")

    return {
        'vectorizer': loaded_vectorizer,
        'X_train_count': loaded_X_train_count,
        'X_test_count': loaded_X_test_count,
        'y_train': loaded_y_train,
        'y_test': loaded_y_test
    }

In [None]:
def train_models_on_count_data(X_train_count, X_test_count, y_train, y_test):
    """Fit a fixed set of classifiers on the count features and score each on the test split.

    Each model that trains successfully is scored with accuracy and F1,
    recorded in the returned mapping and dumped to ARTIFACT_DIR with joblib.
    An exception raised for one model is printed and the loop moves on to
    the next model.

    Returns:
        dict: model name -> dict with the keys ``accuracy``, ``f1_score``,
        ``predictions`` and ``model``.
    """

    separator = "=" * 60

    # Candidate estimators, keyed by the name used in reports and file names
    classifiers_count = {
        'Random Forest (Count)': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost (Count)': xgb.XGBClassifier(random_state=42, verbosity=0),
        'LightGBM (Count)': lgb.LGBMClassifier(random_state=42, verbose=-1),
        'Extra Trees (Count)': ExtraTreesClassifier(n_estimators=100, random_state=42),
        'Naive Bayes (Count)': MultinomialNB(),
        'Logistic Regression (Count)': LogisticRegression(random_state=42, max_iter=1000)
    }

    print("Training and evaluating models on CountVectorizer data...")
    print(separator)
    print(f"Training data shape: {X_train_count.shape}")
    print(f"Test data shape: {X_test_count.shape}")
    print(f"Models will be saved to: {ARTIFACT_DIR}")
    print(separator)

    count_results = {}
    for name, clf in classifiers_count.items():
        print(f"\nüöÄ Training {name}...")
        try:
            clf.fit(X_train_count, y_train)
            y_pred = clf.predict(X_test_count)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)

            # Recorded before saving, so a failed dump still leaves the scores available
            count_results[name] = dict(
                accuracy=accuracy,
                f1_score=f1,
                predictions=y_pred,
                model=clf,
            )

            print(f"   ‚úÖ Accuracy: {accuracy:.4f}")
            print(f"   ‚úÖ F1 Score: {f1:.4f}")

            # File name: lower-case, spaces -> underscores, parentheses removed
            slug = name.lower()
            for old, new in ((' ', '_'), ('(', ''), (')', '')):
                slug = slug.replace(old, new)
            model_filename = f"{slug}.joblib"
            joblib.dump(clf, os.path.join(ARTIFACT_DIR, model_filename))
            print(f"   üíæ Saved model to {model_filename}")

        except Exception as e:
            print(f"   ‚ùå Error training {name}: {e}")

    print(f"\n{separator}")
    print("üéâ CountVectorizer model training completed!")

    return count_results

In [None]:
def analyze_count_vectorizer_results(count_results):
    """Rank the trained CountVectorizer models, print a summary and plot their scores.

    Args:
        count_results: Mapping of model name to a dict holding at least the
            keys ``accuracy``, ``f1_score`` and ``model`` (the structure
            returned by ``train_models_on_count_data``).

    Returns:
        tuple: ``(analysis_df, best_model_name, best_model)``. ``analysis_df``
        has the columns Model, Accuracy, F1 Score and Full Name, sorted by
        accuracy with the best model first. If ``count_results`` is empty
        (for example because every model failed to train), the frame is empty
        and the other two items are ``None``; nothing is plotted.
    """

    print("üîç COUNTVECTORIZER MODEL ANALYSIS")
    print("=" * 50)

    columns = ['Model', 'Accuracy', 'F1 Score', 'Full Name']

    # Without this guard an empty mapping yields a column-less frame and
    # sort_values('Accuracy') raises KeyError.
    if not count_results:
        print("No model results to analyze.")
        return pd.DataFrame(columns=columns), None, None

    # Create analysis dataframe
    analysis_data = []
    for name, result in count_results.items():
        model_name = name.replace(" (Count)", "")

        analysis_data.append({
            'Model': model_name,
            'Accuracy': result['accuracy'],
            'F1 Score': result['f1_score'],
            'Full Name': name
        })

    analysis_df = pd.DataFrame(analysis_data, columns=columns)
    analysis_df = analysis_df.sort_values('Accuracy', ascending=False)

    print("üìä DETAILED RESULTS:")
    print(analysis_df[['Model', 'Accuracy', 'F1 Score']
                      ].to_string(index=False))

    # Best model = first row after sorting by accuracy; 'Full Name' maps the
    # display name back to the key used in count_results.
    best_model_row = analysis_df.iloc[0]
    best_model_name = best_model_row['Full Name']
    best_model = count_results[best_model_name]['model']

    print(f"\nüèÜ BEST MODEL:")
    print(f"   Model: {best_model_row['Model']}")
    print(f"   Accuracy: {best_model_row['Accuracy']:.4f}")
    print(f"   F1 Score: {best_model_row['F1 Score']:.4f}")

    # Performance statistics
    print(f"\nüìà PERFORMANCE STATISTICS:")
    print(f"   Average Accuracy: {analysis_df['Accuracy'].mean():.4f}")
    print(f"   Best Accuracy: {analysis_df['Accuracy'].max():.4f}")
    print(f"   Worst Accuracy: {analysis_df['Accuracy'].min():.4f}")
    print(f"   Accuracy Std: {analysis_df['Accuracy'].std():.4f}")
    print(f"   Average F1 Score: {analysis_df['F1 Score'].mean():.4f}")
    print(f"   Best F1 Score: {analysis_df['F1 Score'].max():.4f}")

    # Visualize performance in a 2x2 grid
    plt.figure(figsize=(12, 8))

    # Plot 1: Accuracy ranking (y axis inverted so the best model is on top)
    plt.subplot(2, 2, 1)
    colors = plt.cm.Reds(np.linspace(0.4, 0.8, len(analysis_df)))
    plt.barh(range(len(analysis_df)), analysis_df['Accuracy'], color=colors)
    plt.xlabel('Accuracy')
    plt.ylabel('Models')
    plt.title('Model Ranking by Accuracy')
    plt.yticks(range(len(analysis_df)), analysis_df['Model'])
    plt.gca().invert_yaxis()

    # Plot 2: F1 Score ranking (rows stay in accuracy order)
    plt.subplot(2, 2, 2)
    colors = plt.cm.Greens(np.linspace(0.4, 0.8, len(analysis_df)))
    plt.barh(range(len(analysis_df)), analysis_df['F1 Score'], color=colors)
    plt.xlabel('F1 Score')
    plt.ylabel('Models')
    plt.title('Model Ranking by F1 Score')
    plt.yticks(range(len(analysis_df)), analysis_df['Model'])
    plt.gca().invert_yaxis()

    # Plot 3: Accuracy vs F1 scatter, one labelled point per model
    plt.subplot(2, 2, 3)
    plt.scatter(analysis_df['Accuracy'], analysis_df['F1 Score'],
                c=range(len(analysis_df)), cmap='viridis', s=100)
    plt.xlabel('Accuracy')
    plt.ylabel('F1 Score')
    plt.title('Accuracy vs F1 Score')
    for i, model in enumerate(analysis_df['Model']):
        plt.annotate(model, (analysis_df.iloc[i]['Accuracy'], analysis_df.iloc[i]['F1 Score']),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)

    # Plot 4: Performance distribution
    plt.subplot(2, 2, 4)
    plt.hist([analysis_df['Accuracy'], analysis_df['F1 Score']],
             bins=10, alpha=0.7, label=['Accuracy', 'F1 Score'])
    plt.xlabel('Score')
    plt.ylabel('Frequency')
    plt.title('Performance Distribution')
    plt.legend()

    plt.tight_layout()
    plt.show()

    return analysis_df, best_model_name, best_model

# Note: Use this function after training CountVectorizer models
# analysis_df, best_model_name, best_model = analyze_count_vectorizer_results(results)

In [None]:
# 🚀 COMPLETE COUNTVECTORIZER WORKFLOW

def run_complete_count_vectorization_workflow():
    """Run vectorization, model training, analysis and prediction tests in order.

    Steps whose outputs already exist as notebook-level variables are skipped:
    vectorization when ``count_vectorizer``, ``X_train_count`` and
    ``X_test_count`` are all present, training when ``results`` is present.
    The raw text split (``X_train`` / ``X_test``) is only required when
    vectorization actually has to run, because the vectorization cell deletes
    those variables after use. The labels ``y_train`` / ``y_test`` are always
    required.

    Returns:
        bool: ``True`` when the workflow ran to the end, ``False`` when the
        required input data is missing.
    """
    # Declared once at the top so every assignment below targets the notebook globals
    global count_vectorizer, X_train_count, X_test_count, results

    print("üéØ STARTING COUNTVECTORIZER WORKFLOW")
    print("=" * 50)

    available = globals()
    have_labels = 'y_train' in available and 'y_test' in available
    have_text = 'X_train' in available and 'X_test' in available
    have_count_data = all(
        name in available
        for name in ('count_vectorizer', 'X_train_count', 'X_test_count'))

    # Labels are needed by every step; raw text only if features must be built
    if not have_labels or not (have_count_data or have_text):
        print("‚ùå Train-test split data not found. Please run the data loading cells first.")
        return False

    # Step 1: Process with CountVectorizer
    print("\nüìä STEP 1: CountVectorizer Processing")
    if not have_count_data:
        print("Processing with CountVectorizer...")
        count_vectorizer, X_train_count, X_test_count = vectorize_data_count_in_batches(
            X_train, X_test, y_train, y_test)
        save_count_processed_data(
            count_vectorizer, X_train_count, X_test_count, y_train, y_test)
    else:
        print("‚úÖ CountVectorizer data already available")

    # Step 2: Train models on CountVectorizer data
    print("\nü§ñ STEP 2: Training CountVectorizer Models")
    if 'results' not in available:
        results = train_models_on_count_data(
            X_train_count, X_test_count, y_train, y_test)
    else:
        print("‚úÖ CountVectorizer models already trained")

    # Step 3: Analyze results
    print("\nüìà STEP 3: Analyzing Results")
    analysis_df, best_model_name, best_model = analyze_count_vectorizer_results(
        results)

    # Step 4: Test predictions
    # NOTE(review): test_prediction_functions is not defined in the part of the
    # notebook reviewed here - confirm it exists before running this step.
    print("\nüß™ STEP 4: Testing Predictions")
    test_prediction_functions()

    print("\nüéâ COUNTVECTORIZER WORKFLOW COMPLETED!")
    print("=" * 50)

    return True


# Instructions for usage.
# The text is an f-string so that {BATCH_SIZE} and {ARTIFACT_DIR} are replaced
# by the configured values instead of being printed literally.
print(f"""
üìã COUNTVECTORIZER WORKFLOW INSTRUCTIONS:

1. FIRST RUN (Complete Processing):
   - Run cells 1-6 to load data and create train-test split
   - Run: run_complete_count_vectorization_workflow()
   - This will process data with CountVectorizer and train all models

2. SUBSEQUENT RUNS (Load Saved Data):
   - Uncomment and run the data loading section
   - Load Count data: load_count_processed_data()
   - Analyze results: analyze_count_vectorizer_results(results)

3. INDIVIDUAL STEPS:
   - Data processing: vectorize_data_count_in_batches()
   - Model training: train_models_on_count_data()
   - Results analysis: analyze_count_vectorizer_results()
   - Prediction testing: test_prediction_functions()

üí° MEMORY MANAGEMENT TIPS:
- CountVectorizer processes data in {BATCH_SIZE}-row batches
- Intermediate files are cleaned up automatically
- Use gc.collect() between steps if memory is limited
- All processed data is saved to {ARTIFACT_DIR} for future use

üéØ COUNTVECTORIZER ADVANTAGES:
- Simple and fast text vectorization
- Preserves exact word frequencies
- Less memory intensive during processing
- Good baseline approach for text classification
- Efficient for large datasets with batch processing
""")

# Uncomment the line below to run the complete workflow:
# run_complete_count_vectorization_workflow()

In [None]:
# 📁 COUNTVECTORIZER DATA ORGANIZATION

def show_count_directory_structure():
    """Print the files currently in ARTIFACT_DIR with their sizes, then the expected layout."""

    print("üìÅ COUNTVECTORIZER DATA ORGANIZATION")
    print("=" * 40)

    count_dir = ARTIFACT_DIR
    print(f"üî∏ CountVectorizer Directory: {count_dir}")

    if not os.path.exists(count_dir):
        print("   ‚ùå Directory does not exist")
    else:
        # Regular files only; sub-directories are ignored
        file_names = [
            entry for entry in os.listdir(count_dir)
            if os.path.isfile(os.path.join(count_dir, entry))
        ]
        print(f"   ‚úì Contains {len(file_names)} files")
        if file_names:
            total_size = 0
            for file_name in sorted(file_names):
                size_mb = os.path.getsize(
                    os.path.join(count_dir, file_name)) / (1024 * 1024)
                total_size += size_mb
                print(f"     ‚Ä¢ {file_name} ({size_mb:.2f} MB)")
            print(f"   üìä Total size: {total_size:.2f} MB")

    # Reference layout: the five data artifacts plus one file per trained model
    print(f"\nüìä EXPECTED DIRECTORY STRUCTURE:")
    print(f"""
{ARTIFACT_DIR}
‚îú‚îÄ‚îÄ count_vectorizer.joblib         # CountVectorizer
‚îú‚îÄ‚îÄ X_train_count.npz               # Training features
‚îú‚îÄ‚îÄ X_test_count.npz                # Test features
‚îú‚îÄ‚îÄ y_train_count.npy               # Training labels
‚îú‚îÄ‚îÄ y_test_count.npy                # Test labels
‚îú‚îÄ‚îÄ random_forest_count.joblib      # Trained Random Forest model
‚îú‚îÄ‚îÄ xgboost_count.joblib            # Trained XGBoost model
‚îú‚îÄ‚îÄ lightgbm_count.joblib           # Trained LightGBM model
‚îú‚îÄ‚îÄ extra_trees_count.joblib        # Trained Extra Trees model
‚îú‚îÄ‚îÄ naive_bayes_count.joblib        # Trained Naive Bayes model
‚îî‚îÄ‚îÄ logistic_regression_count.joblib # Trained Logistic Regression model
""")

def verify_count_data_integrity():
    """Check that the five data artifacts exist in ARTIFACT_DIR and count the saved models.

    Returns:
        bool: ``True`` when every required data file is present, ``False``
        otherwise. The number of saved model files is only printed and does
        not affect the result.
    """

    print("? VERIFYING COUNTVECTORIZER DATA INTEGRITY")
    print("=" * 45)

    required_files = (
        "count_vectorizer.joblib",
        "X_train_count.npz",
        "X_test_count.npz",
        "y_train_count.npy",
        "y_test_count.npy",
    )

    missing_files = []
    found = 0
    for filename in required_files:
        filepath = os.path.join(ARTIFACT_DIR, filename)
        if not os.path.exists(filepath):
            missing_files.append(filename)
            print(f"‚ùå {filename} - MISSING")
            continue
        found += 1
        size_mb = os.path.getsize(filepath) / (1024 * 1024)
        print(f"‚úì {filename} ({size_mb:.2f} MB)")

    print(f"\nüìä SUMMARY:")
    print(f"   Found: {found}/{len(required_files)} required files")

    if missing_files:
        print(f"   ‚ùå Missing files: {missing_files}")
        print("   Please run the CountVectorizer processing pipeline first.")
        return False

    print("   ‚úÖ All required files present")

    # Model files are optional: report how many of the expected ones exist
    model_patterns = (
        "random_forest_count.joblib",
        "xgboost_count.joblib",
        "lightgbm_count.joblib",
        "extra_trees_count.joblib",
        "naive_bayes_count.joblib",
        "logistic_regression_count.joblib",
    )
    model_count = sum(
        os.path.exists(os.path.join(ARTIFACT_DIR, model_file))
        for model_file in model_patterns
    )
    print(f"   ü§ñ Trained models: {model_count}/{len(model_patterns)}")

    return True

def cleanup_count_directory():
    """Delete leftover batch, ``.tmp`` and ``.log`` files from ARTIFACT_DIR.

    Each removal is attempted independently; a failure is printed and the
    remaining files are still processed. Nothing is returned.
    """

    print("üßπ CLEANING UP COUNTVECTORIZER DIRECTORY")
    print("=" * 40)

    if not os.path.exists(ARTIFACT_DIR):
        print("‚ùå CountVectorizer directory does not exist")
        return

    import glob

    # Glob patterns for files treated as temporary
    temp_patterns = (
        "*_batch_*.npz",  # Temporary batch files
        "*.tmp",          # Temporary files
        "*.log",          # Log files
    )

    removed = 0
    for pattern in temp_patterns:
        for stale_path in glob.glob(os.path.join(ARTIFACT_DIR, pattern)):
            try:
                os.remove(stale_path)
                print(f"üóëÔ∏è Removed: {os.path.basename(stale_path)}")
                removed += 1
            except Exception as e:
                print(f"‚ùå Failed to remove {os.path.basename(stale_path)}: {e}")

    if removed:
        print(f"‚úÖ Cleaned up {removed} temporary files")
    else:
        print("‚úÖ No temporary files to clean up")

# Show what is currently stored in the artifact directory
show_count_directory_structure()

# Short usage tips, printed one per line below a separator
print("\n" + "=" * 50)
print("üí° COUNTVECTORIZER ORGANIZATION TIPS:")
for tip in (
    "1. Run verify_count_data_integrity() to check data completeness",
    "2. Run cleanup_count_directory() to remove temporary files",
    "3. All CountVectorizer data is stored in a single directory",
    "4. Models are saved with descriptive names for easy identification",
):
    print(tip)

In [None]:
# Train all models using CountVectorizer data.
# `results` maps each model name to its accuracy, F1 score, predictions and fitted estimator.
results = train_models_on_count_data(
    X_train_count, X_test_count, y_train, y_test)

In [None]:
# Compare CountVectorizer model performances
print("CountVectorizer Model Performance Comparison:")
print("=" * 60)

# One row per trained model; the " (Count)" suffix is dropped for display
comparison_data = []
for name, result in results.items():
    row = {'Model': name.replace(' (Count)', '')}
    row['Accuracy'] = result['accuracy']
    row['F1 Score'] = result['f1_score']
    comparison_data.append(row)

comparison_df = pd.DataFrame(comparison_data).sort_values(
    'Accuracy', ascending=False)
print(comparison_df.to_string(index=False))

# After sorting, the index label of the first row is still that model's
# position in `results`, which recovers the original key (with " (Count)").
top_row = comparison_df.iloc[0]
best_model_name = list(results.keys())[comparison_df.index[0]]
best_model = results[best_model_name]['model']
print(f"\nBest performing model: {top_row['Model']}")
print(f"Best accuracy: {top_row['Accuracy']:.4f}")
print(f"Best F1 score: {top_row['F1 Score']:.4f}")

# Side-by-side bar charts: accuracy on the left, F1 score on the right
plt.figure(figsize=(12, 5))

bar_positions = range(len(comparison_df))
for panel, (metric, bar_color) in enumerate(
        [('Accuracy', 'lightcoral'), ('F1 Score', 'lightgreen')], start=1):
    plt.subplot(1, 2, panel)
    plt.bar(bar_positions, comparison_df[metric], color=bar_color)
    plt.xlabel('Models')
    plt.ylabel(metric)
    plt.title(f'CountVectorizer Model {metric} Comparison')
    plt.xticks(bar_positions, comparison_df['Model'], rotation=45, ha='right')
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Confusion matrix and classification report for the best CountVectorizer model
best_predictions = results[best_model_name]['predictions']
best_display_name = comparison_df.iloc[0]["Model"]
class_names = ['Legitimate', 'Phishing']

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Reds',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix - {best_display_name} (CountVectorizer)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Per-class precision / recall / F1 for the same predictions
print(f"\nDetailed Classification Report for {best_display_name}:")
print("=" * 60)
print(classification_report(
    y_test, best_predictions, target_names=class_names))

In [None]:
# CountVectorizer prediction functions
def predict_webpage_status(webpage_code, model=None, vectorizer=None):
    """
    Predict if a webpage is phishing (1) or legitimate (0) based on its HTML code using CountVectorizer.

    Parameters:
    webpage_code (str): The HTML code of the webpage
    model: The trained model to use for prediction; must provide predict()
        and predict_proba() (default: the notebook global `best_model`)
    vectorizer: The fitted CountVectorizer to use; must provide transform()
        (default: the notebook global `count_vectorizer`)

    Returns:
    dict or None: On success a dict with keys 'prediction', 'status',
        'confidence', 'probability_legitimate', 'probability_phishing' and
        'vectorizer_used'. None (after printing a message) when no
        model/vectorizer is available or when prediction raises.
    """
    # Fall back to the notebook-level objects when the caller passes nothing
    if model is None:
        if 'best_model' in globals():
            model = best_model
        else:
            print("❌ No model available. Please train models first.")
            return None

    if vectorizer is None:
        if 'count_vectorizer' in globals():
            vectorizer = count_vectorizer
        else:
            print("❌ No CountVectorizer available. Please load processed data first.")
            return None

    try:
        # Vectorize the single document (transform expects an iterable of docs)
        webpage_vectorized = vectorizer.transform([webpage_code])

        # Hard label and per-class probabilities for that one row
        prediction = model.predict(webpage_vectorized)[0]
        probabilities = model.predict_proba(webpage_vectorized)[0]

        # Column 0 is read as "legitimate" and column 1 as "phishing",
        # matching the 0/1 label convention stated above
        return {
            'prediction': prediction,
            'status': 'Phishing' if prediction == 1 else 'Legitimate',
            'confidence': max(probabilities),
            'probability_legitimate': probabilities[0],
            'probability_phishing': probabilities[1],
            'vectorizer_used': 'CountVectorizer'
        }

    except Exception as e:
        # Best-effort helper for interactive use: report the failure and
        # signal it with None instead of aborting the notebook cell
        print(f"❌ Error making prediction: {e}")
        return None


def load_model_for_prediction(model_name):
    """
    Load a specific trained CountVectorizer model for prediction.

    Parameters:
    model_name (str): Display name of the model, e.g. "Random Forest (Count)"

    Returns:
    The object stored in the model's .joblib file under ARTIFACT_DIR, or
    None (after printing a message) when that file does not exist.
    """
    # Display name -> file stem: spaces become underscores, parentheses are
    # dropped, everything is lower-cased
    file_stem = model_name.replace(' ', '_').replace('(', '').replace(')', '').lower()
    model_path = os.path.join(ARTIFACT_DIR, f"{file_stem}.joblib")

    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        return None

    # NOTE: joblib.load unpickles arbitrary objects - only load artifacts
    # written by this notebook, never files from an untrusted source
    loaded_model = joblib.load(model_path)
    print(f"✅ Loaded model: {model_name}")
    return loaded_model


def test_prediction_functions():
    """
    Test the prediction function with sample data using CountVectorizer.

    Runs predict_webpage_status() on a handful of hand-written HTML snippets
    with the notebook globals `best_model` and `count_vectorizer`, printing
    the predicted status and class probabilities for each.

    Returns:
    None. Prints a message and returns early when the required notebook
    globals have not been created yet.
    """
    # Every name listed here is read below; checking them all up front gives
    # a readable message instead of a NameError halfway through
    required_globals = ('results', 'comparison_df',
                        'best_model', 'count_vectorizer')
    missing = [name for name in required_globals if name not in globals()]

    if 'results' in missing or not results:
        print("❌ No trained models available. Please train models first.")
        return
    if missing:
        print(f"❌ Missing required objects: {', '.join(missing)}. "
              "Please run the training and comparison cells first.")
        return

    top_row = comparison_df.iloc[0]
    print(f"Using best model: {top_row['Model']}")
    print(f"Best accuracy: {top_row['Accuracy']:.4f}")

    # Test with sample HTML codes
    test_samples = [
        "<html><head><title>Google</title></head><body>Welcome to Google</body></html>",
        "<html><head><title>Secure Bank Login</title></head><body><form>Enter password</form></body></html>",
        "<html><script>window.location='phishing-site.com'</script></html>",
        "<html><body>Click here to verify your account: <a href='fake-bank.com'>Verify</a></body></html>",
        "<html><body>Urgent! Your account will be suspended. Click <a href='malicious-bank-site.com'>here</a></body></html>"
    ]

    print("\n🧪 TESTING COUNTVECTORIZER PREDICTION FUNCTION")
    print("=" * 70)

    for i, sample in enumerate(test_samples, 1):
        result = predict_webpage_status(sample, best_model, count_vectorizer)
        # predict_webpage_status signals failure with None
        if result is None:
            print(f"\n📄 Sample {i}: ❌ Prediction failed")
            continue

        print(f"\n📄 Sample {i}: {sample[:60]}...")
        print(f"   Prediction: {result['status']}")
        print(f"   Confidence: {result['confidence']:.3f}")
        print(f"   Prob Legitimate: {result['probability_legitimate']:.3f}")
        print(f"   Prob Phishing: {result['probability_phishing']:.3f}")

# Note: Run test_prediction_functions() after training models

In [None]:
# 🎯 CountVectorizer Batch Processing Workflow Summary
# Static, hand-written overview of the notebook; nothing below is derived from
# the current run, so keep it in sync when steps, artifacts or models change.
print("""
📊 COUNTVECTORIZER BATCH PROCESSING WORKFLOW FOR PHISHING DETECTION
================================================================

This notebook implements an efficient batch processing approach for handling large datasets using CountVectorizer:

🔄 WORKFLOW STEPS:
1. ✅ Load dataset in chunks (2000 rows at a time)
2. ✅ Analyze dataset structure and statistics 
3. ✅ Create stratified train-test split
4. ✅ Vectorize data in batches using CountVectorizer
5. ✅ Save vectorized data and vectorizer to disk
6. ✅ Train multiple models on processed data
7. ✅ Save trained models for future use
8. ✅ Evaluate and compare model performance

💾 SAVED ARTIFACTS (in /kaggle/working/count/):
- count_vectorizer.joblib          (CountVectorizer)
- X_train_count.npz               (Training features)
- X_test_count.npz                (Test features) 
- y_train_count.npy               (Training labels)
- y_test_count.npy                (Test labels)
- [model_name].joblib             (Trained models)

🚀 MEMORY MANAGEMENT:
- Processes data in 2000-row batches
- Saves intermediate results to disk
- Cleans up memory after each batch
- Monitors memory usage throughout

📈 MODELS TRAINED:
- Random Forest
- XGBoost  
- LightGBM
- Extra Trees
- Naive Bayes
- Logistic Regression

🎯 USAGE:
1. Run all cells in sequence to process data and train models
2. Use load_count_processed_data() to reload saved data
3. Use predict_webpage_status() for new predictions
4. Use test_prediction_functions() to test the prediction pipeline

💡 COUNTVECTORIZER BENEFITS:
- Fast and simple text vectorization
- Preserves exact word counts
- Less computationally intensive than TF-IDF
- Good baseline for text classification
- Handles large datasets efficiently in batches
""")

# Report where the notebook is running and which paths it reads and writes
print("\n📍 CURRENT STATUS:")
print(f"Working directory: {os.getcwd()}")
print(f"Artifact directory: {ARTIFACT_DIR}")
print(f"Dataset path: {dataset_path}")

# The saved vectorizer file is used as the marker that processing has run
vectorizer_artifact = os.path.join(ARTIFACT_DIR, "count_vectorizer.joblib")
if os.path.exists(vectorizer_artifact):
    print("✅ CountVectorizer processed data available")
else:
    print("⏳ CountVectorizer processed data not yet created - run processing cells first")

# Memory status
print_memory_usage("(current)")

# Phishing URL Detection

In [None]:
# Pick the URL dataset location: the local development copy when it exists,
# otherwise the Kaggle input directory
local_url_csv = "/home/iqbal/Programming/ML/project/dataset/new_data_urls.csv"
kaggle_url_csv = "/kaggle/input/phising-website-url-dataset/new_data_urls.csv"
url_dataset_path = local_url_csv if os.path.exists(local_url_csv) else kaggle_url_csv

url_dataSet = pd.read_csv(url_dataset_path)
# Last expression of the cell, so the notebook renders the first rows
url_dataSet.head()

In [None]:
# Build the feature/target series for the URL dataset and split them
from sklearn.model_selection import train_test_split

# The raw URL string is the only feature; 'status' is the label column
url_X, url_y = url_dataSet['url'], url_dataSet['status']

print(f"URL dataset shape: {url_dataSet.shape}")
print(f"Class distribution:\n{url_y.value_counts()}")

# 80/20 split, stratified on the label so both parts keep the class ratio;
# the fixed seed makes the split reproducible
split_options = dict(test_size=0.2, random_state=42, stratify=url_y)
url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(
    url_X, url_y, **split_options)
print(f"Train size: {len(url_X_train)}, Test size: {len(url_X_test)}")

# One CountVectorizer -> classifier pipeline per candidate model.
# Every pipeline gets its own vectorizer instance with identical settings.
url_classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, verbosity=0),
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
}

pipelines = {
    f'CountVectorizer + {classifier_name}': Pipeline([
        ('vectorizer', CountVectorizer(max_features=5000, stop_words='english')),
        ('classifier', classifier),
    ])
    for classifier_name, classifier in url_classifiers.items()
}

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix


# Fit each pipeline on the training URLs and score it on the held-out split.
# A failure in one model is reported and does not stop the remaining ones.
url_results = {}
print("Training and evaluating URL models...")
for model_label in pipelines:
    url_pipeline = pipelines[model_label]
    print(f"\nTraining {model_label}...")
    try:
        url_pipeline.fit(url_X_train, url_y_train)
        test_predictions = url_pipeline.predict(url_X_test)
        scores = {
            'accuracy': accuracy_score(url_y_test, test_predictions),
            'f1_score': f1_score(url_y_test, test_predictions),
        }
        # Keep predictions and the fitted pipeline for the comparison cells
        url_results[model_label] = {
            **scores,
            'predictions': test_predictions,
            'model': url_pipeline,
        }
        print(f"Accuracy: {scores['accuracy']:.4f}, F1 Score: {scores['f1_score']:.4f}")
    except Exception as e:
        print(f"Error training {model_label}: {e}")

In [None]:
# Compare URL model performances
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Every model may have failed in the training cell; without this check the
# sort below fails with an opaque KeyError on the missing 'Accuracy' column
if not url_results:
    raise RuntimeError(
        "No URL models were trained successfully; nothing to compare.")

comparison_url = [
    {
        'Model': name,
        'Accuracy': result['accuracy'],
        'F1 Score': result['f1_score']
    }
    for name, result in url_results.items()
]

# Best model first
comparison_url_df = pd.DataFrame(
    comparison_url).sort_values('Accuracy', ascending=False)
print(comparison_url_df.to_string(index=False))

best_url_model_name = comparison_url_df.iloc[0]['Model']
best_url_model = url_results[best_url_model_name]['model']
print(f"\nBest URL model: {best_url_model_name}")
print(f"Accuracy: {comparison_url_df.iloc[0]['Accuracy']:.4f}")

# Visualize accuracy and F1 score as grouped bars. Drawing both series at the
# same x position makes one hide the other, so each model gets two adjacent
# bars centred on its tick.
bar_width = 0.4
x_positions = np.arange(len(comparison_url_df))
plt.figure(figsize=(10, 4))
plt.bar(x_positions - bar_width / 2, comparison_url_df['Accuracy'],
        width=bar_width, color='skyblue', label='Accuracy')
plt.bar(x_positions + bar_width / 2, comparison_url_df['F1 Score'],
        width=bar_width, color='lightcoral', label='F1 Score')
plt.xticks(x_positions, comparison_url_df['Model'], rotation=45, ha='right')
plt.ylabel('Score')
plt.title('Phishing URL Model Performance')
plt.legend()
plt.tight_layout()
plt.show()

# Confusion matrix for best model
# NOTE(review): the tick labels below name the first class 'Legitimate',
# whereas predict_url_status() treats label 1 as 'Legitimate' - confirm the
# meaning of the 'status' column and make the two agree.
y_pred_best = url_results[best_url_model_name]['predictions']
cm = confusion_matrix(url_y_test, y_pred_best)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=[
            'Legitimate', 'Phishing'], yticklabels=['Legitimate', 'Phishing'])
plt.title(f'Confusion Matrix - {best_url_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Create a function to predict URL status
def predict_url_status(url, model=None, legitimate_label=1):
    """
    Predict whether a URL is legitimate or phishing based on its text.

    Parameters:
    url (str): The URL to predict
    model: The trained model to use for prediction; must provide predict()
        and predict_proba() (default: the notebook global `best_url_model`)
    legitimate_label: Target value that denotes a legitimate URL. Defaults to
        1, the mapping the 'status' field of this function has always used.
        NOTE(review): confirm against the dataset's 'status' column and pass
        0 here if the encoding is the other way round.

    Returns:
    dict: 'prediction' (raw label), 'status' ('Legitimate' or 'Phishing'),
        'confidence' (largest class probability), 'probability_legitimate'
        and 'probability_phishing'.
    """
    if model is None:
        model = best_url_model

    sample = [url]

    # Hard label and per-class probabilities for the single URL
    prediction = model.predict(sample)[0]
    probabilities = model.predict_proba(sample)[0]

    # predict_proba columns follow model.classes_, so look the legitimate
    # column up by label instead of assuming a position; this keeps the
    # probabilities consistent with the 'status' field
    classes = list(getattr(model, 'classes_', (0, 1)))
    legitimate_idx = classes.index(legitimate_label)
    phishing_idx = 1 - legitimate_idx  # binary problem: the other column

    return {
        'prediction': prediction,
        'status': 'Legitimate' if prediction == legitimate_label else 'Phishing',
        'confidence': max(probabilities),
        'probability_legitimate': probabilities[legitimate_idx],
        'probability_phishing': probabilities[phishing_idx]
    }

In [None]:
# Quick manual check of predict_url_status on a few hand-picked hostnames
url_sample = [
    "google.com",
    "facebook.com",
    "phishing-test.com",
    "example.com",
    "malicious-site.com",
    'facebook-test.com',
]

print("\nTesting URL prediction function:")
for candidate_url in url_sample:
    outcome = predict_url_status(candidate_url)
    # One line per URL, fields separated by " | "
    report_fields = (
        f"URL: {candidate_url}",
        f"Prediction: {outcome['status']}",
        f"Confidence: {outcome['confidence']:.4f}",
        f"Prob Legitimate: {outcome['probability_legitimate']:.4f}",
        f"Prob Phishing: {outcome['probability_phishing']:.4f}",
    )
    print(" | ".join(report_fields))