# Predict from web_code

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

In [None]:
import psutil
import gc


def get_memory_usage():
    """Return the resident set size (RSS) of this process in megabytes."""
    rss_bytes = psutil.Process().memory_info().rss
    # Dividing by 1024 twice converts bytes -> KiB -> MiB.
    return rss_bytes / 1024 / 1024


def print_memory_usage(label=""):
    """Print the process's current memory usage, tagged with an optional label.

    Parameters:
    label (str): Free text inserted after "Memory usage" in the printed line.
    """
    usage_mb = get_memory_usage()
    message = f"Memory usage {label}: {usage_mb:.2f} MB"
    print(message)


def cleanup_memory():
    """Run a garbage-collection pass and report that it finished."""
    gc.collect()
    status_message = "Memory cleanup completed"
    print(status_message)


# Report the memory footprint at this point (before the dataset cells run) so
# later readings can be compared against it
print_memory_usage("(initial)")

In [None]:
import os
import joblib
import numpy as np
from scipy.sparse import save_npz, load_npz, vstack
import gc

# Choose dataset/artifact locations: use the local checkout when its CSV is
# present, otherwise fall back to the Kaggle input/working directories.
if os.path.exists("/home/iqbal/Programming/ML/project/dataset/phishing_complete_dataset.csv"):
    dataset_path = "/home/iqbal/Programming/ML/project/dataset/phishing_complete_dataset.csv"
    ARTIFACT_DIR = "/home/iqbal/Programming/ML/project/artifacts/"
else:
    dataset_path = "/kaggle/input/phishing-website-webcode-dataset/phishing_complete_dataset.csv"
    ARTIFACT_DIR = "/kaggle/working/"

# Create artifacts directory if it doesn't exist
os.makedirs(ARTIFACT_DIR, exist_ok=True)

# Count physical lines in the file (minus the header). The file is opened in a
# `with` block so the handle is closed deterministically instead of leaking.
# NOTE(review): this counts lines, not CSV records; if any quoted field contains
# embedded newlines the figure over-counts the number of rows.
with open(dataset_path) as dataset_file:
    total_rows = sum(1 for _ in dataset_file) - 1  # subtract 1 for header
print(f"Total rows in dataset: {total_rows}")

# Read a small sample first to understand the data structure
sample_df = pd.read_csv(dataset_path, nrows=5)
print(f"Dataset columns: {sample_df.columns.tolist()}")
print("Sample data:")
print(sample_df.head())

In [None]:
# Batch processing configuration
BATCH_SIZE = 2000  # number of documents per slice when vectorizing text
CHUNK_SIZE = 2000  # number of CSV rows per chunk passed to pd.read_csv


def process_dataset_in_batches(max_rows=10000, path=None, chunk_size=None):
    """Stream the head of the dataset in chunks and print summary statistics.

    Reads the CSV chunk by chunk, accumulating the distribution of the
    'result' column and the character lengths of the 'webpage_code' column,
    and stops once at least ``max_rows`` rows have been seen. Everything is
    reported via ``print``; nothing is returned.

    Parameters:
    max_rows (int): Stop after this many rows have been processed (the limit is
        checked after each whole chunk, so slightly more may be read).
    path (str | None): CSV file to analyse; defaults to the module-level
        ``dataset_path``.
    chunk_size (int | None): Rows per chunk; defaults to the module-level
        ``CHUNK_SIZE``.
    """
    csv_path = dataset_path if path is None else path
    rows_per_chunk = CHUNK_SIZE if chunk_size is None else chunk_size

    # Running statistics over the rows seen so far
    total_processed = 0
    class_counts = {}
    code_lengths = []

    print("Processing dataset in batches to analyze structure...")

    # Use the reader as a context manager: the loop normally exits through
    # `break`, which would otherwise leave the underlying file handle open.
    with pd.read_csv(csv_path, chunksize=rows_per_chunk) as chunk_iter:
        for i, chunk in enumerate(chunk_iter):
            print(
                f"Processing chunk {i+1}, rows {total_processed+1} to {total_processed+len(chunk)}")

            # Update class distribution
            for class_label, count in chunk['result'].value_counts().items():
                class_counts[class_label] = class_counts.get(
                    class_label, 0) + int(count)

            # Collect code lengths; rows with missing HTML yield NaN lengths,
            # which are dropped so they do not turn every statistic into NaN.
            if len(code_lengths) < max_rows:
                chunk_lengths = chunk['webpage_code'].str.len().dropna()
                code_lengths.extend(chunk_lengths.astype(int).tolist())

            total_processed += len(chunk)

            # Clear chunk from memory
            del chunk
            gc.collect()

            if total_processed >= max_rows:  # Limit analysis for speed
                break

    print(f"\nDataset Analysis (first {total_processed} rows):")
    print(f"Total processed: {total_processed}")
    print(f"Class distribution: {class_counts}")

    if class_counts:
        total_samples = sum(class_counts.values())
        for class_label, count in class_counts.items():
            percentage = (count / total_samples) * 100
            print(f"Class {class_label}: {count} ({percentage:.2f}%)")

    if code_lengths:
        code_lengths_array = np.array(code_lengths)
        print("\nWebpage code length statistics:")
        print(f"Mean: {code_lengths_array.mean():.2f}")
        print(f"Std: {code_lengths_array.std():.2f}")
        print(f"Min: {code_lengths_array.min()}")
        print(f"Max: {code_lengths_array.max()}")
        print(f"Median: {np.median(code_lengths_array):.2f}")

# Run the analysis (prints class distribution and code-length statistics)
process_dataset_in_batches()

In [None]:
def create_train_test_split_batched():
    """Read the dataset chunk by chunk and return a stratified 80/20 split.

    The CSV is streamed in CHUNK_SIZE-row chunks, but the 'webpage_code' and
    'result' columns of every chunk are accumulated in memory, because the
    stratified split needs the complete label column.

    Returns:
    tuple: (X_train, X_test, y_train, y_test) as pandas Series.
    """
    print("Creating train-test split with batched processing...")

    # Accumulate the text and label columns across all chunks
    texts, labels = [], []
    reader = pd.read_csv(dataset_path, chunksize=CHUNK_SIZE)
    for chunk_number, chunk in enumerate(reader, start=1):
        print(f"Reading chunk {chunk_number} for train-test split...")
        texts += chunk['webpage_code'].tolist()
        labels += chunk['result'].tolist()
        del chunk
        gc.collect()

    print(f"Total samples loaded: {len(texts)}")

    # train_test_split is given pandas Series so the outputs are Series too
    features = pd.Series(texts)
    targets = pd.Series(labels)

    # Stratify on the labels so both splits keep the class proportions
    split = train_test_split(
        features, targets, test_size=0.2, random_state=42, stratify=targets
    )
    X_train, X_test, y_train, y_test = split

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")
    print(f"Training target distribution:\n{y_train.value_counts()}")
    print(f"Test target distribution:\n{y_test.value_counts()}")

    # Drop the intermediate copies before returning
    del texts, labels, features, targets, split
    gc.collect()

    return X_train, X_test, y_train, y_test


# Create the train-test split (raw HTML text and labels as pandas Series)
X_train, X_test, y_train, y_test = create_train_test_split_batched()

In [None]:
def _spill_tfidf_batches(vectorizer, texts, display_name, file_prefix):
    """Transform texts in BATCH_SIZE slices, saving each slice to ARTIFACT_DIR.

    Returns the list of written .npz paths in batch order. No transformed
    batch is retained in memory after it has been written.
    """
    total_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE
    batch_paths = []
    for i, start in enumerate(range(0, len(texts), BATCH_SIZE)):
        print(f"Transforming {display_name} batch {i+1}/{total_batches}")
        batch_tfidf = vectorizer.transform(texts[start:start + BATCH_SIZE])

        # Save batch to disk so it does not have to stay in memory
        batch_filename = os.path.join(
            ARTIFACT_DIR, f"{file_prefix}_tfidf_batch_{i}.npz")
        save_npz(batch_filename, batch_tfidf)
        batch_paths.append(batch_filename)

        del batch_tfidf
        gc.collect()
    return batch_paths


def _stack_spilled_batches(batch_paths):
    """Load the spilled batches, delete their files, and stack them row-wise."""
    if not batch_paths:
        raise ValueError("Cannot vectorize an empty collection of documents")

    matrices = []
    for batch_filename in batch_paths:
        matrices.append(load_npz(batch_filename))
        # Clean up temporary file
        os.remove(batch_filename)

    if len(matrices) == 1:
        return matrices[0]
    # A single vstack avoids re-copying the accumulated matrix once per batch
    return vstack(matrices, format="csr")


def vectorize_data_in_batches(X_train, X_test, y_train, y_test,
                              fit_sample_size=10000):
    """Fit a TF-IDF vectorizer and transform both splits in batches.

    The vectorizer (5000 features, English stop words, uni- and bi-grams) is
    fitted on the first ``fit_sample_size`` training documents only. Both
    splits are then transformed BATCH_SIZE documents at a time; each batch is
    written to ARTIFACT_DIR as a temporary .npz file, and the files are later
    loaded, stacked into one sparse matrix per split, and removed.

    Parameters:
    X_train, X_test: Text collections exposing ``.tolist()`` (pandas Series).
    y_train, y_test: Accepted for call-site symmetry; not used here.
    fit_sample_size (int | None): Number of leading training documents used to
        fit the vocabulary/IDF weights; ``None`` fits on all of them.

    Returns:
    tuple: (fitted TfidfVectorizer, X_train_tfidf, X_test_tfidf).

    Raises:
    ValueError: If a split contains no documents.
    """
    print("Starting batch vectorization...")

    # Initialize TF-IDF vectorizer
    tfidf_vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        ngram_range=(1, 2)
    )

    # Convert to lists for batch processing
    X_train_list = X_train.tolist()
    X_test_list = X_test.tolist()

    print("Fitting TF-IDF vectorizer on training data...")
    n_train_batches = (len(X_train_list) + BATCH_SIZE - 1) // BATCH_SIZE
    print(f"Processing {n_train_batches} training batches...")

    # The vocabulary is established from a leading sample of the training
    # data rather than the full set, which bounds fitting time and memory.
    if fit_sample_size is None:
        sample_data = X_train_list
    else:
        sample_data = X_train_list[:fit_sample_size]
    tfidf_vectorizer.fit(sample_data)

    print("Vectorizer fitted. Now transforming training data in batches...")
    train_paths = _spill_tfidf_batches(
        tfidf_vectorizer, X_train_list, "training", "X_train")

    print("Training data vectorization completed. Now processing test data...")
    test_paths = _spill_tfidf_batches(
        tfidf_vectorizer, X_test_list, "test", "X_test")

    print("Combining all training batches...")
    X_train_tfidf = _stack_spilled_batches(train_paths)

    print("Combining all test batches...")
    X_test_tfidf = _stack_spilled_batches(test_paths)

    print("Vectorization completed!")
    print(f"Training data shape: {X_train_tfidf.shape}")
    print(f"Test data shape: {X_test_tfidf.shape}")

    # Clear original text data from memory
    del X_train_list, X_test_list, sample_data
    gc.collect()

    return tfidf_vectorizer, X_train_tfidf, X_test_tfidf


# Perform batch vectorization; the results become the module-level names that
# the saving and training cells read
tfidf_vectorizer, X_train_tfidf, X_test_tfidf = vectorize_data_in_batches(
    X_train, X_test, y_train, y_test)

# Drop the raw text Series; only the vectorized matrices are used from here on.
# y_train / y_test are kept because the saving and training cells read them.
del X_train, X_test
gc.collect()

In [None]:
def save_processed_data():
    """Persist the fitted vectorizer, both TF-IDF matrices and both label arrays.

    Reads the module-level names tfidf_vectorizer, X_train_tfidf,
    X_test_tfidf, y_train and y_test and writes them into ARTIFACT_DIR.

    Returns:
    dict: Maps 'vectorizer_path', 'train_tfidf_path', 'test_tfidf_path',
        'y_train_path' and 'y_test_path' to the files that were written.
    """

    def artifact(filename):
        # Absolute location of an artifact inside ARTIFACT_DIR
        return os.path.join(ARTIFACT_DIR, filename)

    def as_array(values):
        # pandas objects expose to_numpy(); anything else goes through asarray
        if hasattr(values, "to_numpy"):
            return values.to_numpy()
        return np.asarray(values)

    print("Saving processed data to disk...")

    # Fitted vectorizer
    vectorizer_path = artifact("tfidf_vectorizer.joblib")
    joblib.dump(tfidf_vectorizer, vectorizer_path)
    print(f"✓ Saved TF-IDF vectorizer to {vectorizer_path}")

    # Sparse training matrix
    train_tfidf_path = artifact("X_train_tfidf.npz")
    save_npz(train_tfidf_path, X_train_tfidf)
    print(f"✓ Saved training TF-IDF data to {train_tfidf_path}")

    # Sparse test matrix
    test_tfidf_path = artifact("X_test_tfidf.npz")
    save_npz(test_tfidf_path, X_test_tfidf)
    print(f"✓ Saved test TF-IDF data to {test_tfidf_path}")

    # Label arrays
    train_labels = as_array(y_train)
    test_labels = as_array(y_test)
    y_train_path = artifact("y_train.npy")
    y_test_path = artifact("y_test.npy")
    np.save(y_train_path, train_labels)
    np.save(y_test_path, test_labels)
    print(f"✓ Saved training targets to {y_train_path}")
    print(f"✓ Saved test targets to {y_test_path}")

    saved = {
        'vectorizer_path': vectorizer_path,
        'train_tfidf_path': train_tfidf_path,
        'test_tfidf_path': test_tfidf_path,
        'y_train_path': y_train_path,
        'y_test_path': y_test_path
    }

    # Report the on-disk size of each artifact as a quick sanity check
    print("\nFile sizes:")
    for filepath in saved.values():
        if os.path.exists(filepath):
            size_mb = os.path.getsize(filepath) / (1024 * 1024)
            print(f"  {os.path.basename(filepath)}: {size_mb:.2f} MB")

    print(f"\n✅ All data saved successfully to '{ARTIFACT_DIR}'")

    return saved


# Save all processed data; saved_paths maps artifact keys to the written files
saved_paths = save_processed_data()

In [None]:
def load_processed_data():
    """Load the saved vectorizer, TF-IDF matrices and label arrays.

    All five artifacts must be present in ARTIFACT_DIR; if any is missing the
    missing names are printed and nothing is loaded.

    Returns:
    dict | None: Keys 'vectorizer', 'X_train_tfidf', 'X_test_tfidf',
        'y_train' and 'y_test', or None when a required file is absent.
    """
    print("Loading processed data from disk...")

    def artifact(filename):
        # Absolute location of an artifact inside ARTIFACT_DIR
        return os.path.join(ARTIFACT_DIR, filename)

    required_files = [
        "tfidf_vectorizer.joblib",
        "X_train_tfidf.npz",
        "X_test_tfidf.npz",
        "y_train.npy",
        "y_test.npy"
    ]

    # Refuse to load a partial artifact set
    missing_files = [
        filename for filename in required_files
        if not os.path.exists(artifact(filename))
    ]
    if missing_files:
        print(f"❌ Missing files: {missing_files}")
        print("Please run the data processing cells first.")
        return None

    # Vectorizer. NOTE: joblib.load unpickles the file, so only load artifacts
    # from a trusted location.
    vectorizer_path = artifact("tfidf_vectorizer.joblib")
    loaded_vectorizer = joblib.load(vectorizer_path)
    print(f"✓ Loaded TF-IDF vectorizer from {vectorizer_path}")

    # Sparse feature matrices
    loaded_X_train_tfidf = load_npz(artifact("X_train_tfidf.npz"))
    loaded_X_test_tfidf = load_npz(artifact("X_test_tfidf.npz"))
    print(f"✓ Loaded training TF-IDF data: {loaded_X_train_tfidf.shape}")
    print(f"✓ Loaded test TF-IDF data: {loaded_X_test_tfidf.shape}")

    # Label arrays
    loaded_y_train = np.load(artifact("y_train.npy"))
    loaded_y_test = np.load(artifact("y_test.npy"))
    print(f"✓ Loaded training targets: {loaded_y_train.shape}")
    print(f"✓ Loaded test targets: {loaded_y_test.shape}")

    print("\n✅ All data loaded successfully!")

    return {
        'vectorizer': loaded_vectorizer,
        'X_train_tfidf': loaded_X_train_tfidf,
        'X_test_tfidf': loaded_X_test_tfidf,
        'y_train': loaded_y_train,
        'y_test': loaded_y_test
    }

# Uncomment the lines below to load previously saved data instead of reprocessing it
# loaded_data = load_processed_data()
# if loaded_data:
#     tfidf_vectorizer = loaded_data['vectorizer']
#     X_train_tfidf = loaded_data['X_train_tfidf']
#     X_test_tfidf = loaded_data['X_test_tfidf']
#     y_train = loaded_data['y_train']
#     y_test = loaded_data['y_test']

In [None]:
def train_models_on_processed_data():
    """Fit each TF-IDF classifier, score it on the test split, and save it.

    Reads the module-level X_train_tfidf, X_test_tfidf, y_train and y_test.
    A failure while training, scoring or saving one model is printed and the
    loop moves on to the next model.

    Returns:
    dict: Model name -> {'accuracy', 'f1_score', 'predictions', 'model'} for
        every model that trained and scored successfully.
    """
    # Candidate classifiers, all trained on the same TF-IDF features
    classifiers_tfidf = {
        'Random Forest (TF-IDF)': RandomForestClassifier(n_estimators=100, random_state=42),
        'XGBoost (TF-IDF)': xgb.XGBClassifier(random_state=42, verbosity=0),
        'LightGBM (TF-IDF)': lgb.LGBMClassifier(random_state=42, verbose=-1),
        'Extra Trees (TF-IDF)': ExtraTreesClassifier(n_estimators=100, random_state=42),
        'Naive Bayes (TF-IDF)': MultinomialNB(),
        'Logistic Regression (TF-IDF)': LogisticRegression(random_state=42, max_iter=1000)
    }

    banner = "=" * 60
    print("Training and evaluating models on processed data...")
    print(banner)
    print(f"Training data shape: {X_train_tfidf.shape}")
    print(f"Test data shape: {X_test_tfidf.shape}")
    print(banner)

    # Display name -> file stem: spaces become underscores, parentheses vanish
    filename_table = str.maketrans(' ', '_', '()')

    results = {}
    for name, clf in classifiers_tfidf.items():
        print(f"\n🚀 Training {name}...")
        try:
            clf.fit(X_train_tfidf, y_train)
            y_pred = clf.predict(X_test_tfidf)

            accuracy = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred)
            results[name] = dict(
                accuracy=accuracy, f1_score=f1, predictions=y_pred, model=clf)

            print(f"   ✅ Accuracy: {accuracy:.4f}")
            print(f"   ✅ F1 Score: {f1:.4f}")

            # Persist the fitted model next to the other artifacts
            model_filename = f"{name.translate(filename_table).lower()}.joblib"
            joblib.dump(clf, os.path.join(ARTIFACT_DIR, model_filename))
            print(f"   💾 Saved model to {model_filename}")

        except Exception as e:
            # Best-effort: one failing model must not abort the others
            print(f"   ❌ Error training {name}: {e}")

    print("\n" + banner)
    print("🎉 Training completed!")

    return results


# Train all models; `results` maps each successfully trained model's name to
# its metrics, test-set predictions and fitted estimator
results = train_models_on_processed_data()

In [None]:
# Compare model performances
print("Model Performance Comparison:")
print("=" * 60)

# One row per trained model, best accuracy first
comparison_data = [
    {'Model': name, 'Accuracy': result['accuracy'], 'F1 Score': result['f1_score']}
    for name, result in results.items()
]
comparison_df = pd.DataFrame(comparison_data).sort_values(
    'Accuracy', ascending=False)
print(comparison_df.to_string(index=False))

# The best model is the one with the highest test accuracy
top_row = comparison_df.iloc[0]
best_model_name = top_row['Model']
best_model = results[best_model_name]['model']
print(f"\nBest performing model: {best_model_name}")
print(f"Best accuracy: {top_row['Accuracy']:.4f}")

# Visualize results: one bar chart per metric, side by side
plt.figure(figsize=(12, 5))
bar_positions = range(len(comparison_df))
metric_panels = [('Accuracy', 'skyblue'), ('F1 Score', 'lightcoral')]

for panel_number, (metric, bar_color) in enumerate(metric_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.bar(bar_positions, comparison_df[metric], color=bar_color)
    plt.xlabel('Models')
    plt.ylabel(metric)
    plt.title(f'Model {metric} Comparison')
    plt.xticks(bar_positions, comparison_df['Model'], rotation=45, ha='right')
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Create confusion matrix for the best model
best_predictions = results[best_model_name]['predictions']

# NOTE(review): the tick labels assume the sorted label order puts the
# legitimate class first and the phishing class second - confirm the encoding.
class_names = ['Legitimate', 'Phishing']

plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, best_predictions)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title(f'Confusion Matrix - {best_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Function to predict if a webpage is phishing or legitimate
def predict_webpage_status(webpage_code, model=None, vectorizer=None):
    """
    Classify one webpage's HTML as phishing (1) or legitimate (0).

    Parameters:
    webpage_code (str): The HTML code of the webpage
    model: Fitted classifier exposing predict and predict_proba; when omitted,
        the module-level best_model is used if it exists
    vectorizer: Fitted vectorizer exposing transform; when omitted, the
        module-level tfidf_vectorizer is used if it exists

    Returns:
    dict | None: Keys 'prediction', 'status', 'confidence',
        'probability_legitimate' and 'probability_phishing'. None is returned
        (after printing a message) when no model/vectorizer is available or
        when prediction raises. The two probabilities are taken from
        positions 0 and 1 of predict_proba's output.
    """
    # Resolve defaults from module globals, bailing out early if absent
    if model is None:
        if 'best_model' not in globals():
            print("❌ No model available. Please train models first.")
            return None
        model = best_model

    if vectorizer is None:
        if 'tfidf_vectorizer' not in globals():
            print("❌ No vectorizer available. Please load processed data first.")
            return None
        vectorizer = tfidf_vectorizer

    try:
        features = vectorizer.transform([webpage_code])
        predicted_label = model.predict(features)[0]
        class_probabilities = model.predict_proba(features)[0]

        if predicted_label == 1:
            status = 'Phishing'
        else:
            status = 'Legitimate'

        return {
            'prediction': predicted_label,
            'status': status,
            'confidence': max(class_probabilities),
            'probability_legitimate': class_probabilities[0],
            'probability_phishing': class_probabilities[1]
        }

    except Exception as e:
        print(f"❌ Error making prediction: {e}")
        return None


def load_model_for_prediction(model_name):
    """Load a saved model from ARTIFACT_DIR by its display name.

    The file name is derived from model_name by turning spaces into
    underscores, dropping parentheses, lower-casing and appending '.joblib'.

    Returns:
    The loaded model, or None (after printing a message) if the file is absent.
    """
    file_stem = model_name.translate(str.maketrans(' ', '_', '()')).lower()
    model_path = os.path.join(ARTIFACT_DIR, f"{file_stem}.joblib")

    if not os.path.exists(model_path):
        print(f"❌ Model file not found: {model_path}")
        return None

    # NOTE: joblib.load unpickles the file; only load trusted artifacts.
    loaded_model = joblib.load(model_path)
    print(f"✅ Loaded model: {model_name}")
    return loaded_model

# Test the prediction function once models are trained


def test_prediction_function():
    """Run predict_webpage_status on a few hand-written HTML snippets.

    Picks the most accurate entry of the module-level `results` and prints the
    predicted status and probabilities for each snippet. Prints a message and
    returns early when no trained models are available.
    """
    trained = globals().get('results')
    if not trained:
        print("❌ No trained models available. Please train models first.")
        return

    # Rank the trained models by accuracy and take the top one
    ranking = pd.DataFrame([
        {'Model': name, 'Accuracy': entry['accuracy'], 'F1 Score': entry['f1_score']}
        for name, entry in trained.items()
    ]).sort_values('Accuracy', ascending=False)
    top_row = ranking.iloc[0]
    top_name = top_row['Model']
    top_model = trained[top_name]['model']

    print(f"Using best model: {top_name}")
    print(f"Best accuracy: {top_row['Accuracy']:.4f}")

    # Hand-written HTML snippets used as smoke-test inputs
    test_samples = [
        "<html><head><title>Google</title></head><body>Welcome to Google</body></html>",
        "<html><head><title>Secure Bank Login</title></head><body><form>Enter password</form></body></html>",
        "<html><script>window.location='phishing-site.com'</script></html>",
        "<html><body>Click here to verify your account: <a href='fake-bank.com'>Verify</a></body></html>"
    ]

    print("\n🧪 Testing prediction function with sample HTML codes:")
    print("=" * 60)

    for sample_number, html in enumerate(test_samples, start=1):
        outcome = predict_webpage_status(html, top_model, tfidf_vectorizer)
        if not outcome:
            continue
        print(
            f"Sample {sample_number}: {outcome['status']} (confidence: {outcome['confidence']:.3f})")
        print(f"  HTML: {html[:50]}...")
        print(f"  Prob Legitimate: {outcome['probability_legitimate']:.3f}")
        print(f"  Prob Phishing: {outcome['probability_phishing']:.3f}")
        print()

# Note: Run test_prediction_function() after training models

In [None]:
# 🎯 Batch Processing Workflow Summary
# Prints a fixed, human-readable description of the notebook's workflow. The
# text is a static string literal: it is not derived from what actually ran in
# this session, so it must be kept in sync with the cells above by hand.
print("""
📊 BATCH PROCESSING WORKFLOW FOR PHISHING DETECTION
==================================================

This notebook implements an efficient batch processing approach for handling large datasets:

🔄 WORKFLOW STEPS:
1. ✅ Load dataset in chunks (2000 rows at a time)
2. ✅ Analyze dataset structure and statistics 
3. ✅ Create stratified train-test split
4. ✅ Vectorize data in batches using TF-IDF
5. ✅ Save vectorized data and vectorizer to disk
6. ✅ Train multiple models on processed data
7. ✅ Save trained models for future use
8. ✅ Evaluate and compare model performance

💾 SAVED ARTIFACTS:
- tfidf_vectorizer.joblib      (TF-IDF vectorizer)
- X_train_tfidf.npz           (Training features)
- X_test_tfidf.npz            (Test features) 
- y_train.npy                 (Training labels)
- y_test.npy                  (Test labels)
- [model_name].joblib         (Trained models)

🚀 MEMORY MANAGEMENT:
- Processes data in 2000-row batches
- Saves intermediate results to disk
- Cleans up memory after each batch
- Monitors memory usage throughout

📈 MODELS TRAINED:
- Random Forest
- XGBoost  
- LightGBM
- Extra Trees
- Naive Bayes
- Logistic Regression

🎯 USAGE:
1. Run all cells in sequence to process data and train models
2. Use load_processed_data() to reload saved data
3. Use predict_webpage_status() for new predictions
4. Use test_prediction_function() to test the prediction pipeline

💡 BENEFITS:
- Handles datasets larger than available RAM
- Reproducible results with saved artifacts  
- Memory efficient processing
- Easy to resume from saved state
""")

# Print current status: where the notebook is running and which paths it uses
print("\n📍 CURRENT STATUS:")
print(f"Working directory: {os.getcwd()}")
print(f"Artifact directory: {ARTIFACT_DIR}")
print(f"Dataset path: {dataset_path}")

# The saved vectorizer is used as the marker that processing has been run
vectorizer_artifact = os.path.join(ARTIFACT_DIR, "tfidf_vectorizer.joblib")
print(
    "✅ Processed data available"
    if os.path.exists(vectorizer_artifact)
    else "⏳ Processed data not yet created - run processing cells first"
)

# Memory status
print_memory_usage("(current)")

# Phishing URL Detection

In [None]:
# Load the URL-level dataset used by the URL model cells below.
# NOTE(review): this path is hard-coded to the Kaggle input directory; unlike
# the webpage dataset, there is no local-path fallback here.
url_dataSet = pd.read_csv(
    "/kaggle/input/phising-website-url-dataset/new_data_urls.csv"
)
# Preview the first rows (displayed as the cell output)
url_dataSet.head()

In [None]:
import os
import shutil

def move_files_to_tfidf_directory(source_dir="/kaggle/working/",
                                  target_dir="/kaggle/working/tfidf/"):
    """
    Move every regular file directly inside source_dir into target_dir.

    Subdirectories of source_dir (including target_dir itself when it is
    nested inside the source) are left untouched. Progress and a listing of
    the target directory are printed; nothing is returned.

    Parameters:
    source_dir (str): Directory whose files are moved. Defaults to the Kaggle
        working directory.
    target_dir (str): Destination directory; created if it does not exist.
        Defaults to the 'tfidf' folder inside the Kaggle working directory.
    """
    print("🔄 Moving files to TF-IDF directory...")

    # Create target directory if it doesn't exist
    os.makedirs(target_dir, exist_ok=True)
    print(f"✓ Created directory: {target_dir}")

    try:
        # Only regular files are moved; directories are skipped
        files_in_source = [
            entry for entry in os.listdir(source_dir)
            if os.path.isfile(os.path.join(source_dir, entry))
        ]

        if not files_in_source:
            print("ℹ️ No files found in source directory to move.")
            return

        print(f"📁 Found {len(files_in_source)} files to move:")

        moved_count = 0
        for filename in files_in_source:
            source_path = os.path.join(source_dir, filename)
            target_path = os.path.join(target_dir, filename)

            try:
                shutil.move(source_path, target_path)
            except OSError as e:
                # Keep going: one unmovable file should not block the rest
                print(f"  ❌ Error moving {filename}: {e}")
            else:
                print(f"  ✓ Moved: {filename}")
                moved_count += 1

        print(f"\n🎉 Successfully moved {moved_count}/{len(files_in_source)} files!")

        # Verify the move by listing what the target directory now holds
        files_in_target = [
            entry for entry in os.listdir(target_dir)
            if os.path.isfile(os.path.join(target_dir, entry))
        ]
        print(f"📂 Files now in {target_dir}: {len(files_in_target)}")

        if files_in_target:
            print("📋 Files in TF-IDF directory:")
            for i, filename in enumerate(files_in_target, 1):
                filepath = os.path.join(target_dir, filename)
                size_mb = os.path.getsize(filepath) / (1024 * 1024)
                print(f"  {i}. {filename} ({size_mb:.2f} MB)")

    except OSError as e:
        print(f"❌ Error accessing source directory: {e}")



# Complete workflow function
def organize_files_for_tfidf():
    """
    Move the working-directory files into the TF-IDF folder and verify it.

    After the move, reports how many items the TF-IDF directory holds and
    which of the five expected TF-IDF artifacts are present or missing.
    """
    print("🎯 ORGANIZING FILES FOR TF-IDF PROCESSING")
    print("=" * 50)

    # Step 1: Move files
    move_files_to_tfidf_directory()

    # Step 2: Verify organization
    print("\n🔍 Verifying file organization...")
    tfidf_dir = "/kaggle/working/tfidf/"
    if os.path.exists(tfidf_dir):
        files = os.listdir(tfidf_dir)
        print(f"✓ TF-IDF directory contains {len(files)} items")

        # Artifacts that the processing cells are expected to have produced
        required_tfidf_files = [
            "tfidf_vectorizer.joblib",
            "X_train_tfidf.npz",
            "X_test_tfidf.npz",
            "y_train.npy",
            "y_test.npy"
        ]
        found_files = [f for f in required_tfidf_files if f in files]
        missing_files = [f for f in required_tfidf_files if f not in files]

        if found_files:
            print(f"✓ Found TF-IDF files: {found_files}")
        if missing_files:
            print(f"⚠️ Missing TF-IDF files: {missing_files}")

    print("\n✅ File organization complete!")

# Run the organization (moves files under /kaggle/working/ into its tfidf/
# subfolder and prints a verification report)
organize_files_for_tfidf()

In [None]:
# Prepare features and target for URL dataset
from sklearn.model_selection import train_test_split
url_X = url_dataSet['url']  # raw URL strings used as the model input
# Target column. NOTE(review): there is no fallback if the column name
# differs; a missing 'status' column raises KeyError on this line.
url_y = url_dataSet['status']

print(f"URL dataset shape: {url_dataSet.shape}")
print(f"Class distribution:\n{url_y.value_counts()}")

# Split into train/test sets (80/20, stratified on the target, fixed seed)
url_X_train, url_X_test, url_y_train, url_y_test = train_test_split(
    url_X, url_y, test_size=0.2, random_state=42, stratify=url_y
)
print(f"Train size: {len(url_X_train)}, Test size: {len(url_X_test)}")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# `pipelines` is not defined by any earlier cell of this notebook, so running
# the cells top to bottom failed here with NameError. If a mapping named
# `pipelines` already exists it is used unchanged; otherwise a small default
# set is built. Each default pipeline turns raw URL strings into character
# n-gram TF-IDF features and ends in a classifier that supports predict_proba,
# which predict_url_status relies on.
# NOTE(review): the default models/hyper-parameters are a starting point, not
# a recovered original - adjust them to match the intended experiment.
try:
    pipelines
except NameError:
    pipelines = {
        'Logistic Regression (char TF-IDF)': Pipeline([
            ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
            ('clf', LogisticRegression(random_state=42, max_iter=1000)),
        ]),
        'Naive Bayes (char TF-IDF)': Pipeline([
            ('tfidf', TfidfVectorizer(analyzer='char', ngram_range=(1, 3))),
            ('clf', MultinomialNB()),
        ]),
    }

# Model name -> metrics, test-set predictions and fitted pipeline
url_results = {}
print("Training and evaluating URL models...")
for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")
    try:
        pipeline.fit(url_X_train, url_y_train)
        y_pred = pipeline.predict(url_X_test)
        acc = accuracy_score(url_y_test, y_pred)
        f1 = f1_score(url_y_test, y_pred)
        url_results[name] = {'accuracy': acc, 'f1_score': f1,
                             'predictions': y_pred, 'model': pipeline}
        print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    except Exception as e:
        # Best-effort: a failing pipeline is reported and skipped
        print(f"Error training {name}: {e}")

In [None]:
# Compare URL model performances
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# One row per trained URL model, best accuracy first
comparison_url = [
    {'Model': name, 'Accuracy': result['accuracy'], 'F1 Score': result['f1_score']}
    for name, result in url_results.items()
]
comparison_url_df = pd.DataFrame(comparison_url).sort_values(
    'Accuracy', ascending=False)
print(comparison_url_df.to_string(index=False))

# The best URL model is the one with the highest test accuracy
best_url_row = comparison_url_df.iloc[0]
best_url_model_name = best_url_row['Model']
best_url_model = url_results[best_url_model_name]['model']
print(f"\nBest URL model: {best_url_model_name}")
print(f"Accuracy: {best_url_row['Accuracy']:.4f}")

# Visualize accuracy and F1 score (both series are drawn at the same x
# positions, the F1 bars semi-transparent on top of the accuracy bars)
plt.figure(figsize=(10, 4))
model_labels = comparison_url_df['Model']
plt.bar(model_labels, comparison_url_df['Accuracy'],
        color='skyblue', label='Accuracy')
plt.bar(model_labels, comparison_url_df['F1 Score'],
        color='lightcoral', alpha=0.7, label='F1 Score')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Score')
plt.title('Phishing URL Model Performance')
plt.legend()
plt.tight_layout()
plt.show()

# Confusion matrix for best model
# NOTE(review): the tick labels assume the sorted label order puts the
# legitimate class first - confirm against the 'status' column encoding.
url_class_names = ['Legitimate', 'Phishing']
y_pred_best = url_results[best_url_model_name]['predictions']
cm = confusion_matrix(url_y_test, y_pred_best)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=url_class_names, yticklabels=url_class_names)
plt.title(f'Confusion Matrix - {best_url_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Create a function to predict URL status
def predict_url_status(url, model=None, legitimate_label=1):
    """
    Predict whether a URL is legitimate or phishing from its text.

    The predicted label equal to ``legitimate_label`` is reported as
    'Legitimate'; any other label is reported as 'Phishing'. Probabilities are
    matched to labels through the model's ``classes_`` attribute, so 'status'
    and the two probability fields always agree with each other.

    NOTE(review): the default assumes the 'status' column encodes legitimate
    URLs as 1 (this matches the status mapping this function already used) -
    confirm against the dataset and pass ``legitimate_label`` otherwise.

    Parameters:
    url (str): The URL to predict
    model: Fitted model exposing predict and predict_proba on a list of raw
        URLs (default: the module-level best_url_model)
    legitimate_label: The class label that means "legitimate"

    Returns:
    dict: Keys 'prediction', 'status', 'confidence',
        'probability_legitimate' and 'probability_phishing'

    Raises:
    ValueError: If ``legitimate_label`` is not one of the model's classes.
    """
    if model is None:
        model = best_url_model

    # Make prediction
    prediction = model.predict([url])[0]

    # Get prediction probabilities (one entry per class, in classes_ order)
    probabilities = model.predict_proba([url])[0]

    # Pair each probability with its class label instead of assuming that
    # position 0 is "legitimate"; fall back to labels (0, 1) when the model
    # does not expose classes_.
    classes = getattr(model, "classes_", None)
    if classes is None:
        classes = (0, 1)
    probability_by_label = dict(zip(list(classes), probabilities))

    if legitimate_label not in probability_by_label:
        raise ValueError(
            f"legitimate_label {legitimate_label!r} is not among the model "
            f"classes {list(probability_by_label)}")

    probability_legitimate = probability_by_label[legitimate_label]
    probability_phishing = sum(
        probability
        for label, probability in probability_by_label.items()
        if label != legitimate_label
    )

    # Create result dictionary
    result = {
        'prediction': prediction,
        'status': 'Legitimate' if prediction == legitimate_label else 'Phishing',
        'confidence': max(probabilities),
        'probability_legitimate': probability_legitimate,
        'probability_phishing': probability_phishing
    }

    return result

In [None]:
# Smoke-test predict_url_status on a few hand-written hostnames. These are
# illustrative inputs only; no expected labels are checked.
url_sample = ["google.com", "facebook.com", "phishing-test.com",
              "example.com", "malicious-site.com", 'facebook-test.com']

print("\nTesting URL prediction function:")
for url in url_sample:
    # Uses the default model (the module-level best_url_model)
    result = predict_url_status(url)
    print(f"URL: {url} | Prediction: {result['status']} | "
          f"Confidence: {result['confidence']:.4f} | "
          f"Prob Legitimate: {result['probability_legitimate']:.4f} | "
          f"Prob Phishing: {result['probability_phishing']:.4f}")