# Predict from web_code

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
import joblib
from scipy.sparse import save_npz, load_npz, vstack

In [None]:
# dataset = pd.read_csv(
#     "/content/drive/MyDrive/Colab/phishing/phishing_complete_dataset.csv",
#     sep=",",
#     quotechar='"',
#     nrows=10000
# )

# dataset.head()

In [None]:
# # Check dataset shape and class distribution
# print(f"Dataset shape: {dataset.shape}")
# print(f"\nClass distribution:")
# print(dataset['result'].value_counts())
# print(f"\nClass distribution (percentages):")
# print(dataset['result'].value_counts(normalize=True) * 100)

# # Check for missing values
# print(f"\nMissing values:")
# print(dataset.isnull().sum())

# # Check length of webpage_code
# dataset['code_length'] = dataset['webpage_code'].str.len()
# print(f"\nWebpage code length statistics:")
# print(dataset['code_length'].describe())

In [None]:
# # Prepare the data
# # Features (X) = webpage_code, Target (y) = result
# X = dataset['webpage_code']
# y = dataset['result']

# print(f"Features shape: {X.shape}")
# print(f"Target shape: {y.shape}")
# print(f"Target distribution:\n{y.value_counts()}")

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# del X
# del y
# del dataset

# print(f"\nTraining set size: {len(X_train)}")
# print(f"Test set size: {len(X_test)}")
# print(f"Training target distribution:\n{y_train.value_counts()}")
# print(f"Test target distribution:\n{y_test.value_counts()}")

In [None]:
# # Vectorize once for both TF-IDF and CountVectorizer, then release raw data
# tfidf_vectorizer = TfidfVectorizer(
#     max_features=5000, stop_words='english', ngram_range=(1, 2))
# count_vectorizer = CountVectorizer(
#     max_features=5000, stop_words='english', ngram_range=(1, 2))

# # Fit and transform training data
# X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
# X_test_tfidf = tfidf_vectorizer.transform(X_test)
# X_train_count = count_vectorizer.fit_transform(X_train)
# X_test_count = count_vectorizer.transform(X_test)

# # Release raw text data from memory
# del X_train
# del X_test

In [None]:

# Load the pre-fitted vectorizers and the pre-computed sparse feature matrices
# instead of re-vectorizing the raw HTML in this session.
# NOTE: joblib.load unpickles its input, which can execute arbitrary code;
# only load artifacts from a trusted source.
tfidf_vectorizer = joblib.load("/kaggle/input/tf-idf-dataset/tfidf_vectorizer.joblib")
count_vectorizer = joblib.load("/kaggle/input/count-dataset/count_vectorizer.joblib")

X_train_tfidf = load_npz("/kaggle/input/tf-idf-dataset/X_train_tfidf.npz")
X_test_tfidf = load_npz("/kaggle/input/tf-idf-dataset/X_test_tfidf.npz")

X_train_count = load_npz("/kaggle/input/count-dataset/X_train_count.npz")
X_test_count = load_npz("/kaggle/input/count-dataset/X_test_count.npz")

# Each dataset ships its own copy of the labels. Keep both copies so they can
# be compared instead of letting the second load silently replace the first.
y_train_tfidf = np.load("/kaggle/input/tf-idf-dataset/y_train_tfidf.npy")
y_test_tfidf = np.load("/kaggle/input/tf-idf-dataset/y_test_tfidf.npy")
y_train_count = np.load("/kaggle/input/count-dataset/y_train_count.npy")
y_test_count = np.load("/kaggle/input/count-dataset/y_test_count.npy")

# The cells below train both feature families against a single y_train/y_test
# pair, which is only valid when the two label sets describe the same split.
if not (np.array_equal(y_train_tfidf, y_train_count)
        and np.array_equal(y_test_tfidf, y_test_count)):
    raise ValueError(
        "TF-IDF and Count label arrays differ; the two datasets were not "
        "built from the same train/test split."
    )

# Same final binding as before: the count-dataset copy of the labels.
y_train = y_train_count
y_test = y_test_count

In [None]:
# Per-model evaluation output, keyed by display name. Each value holds the
# accuracy, F1 score, test-set predictions and the fitted estimator.
results = {}

# Candidate classifiers trained on the TF-IDF feature matrix.
classifiers_tfidf = {
    'Gradient Boosting (TF-IDF)': GradientBoostingClassifier(random_state=42),
    'XGBoost (TF-IDF)': xgb.XGBClassifier(random_state=42, verbosity=0),
    'LightGBM (TF-IDF)': lgb.LGBMClassifier(random_state=42, verbose=-1),
    'Extra Trees (TF-IDF)': ExtraTreesClassifier(n_estimators=100, random_state=42)
}

# Candidate classifiers trained on the raw term-count feature matrix.
classifiers_count = {
    'Gradient Boosting (Count)': GradientBoostingClassifier(random_state=42),
    'XGBoost (Count)': xgb.XGBClassifier(random_state=42, verbosity=0),
    'Extra Trees (Count)': ExtraTreesClassifier(n_estimators=100, random_state=42)
}


def _fit_and_score(classifiers, features_train, features_test,
                   labels_train, labels_test, scores):
    """Fit each classifier, score it on the test split and record the outcome.

    Parameters:
    classifiers (dict): display name -> unfitted estimator
    features_train, features_test: feature matrices for fitting / scoring
    labels_train, labels_test: matching label arrays
    scores (dict): updated in place with one entry per successful model

    A model that fails is reported and skipped so that the remaining
    candidates still get trained.
    """
    for name, clf in classifiers.items():
        print(f"\nTraining {name}...")
        try:
            clf.fit(features_train, labels_train)
            y_pred = clf.predict(features_test)
            accuracy = accuracy_score(labels_test, y_pred)
            f1 = f1_score(labels_test, y_pred)
        except Exception as e:
            # Deliberate best-effort boundary: one broken model must not
            # abort the whole comparison run.
            print(f"Error training {name}: {e}")
            continue

        scores[name] = {
            'accuracy': accuracy,
            'f1_score': f1,
            'predictions': y_pred,
            'model': clf
        }
        # Report metrics per model, like the URL training cell does.
        print(f"Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}")


print("Training and evaluating models...")
print("=" * 50)

# Train TF-IDF models
_fit_and_score(classifiers_tfidf, X_train_tfidf, X_test_tfidf,
               y_train, y_test, results)

# Train CountVectorizer models
_fit_and_score(classifiers_count, X_train_count, X_test_count,
               y_train, y_test, results)

print(f"\n{'='*50}")
print("Training completed!")

In [None]:
# Compare model performances
print("Model Performance Comparison:")
print("=" * 60)

# The training cell skips models that raise, so `results` can be empty.
# Fail with a clear message instead of an opaque KeyError from sort_values.
if not results:
    raise RuntimeError("No model finished training; there is nothing to compare.")

# One row per trained model, best accuracy first.
comparison_df = pd.DataFrame([
    {
        'Model': name,
        'Accuracy': result['accuracy'],
        'F1 Score': result['f1_score'],
    }
    for name, result in results.items()
]).sort_values('Accuracy', ascending=False)
print(comparison_df.to_string(index=False))

# The top row after sorting is the best model by accuracy.
best_row = comparison_df.iloc[0]
best_model_name = best_row['Model']
best_model = results[best_model_name]['model']
print(f"\nBest performing model: {best_model_name}")
print(f"Best accuracy: {best_row['Accuracy']:.4f}")

# Visualize results: one bar chart per metric, side by side.
plt.figure(figsize=(12, 5))
bar_positions = range(len(comparison_df))
metric_panels = [('Accuracy', 'skyblue'), ('F1 Score', 'lightcoral')]

for panel_number, (metric, bar_color) in enumerate(metric_panels, start=1):
    plt.subplot(1, 2, panel_number)
    plt.bar(bar_positions, comparison_df[metric], color=bar_color)
    plt.xlabel('Models')
    plt.ylabel(metric)
    plt.title(f'Model {metric} Comparison')
    plt.xticks(bar_positions, comparison_df['Model'], rotation=45, ha='right')
    # Both metrics live in [0, 1]; a fixed axis keeps the panels comparable.
    plt.ylim(0, 1)

plt.tight_layout()
plt.show()

In [None]:
# Show how the best model's test-set predictions are distributed across the
# actual classes (rows) and predicted classes (columns).
best_predictions = results[best_model_name]['predictions']
cm = confusion_matrix(y_test, best_predictions)
class_names = ['Legitimate', 'Phishing']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f'Confusion Matrix - {best_model_name}')
plt.show()

In [None]:
# Function to predict if a webpage is phishing or legitimate
def predict_webpage_status(webpage_code, model=None, vectorizer=None):
    """
    Predict if a webpage is phishing (1) or legitimate (0) based on its HTML code.

    The classifiers in this notebook are fitted on pre-vectorized feature
    matrices, so raw HTML has to go through the matching vectorizer before it
    reaches the model.

    Parameters:
    webpage_code (str): The HTML code of the webpage
    model: The trained model to use for prediction (default: best model)
    vectorizer: Fitted vectorizer that turns raw HTML into the features the
        model was trained on. When `model` is omitted, this defaults to the
        vectorizer matching the best model's name (TF-IDF or Count). When a
        custom `model` is given without a vectorizer, the raw text is passed
        through unchanged, which suits models that vectorize internally
        (e.g. a Pipeline).

    Returns:
    dict: Prediction result with probability scores
    """
    if model is None:
        model = best_model
        if vectorizer is None:
            # Model names end in "(TF-IDF)" or "(Count)"; use that suffix to
            # pick the feature space the model was fitted on.
            vectorizer = (tfidf_vectorizer if 'TF-IDF' in best_model_name
                          else count_vectorizer)

    # Build the single-sample batch once and reuse it for both calls.
    features = [webpage_code]
    if vectorizer is not None:
        features = vectorizer.transform(features)

    # Make prediction
    prediction = model.predict(features)[0]

    # Get prediction probabilities
    probabilities = model.predict_proba(features)[0]

    # predict_proba columns follow model.classes_; map through it rather than
    # assuming a fixed column order. Fall back to (0, 1) if the model does not
    # expose classes_. A class absent from training gets probability 0.0.
    class_labels = getattr(model, 'classes_', (0, 1))
    proba_by_class = dict(zip(class_labels, probabilities))

    # Create result dictionary
    result = {
        'prediction': prediction,
        'status': 'Phishing' if prediction == 1 else 'Legitimate',
        'confidence': max(probabilities),
        'probability_legitimate': proba_by_class.get(0, 0.0),
        'probability_phishing': proba_by_class.get(1, 0.0)
    }

    return result


# Test the function with a sample from the test set
# NOTE(review): X_test must hold the raw HTML strings of the test split. No
# active cell visible in this notebook assigns it (the split cell is commented
# out), so confirm it is available before running this cell.
# y_test is loaded with np.load, so it is a NumPy array without the
# pandas-only `.iloc`; index it positionally instead (this also works if
# y_test is a pandas Series).
test_labels = np.asarray(y_test)

sample_index = 22
sample_code = X_test.iloc[sample_index]
actual_label = test_labels[sample_index]

prediction_result = predict_webpage_status(sample_code)

print("Testing the prediction function:")
print("=" * 40)
print(
    f"Actual label: {actual_label} ({'Phishing' if actual_label == 1 else 'Legitimate'})")
print(f"Predicted: {prediction_result['status']}")
print(f"Confidence: {prediction_result['confidence']:.4f}")
print(
    f"Probability Legitimate: {prediction_result['probability_legitimate']:.4f}")
print(f"Probability Phishing: {prediction_result['probability_phishing']:.4f}")

# Test with a few more samples. These are the first five rows of the test
# split, not a random draw, so the heading says so.
print("\nTesting with the first 5 samples:")
print("=" * 50)
for i in range(5):
    sample_code = X_test.iloc[i]
    actual_label = test_labels[i]
    prediction_result = predict_webpage_status(sample_code)

    correct = "✓" if prediction_result['prediction'] == actual_label else "✗"
    print(f"Sample {i+1}: Actual: {actual_label}, Predicted: {prediction_result['prediction']}, "
          f"Confidence: {prediction_result['confidence']:.3f} {correct}")

# Phishing URL Detection

In [None]:
# Read at most the first 400,000 rows of the URL dataset and preview them.
url_dataSet = pd.read_csv("/content/drive/MyDrive/Colab/phishing/new_data_urls.csv", nrows=400_000)
url_dataSet.head()

In [None]:
# Separate the URL text (model input) from the status column (label), then
# hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split

url_X, url_y = url_dataSet['url'], url_dataSet['status']

print(f"URL dataset shape: {url_dataSet.shape}")
print(f"Class distribution:\n{url_y.value_counts()}")

# 80/20 split, stratified on the label so both parts keep the class balance;
# the fixed seed makes the split reproducible.
split_parts = train_test_split(
    url_X,
    url_y,
    test_size=0.2,
    random_state=42,
    stratify=url_y,
)
url_X_train, url_X_test, url_y_train, url_y_test = split_parts
print(f"Train size: {len(url_X_train)}, Test size: {len(url_X_test)}")

In [None]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# NOTE(review): no active cell visible in this notebook defines `pipelines`,
# so the loop below raised NameError. The defaults here are only built when an
# earlier cell has not already provided the dict; adjust the candidates as
# needed. Both final estimators expose predict_proba, which
# predict_url_status relies on.
if 'pipelines' not in globals():
    pipelines = {
        'Logistic Regression (char TF-IDF)': Pipeline([
            # URLs have no natural word boundaries, so use character n-grams.
            ('tfidf', TfidfVectorizer(
                analyzer='char', ngram_range=(1, 3), max_features=5000)),
            ('clf', LogisticRegression(max_iter=1000)),
        ]),
        'Naive Bayes (char TF-IDF)': Pipeline([
            ('tfidf', TfidfVectorizer(
                analyzer='char', ngram_range=(1, 3), max_features=5000)),
            ('clf', MultinomialNB()),
        ]),
    }

# Per-pipeline evaluation output, keyed by display name.
url_results = {}
print("Training and evaluating URL models...")
for name, pipeline in pipelines.items():
    print(f"\nTraining {name}...")
    try:
        pipeline.fit(url_X_train, url_y_train)
        y_pred = pipeline.predict(url_X_test)
        acc = accuracy_score(url_y_test, y_pred)
        f1 = f1_score(url_y_test, y_pred)
        url_results[name] = {'accuracy': acc, 'f1_score': f1,
                             'predictions': y_pred, 'model': pipeline}
        print(f"Accuracy: {acc:.4f}, F1 Score: {f1:.4f}")
    except Exception as e:
        # Deliberate best-effort boundary: report the failure and continue
        # with the remaining pipelines.
        print(f"Error training {name}: {e}")

In [None]:
# Compare URL model performances
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# The training loop skips pipelines that raise, so url_results can be empty.
# Fail with a clear message instead of an opaque KeyError from sort_values.
if not url_results:
    raise RuntimeError("No URL model finished training; there is nothing to compare.")

# One row per trained pipeline, best accuracy first.
comparison_url = [
    {
        'Model': name,
        'Accuracy': result['accuracy'],
        'F1 Score': result['f1_score'],
    }
    for name, result in url_results.items()
]

comparison_url_df = pd.DataFrame(
    comparison_url).sort_values('Accuracy', ascending=False)
print(comparison_url_df.to_string(index=False))

best_url_row = comparison_url_df.iloc[0]
best_url_model_name = best_url_row['Model']
best_url_model = url_results[best_url_model_name]['model']
print(f"\nBest URL model: {best_url_model_name}")
print(f"Accuracy: {best_url_row['Accuracy']:.4f}")

# Visualize accuracy and F1 score as grouped bars. Drawing both series at the
# same x positions made one bar hide the other.
bar_positions = np.arange(len(comparison_url_df))
bar_width = 0.4
plt.figure(figsize=(10, 4))
plt.bar(bar_positions - bar_width / 2, comparison_url_df['Accuracy'],
        width=bar_width, color='skyblue', label='Accuracy')
plt.bar(bar_positions + bar_width / 2, comparison_url_df['F1 Score'],
        width=bar_width, color='lightcoral', label='F1 Score')
plt.xticks(bar_positions, comparison_url_df['Model'], rotation=45, ha='right')
plt.ylabel('Score')
plt.title('Phishing URL Model Performance')
plt.legend()
plt.tight_layout()
plt.show()

# Confusion matrix for best model.
# confusion_matrix orders rows/columns by sorted label value (0, then 1).
# predict_url_status in this notebook maps 1 -> 'Legitimate', so index 0 is
# the phishing class here.
# NOTE(review): confirm the `status` column encoding against the dataset.
url_class_names = ['Phishing', 'Legitimate']
y_pred_best = url_results[best_url_model_name]['predictions']
cm = confusion_matrix(url_y_test, y_pred_best)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=url_class_names, yticklabels=url_class_names)
plt.title(f'Confusion Matrix - {best_url_model_name}')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Create a function to predict URL status
def predict_url_status(url, model=None):
    """
    Predict if a URL is legitimate (1) or phishing (0) based on its text.

    Note that this label encoding is the reverse of the one used by the
    webpage classifier: here a prediction of 1 is reported as 'Legitimate'.

    Parameters:
    url (str): The URL to predict
    model: The trained model to use for prediction (default: best model).
        It must accept a list of raw URL strings and expose `predict` and
        `predict_proba`.

    Returns:
    dict: Prediction result with probability scores
    """
    if model is None:
        model = best_url_model

    # Build the single-sample batch once and reuse it for both calls.
    batch = [url]

    # Make prediction
    prediction = model.predict(batch)[0]

    # Get prediction probabilities
    probabilities = model.predict_proba(batch)[0]

    # predict_proba columns follow model.classes_. Map through it so the
    # probabilities agree with the status mapping below (class 1 is
    # legitimate, class 0 is phishing). Fall back to (0, 1) if the model does
    # not expose classes_. A class absent from training gets probability 0.0.
    class_labels = getattr(model, 'classes_', (0, 1))
    proba_by_class = dict(zip(class_labels, probabilities))

    # Create result dictionary
    result = {
        'prediction': prediction,
        'status': 'Legitimate' if prediction == 1 else 'Phishing',
        'confidence': max(probabilities),
        'probability_legitimate': proba_by_class.get(1, 0.0),
        'probability_phishing': proba_by_class.get(0, 0.0)
    }

    return result

In [None]:
# Run the URL predictor over a handful of hand-picked hostnames and print one
# summary line per URL.
url_sample = [
    "google.com",
    "facebook.com",
    "phishing-test.com",
    "example.com",
    "malicious-site.com",
    'facebook-test.com',
]

print("\nTesting URL prediction function:")
for url in url_sample:
    result = predict_url_status(url)
    summary_fields = [
        f"URL: {url}",
        f"Prediction: {result['status']}",
        f"Confidence: {result['confidence']:.4f}",
        f"Prob Legitimate: {result['probability_legitimate']:.4f}",
        f"Prob Phishing: {result['probability_phishing']:.4f}",
    ]
    print(" | ".join(summary_fields))