In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from hummingbird.ml import convert
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

In [4]:
# Import necessary libraries


# Load the preprocessed dataset (replace the path with your actual file path)
data = pd.read_csv('../data/clean_data.csv')

# Define features (X) and target (y)
X = data.copy()
y = X.pop('label')

# Remove the 'url' column as it is not a feature
X.pop('url')

# Convert the target labels to binary format
y = y.map({'phishing': 1, 'legitimate': 0})

# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Initialize the models
log_reg = LogisticRegression(max_iter=1000)  # Increased max_iter
tree_clf = DecisionTreeClassifier()
rf_clf = RandomForestClassifier()
svm_clf = SVC()


param_grid_tree_clf = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

param_grid_rf_clf = {
    'n_estimators': [10, 50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

param_grid_svm_clf = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf']
}



# Analysis of logisitc regression

## Default Model:  
- Trains a Logistic Regression model with default settings.
- Evaluates the model on the test data.

## Tuned Model:  

- Uses GridSearchCV to find the best settings (hyperparameters) for the Logistic Regression model.
- Trains the model with these best settings.
- Evaluates the tuned model on the test data.

## Cross-Validation:  

- Checks the average performance of both the default and tuned models using cross-validation (splitting the data into 5 parts and training/testing on each part).

## Results:  


### Default Model:  
- Accuracy: 0.999978 

### Tuned Model:  
- Accuracy: 0.999990 (slightly better than default)

### Cross-Validation:  
- Default model average accuracy: 0.999941
- Tuned model average accuracy: 0.999975
- Both models perform extremely well, with the tuned model being slightly better.

In [3]:
# Train the Logistic Regression model with default parameters
default_log_reg = LogisticRegression(max_iter=1000)
default_log_reg.fit(X_train, y_train)

# Evaluate the default model
default_log_reg_pred = default_log_reg.predict(X_test)
print("Default Logistic Regression Accuracy: ", accuracy_score(y_test, default_log_reg_pred))
print("Default Logistic Regression Report:\n", classification_report(y_test, default_log_reg_pred))

# Perform GridSearchCV for hyperparameter tuning with parallelization
param_grid_log_reg = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs']
}
grid_log_reg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_log_reg, cv=5, scoring='accuracy', n_jobs=-1)
grid_log_reg.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_log_reg = grid_log_reg.best_estimator_

# Evaluate the best model
log_reg_pred = best_log_reg.predict(X_test)
print("Tuned Logistic Regression Accuracy: ", accuracy_score(y_test, log_reg_pred))
print("Tuned Logistic Regression Report:\n", classification_report(y_test, log_reg_pred))


# Perform cross-validation on the default model
default_scores = cross_val_score(default_log_reg, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Default Logistic Regression Cross-Validation Accuracy: ", default_scores.mean())

# Perform cross-validation on the tuned model
tuned_scores = cross_val_score(best_log_reg, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Tuned Logistic Regression Cross-Validation Accuracy: ", tuned_scores.mean())

Default Logistic Regression Accuracy:  0.9999779999559999
Default Logistic Regression Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    249632
           1       1.00      1.00      1.00    250367

    accuracy                           1.00    499999
   macro avg       1.00      1.00      1.00    499999
weighted avg       1.00      1.00      1.00    499999

Tuned Logistic Regression Accuracy:  0.9999899999799999
Tuned Logistic Regression Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    249632
           1       1.00      1.00      1.00    250367

    accuracy                           1.00    499999
   macro avg       1.00      1.00      1.00    499999
weighted avg       1.00      1.00      1.00    499999

Default Logistic Regression Cross-Validation Accuracy:  0.9999409998062493
Tuned Logistic Regression Cross-Validation Accuracy:  0.9999749999149996


# Analysis of Decision Tree

## Default Model:

- Trains a Decision Tree model with default settings.
- Evaluates the model on the test data.

## Tuned Model:

- Uses GridSearchCV to find the best settings (hyperparameters) for the Decision Tree model.
- Trains the model with these best settings.
- Evaluates the tuned model on the test data.

## Cross-Validation:
- Checks the average performance of both the default and tuned models using cross-validation (splitting the data into 5 parts and training/testing on each part).

## Results:
Default Model:
- Accuracy: 0.999972
-  Detailed report shows precision, recall, and F1-score all close to 1.00 for both classes (0 and 1)

Tuned Model:
- Accuracy: 0.999972 (same as default)
- Detailed report shows precision, recall, and F1-score all close to 1.00 for both classes (0 and 1)

Cross-Validation:
- Default model average accuracy: 0.999985
- Tuned model average accuracy: 0.999986

In [5]:
# Train the Decision Tree model with default parameters
default_tree_clf = DecisionTreeClassifier()
default_tree_clf.fit(X_train, y_train)

# Evaluate the default model
default_tree_clf_pred = default_tree_clf.predict(X_test)
print("Default Decision Tree Accuracy: ", accuracy_score(y_test, default_tree_clf_pred))
print("Default Decision Tree Report:\n", classification_report(y_test, default_tree_clf_pred))

grid_tree_clf = GridSearchCV(DecisionTreeClassifier(), param_grid_tree_clf, cv=5, scoring='accuracy', n_jobs=-1)
grid_tree_clf.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_tree_clf = grid_tree_clf.best_estimator_

# Evaluate the best model
tree_clf_pred = best_tree_clf.predict(X_test)
print("Tuned Decision Tree Accuracy: ", accuracy_score(y_test, tree_clf_pred))
print("Tuned Decision Tree Report:\n", classification_report(y_test, tree_clf_pred))



# Perform cross-validation on the default model
default_tree_scores = cross_val_score(default_tree_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Default Decision Tree Cross-Validation Accuracy: ", default_tree_scores.mean())

# Perform cross-validation on the tuned model
tuned_tree_scores = cross_val_score(best_tree_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Tuned Decision Tree Cross-Validation Accuracy: ", tuned_tree_scores.mean())

Default Decision Tree Accuracy:  0.9999719999439999
Default Decision Tree Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    249632
           1       1.00      1.00      1.00    250367

    accuracy                           1.00    499999
   macro avg       1.00      1.00      1.00    499999
weighted avg       1.00      1.00      1.00    499999

Tuned Decision Tree Accuracy:  0.9999719999439999
Tuned Decision Tree Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    249632
           1       1.00      1.00      1.00    250367

    accuracy                           1.00    499999
   macro avg       1.00      1.00      1.00    499999
weighted avg       1.00      1.00      1.00    499999

Default Decision Tree Cross-Validation Accuracy:  0.9999849999474998
Tuned Decision Tree Cross-Validation Accuracy:  0.9999854999499999


In [None]:
# 
# 




In [6]:
# Train the Random Forest model with default parameters
default_rf_clf = RandomForestClassifier()
default_rf_clf.fit(X_train, y_train)

# Evaluate the default model
default_rf_clf_pred = default_rf_clf.predict(X_test)
print("Default Random Forest Accuracy: ", accuracy_score(y_test, default_rf_clf_pred))
print("Default Random Forest Report:\n", classification_report(y_test, default_rf_clf_pred))


grid_rf_clf = GridSearchCV(RandomForestClassifier(), param_grid_rf_clf, cv=5, scoring='accuracy', n_jobs=-1)
grid_rf_clf.fit(X_train, y_train)

# Get the best model from GridSearchCV
best_rf_clf = grid_rf_clf.best_estimator_

# Evaluate the best model
rf_clf_pred = best_rf_clf.predict(X_test)
print("Tuned Random Forest Accuracy: ", accuracy_score(y_test, rf_clf_pred))
print("Tuned Random Forest Report:\n", classification_report(y_test, rf_clf_pred))

from sklearn.model_selection import cross_val_score

# Perform cross-validation on the default model
default_rf_scores = cross_val_score(default_rf_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Default Random Forest Cross-Validation Accuracy: ", default_rf_scores.mean())

# Perform cross-validation on the tuned model
tuned_rf_scores = cross_val_score(best_rf_clf, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1)
print("Tuned Random Forest Cross-Validation Accuracy: ", tuned_rf_scores.mean())

Default Random Forest Accuracy:  1.0
Default Random Forest Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    249632
           1       1.00      1.00      1.00    250367

    accuracy                           1.00    499999
   macro avg       1.00      1.00      1.00    499999
weighted avg       1.00      1.00      1.00    499999



KeyboardInterrupt: 

In [7]:
from sklearn.svm import SVC
from hummingbird.ml import convert
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Define the SVM classifier
svm_clf = SVC()

# Define parameter grid for SVM
param_grid_svm_clf = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Initialize GridSearchCV with SVM classifier
grid_svm_clf = GridSearchCV(svm_clf, param_grid_svm_clf, cv=5, scoring='accuracy', n_jobs=-1)

try:
    # Fit the model
    grid_svm_clf.fit(X_train, y_train)
    best_svm_clf = grid_svm_clf.best_estimator_

    # Convert the best SVM model to a Hummingbird model
    hb_model = convert(best_svm_clf, 'pytorch')
    hb_model.to('cuda')

    # Make predictions using the Hummingbird model
    svm_clf_pred = hb_model.predict(X_test)

    # Print accuracy and classification report
    print("SVM Accuracy: ", accuracy_score(y_test, svm_clf_pred))
    print("SVM Report:\n", classification_report(y_test, svm_clf_pred))

except Exception as e:
    print(f"An error occurred during GridSearchCV: {e}")

KeyboardInterrupt: 