In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score,StratifiedKFold
from sklearn.metrics import accuracy_score,fbeta_score,precision_score,recall_score,confusion_matrix

In [None]:
!unzip 72ptz43s9v-1.zip

Archive:  72ptz43s9v-1.zip
replace dataset_small.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset_small.csv       
replace dataset_full.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: dataset_full.csv        


In [None]:
df = pd.read_csv('/content/dataset_full.csv')

In [None]:
df.shape

(88647, 112)

In [None]:
skipped_features = [
    'time_response',
    'domain_spf',
    'asn_ip',
    'qty_ip_resolved',
    'qty_nameservers',
    'qty_mx_servers',
    'ttl_hostname',
    'qty_redirects',
    'url_google_index',
    'domain_google_index', #remove url shorten and https one
    'time_domain_activation',
    'time_domain_expiration',
]


df = df.drop(skipped_features,axis=1)

class DataCleaning:

    def __init__(self, df, missing_threshold, corr_threshold):
        self.df = df
        self.missing_threshold = missing_threshold
        self.corr_threshold = corr_threshold

    def col_with_variance_0(self):
        columns_to_drop = []
        numerical_columns = [col for col in self.df.columns if self.df[col].dtype != 'O']
        for col in numerical_columns:
            if self.df[col].std() == 0:
                columns_to_drop.append(col)
        return columns_to_drop

    def get_redundant_cols(self):
        cols_missing_ratios = self.df.isna().sum().div(self.df.shape[0])
        cols_to_drop = list(cols_missing_ratios[cols_missing_ratios > self.missing_threshold].index)
        return cols_to_drop

    def dropping_columns_on_basis_of_correlation(self):
        columns_to_drop = set()
        relation = self.df.corr()
        for columns in range(len(relation.columns)):
            for rows in range(columns):
                if abs(relation.iloc[columns, rows]) > self.corr_threshold:
                    col_name = relation.columns[columns]
                    columns_to_drop.add(col_name)
        columns_to_drop = list(columns_to_drop)
        return columns_to_drop

    def feature_scaling_df(self):
        cols_to_drop_1 = self.get_redundant_cols()
        cols_to_drop_2 = self.col_with_variance_0()
        cols_to_drop_3 = self.dropping_columns_on_basis_of_correlation()
        columns_to_drop = cols_to_drop_1 + cols_to_drop_2 + cols_to_drop_3
        columns_to_drop = set(columns_to_drop)
        return columns_to_drop


clean = DataCleaning(df, 0.8, 0.8)
drop_columns = clean.feature_scaling_df()
print(drop_columns)
df2 = df.drop(columns=drop_columns)

{'qty_underline_file', 'qty_plus_params', 'qty_space_domain', 'domain_length', 'qty_questionmark_file', 'qty_asterisk_domain', 'qty_params', 'qty_space_params', 'tld_present_params', 'qty_plus_file', 'qty_tilde_domain', 'qty_hashtag_domain', 'qty_exclamation_domain', 'qty_dollar_domain', 'qty_hyphen_file', 'qty_at_file', 'qty_equal_params', 'qty_plus_directory', 'qty_and_directory', 'qty_and_params', 'qty_at_directory', 'qty_space_file', 'qty_hashtag_directory', 'qty_asterisk_params', 'qty_slash_domain', 'qty_dollar_directory', 'qty_dot_file', 'qty_hashtag_params', 'qty_exclamation_directory', 'qty_dollar_params', 'qty_questionmark_domain', 'qty_equal_domain', 'qty_asterisk_directory', 'qty_asterisk_file', 'params_length', 'qty_comma_directory', 'qty_tilde_params', 'qty_percent_domain', 'qty_equal_file', 'qty_equal_directory', 'qty_and_url', 'qty_dollar_file', 'qty_space_directory', 'qty_questionmark_directory', 'qty_tilde_file', 'qty_hashtag_file', 'qty_tilde_directory', 'qty_comma_pa

In [None]:
df2.shape

(88647, 41)

In [None]:
df2[df2.duplicated(subset=None, keep='first')]

Unnamed: 0,qty_dot_url,qty_hyphen_url,qty_underline_url,qty_slash_url,qty_questionmark_url,qty_equal_url,qty_at_url,qty_exclamation_url,qty_space_url,qty_tilde_url,...,qty_dot_params,qty_hyphen_params,qty_underline_params,qty_slash_params,qty_questionmark_params,qty_percent_params,email_in_url,tls_ssl_certificate,url_shortened,phishing
64,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,0,0,0
70,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,1,0,0
81,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,0,0,0
87,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,0,0,0
111,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88640,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,1,0,0
88641,2,1,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,1,0,0
88643,2,0,0,0,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,0,0,0
88645,2,0,0,1,0,0,0,0,0,0,...,-1,-1,-1,-1,-1,-1,0,1,0,1


In [None]:
df3 = df2.drop_duplicates()

In [None]:
df3.shape

(29316, 41)

In [None]:
phished = df3[df3['phishing'] == 1]
not_phished = df3[df3['phishing'] == 0]

In [None]:
phished.shape,not_phished.shape

((22477, 41), (6839, 41))

lets upsample everything

In [None]:
from sklearn.utils import resample

resample_phished = resample(phished,replace= True,n_samples=45000,random_state=42)
resample_not_phished = resample(not_phished,replace= True,n_samples=45000,random_state=42)

In [None]:
resample_phished.shape,resample_not_phished.shape

((45000, 41), (45000, 41))

In [None]:
df_final = pd.concat([resample_phished,resample_not_phished],axis=0)

In [None]:
df_final.shape

(90000, 41)

In [None]:
df_final = df_final.sample(df_final.shape[0])

In [None]:
X = df_final.drop('phishing',axis=1)
y = df_final['phishing']

In [None]:
X.shape ,y.shape

((90000, 40), (90000,))

In [None]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
x_train.shape,y_train.shape

((72000, 40), (72000,))

In [None]:
from sklearn.decomposition import PCA

pca = PCA(n_components=7)
x_pca_train = pca.fit_transform(x_train)
x_pca_test = pca.transform(x_test)

std = StandardScaler()
x_std_train = std.fit_transform(x_pca_train)
x_std_test = std.transform(x_pca_test)

In [None]:
!pip install --upgrade scikit-learn



In [None]:
from sklearn.cluster import KMeans
from sklearn.model_selection import cross_val_score
from sklearn.metrics import silhouette_score


# Create a K-Means instance
kmeans = KMeans(n_clusters=2, random_state=42)

# Define a custom scoring function (using silhouette score as an example)
def silhouette_scorer(estimator, x_std_train):
    labels = estimator.fit_predict(x_std_train)
    score = silhouette_score(x_std_train, labels)
    return score

# Use cross_val_score with the custom scorer
scores = cross_val_score(kmeans, x_std_train, cv=5, scoring=silhouette_scorer)

# Display the silhouette scores for each fold
print("Silhouette Scores:", scores)




Silhouette Scores: [0.683006   0.24858379 0.88707569 0.62039518 0.27480739]


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier

list_model = [
    LogisticRegression(), SVC(), DecisionTreeClassifier(), RandomForestClassifier(),
    GradientBoostingClassifier(), XGBClassifier(), GaussianNB(), LGBMClassifier(), KNeighborsClassifier()
]

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

details = {}
best_model = None
f_accuracy = 0

for model in list_model:
    cv_results = cross_val_score(model, x_std_train, y_train, cv=cv, scoring='accuracy')
    accuracy = np.mean(cv_results)
    print(f'{model.__class__.__name__} accuracy = {accuracy}')
    details[model.__class__.__name__] = {'accuracy': accuracy, 'cv_results': cv_results}

    if accuracy > f_accuracy:
        f_accuracy = accuracy
        best_model = model.__class__.__name__

print("Best Model:", best_model)
print("Details:", details)

LogisticRegression accuracy = 0.6927638888888888
SVC accuracy = 0.7612916666666667
DecisionTreeClassifier accuracy = 0.95
RandomForestClassifier accuracy = 0.9573472222222221
GradientBoostingClassifier accuracy = 0.7786666666666668
XGBClassifier accuracy = 0.8825833333333334
GaussianNB accuracy = 0.5902222222222223
[LightGBM] [Info] Number of positive: 28814, number of negative: 28786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003264 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1275
[LightGBM] [Info] Number of data points in the train set: 57600, number of used features: 5
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500243 -> initscore=0.000972
[LightGBM] [Info] Start training from score 0.000972
[LightGBM] [Info] Number of positive: 28814, number of negative: 28786
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002926 seconds.
You can set `forc

In [None]:
pd.DataFrame(details).T

Unnamed: 0,accuracy,cv_results
LogisticRegression,0.692764,"[0.6935416666666666, 0.6903472222222222, 0.689..."
SVC,0.761292,"[0.7602777777777778, 0.7609027777777778, 0.757..."
DecisionTreeClassifier,0.95,"[0.9499305555555555, 0.9474305555555556, 0.948..."
RandomForestClassifier,0.957347,"[0.9591666666666666, 0.9545138888888889, 0.954..."
GradientBoostingClassifier,0.778667,"[0.7783333333333333, 0.7788888888888889, 0.774..."
XGBClassifier,0.882583,"[0.8822916666666667, 0.8865972222222223, 0.877..."
GaussianNB,0.590222,"[0.591875, 0.58625, 0.5903472222222222, 0.5928..."
LGBMClassifier,0.833778,"[0.8320138888888889, 0.8327083333333334, 0.831..."
KNeighborsClassifier,0.885333,"[0.8815277777777778, 0.8822916666666667, 0.886..."


In [None]:
params_grid = [
    {
        'model': XGBClassifier(),
        'param_grid': {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7]
        }
    },
    {
        'model': KNeighborsClassifier(),
        'param_grid': {
            'n_neighbors': [3, 5, 7],
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        }
    },
    {
        'model': RandomForestClassifier(),
        'param_grid': {
            'n_estimators': [50, 100, 200],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    }
]
best_model = None
best_param = None
best_model_accuracy = 0
best_model_info = {}

for model_info in params_grid:
    model = model_info['model']
    param_grid = model_info['param_grid']

    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
    grid_search.fit(x_std_train, y_train)

    model_name = model.__class__.__name__

    print(f"Best parameters for {model_name}:")
    print(grid_search.best_params_)

    # Get the best accuracy, parameters, precision, recall, and F2 score for the current model
    best_accuracy = grid_search.best_score_
    best_parameters = grid_search.best_params_

    y_pred = grid_search.predict(x_std_test)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    # Store the results in the best_model_info dictionary
    best_model_info[model_name] = {
        'best_parameters': best_parameters,
        'best_accuracy': best_accuracy,
        'precision': precision,
        'recall': recall,
        'f2_score': f2_score
    }

    # Update the best_model, best_param, and best_model_accuracy if the current model performed better
    if best_accuracy > best_model_accuracy:
        best_model = model_name
        best_param = best_parameters
        best_model_accuracy = best_accuracy

print("\nBest Model:", best_model)
print("Best Parameters:", best_param)
print("Best Model Accuracy:", best_model_accuracy)

# Print the best parameters, accuracy, precision, recall, and F2 score for each model
print("\nBest Results for Each Model:")
for model_name, info in best_model_info.items():
    print(f"{model_name}:")
    print(f"  Best Parameters: {info['best_parameters']}")
    print(f"  Best Accuracy: {info['best_accuracy']:.4f}")
    print(f"  Precision: {info['precision']:.4f}")
    print(f"  Recall: {info['recall']:.4f}")
    print(f"  F2 Score: {info['f2_score']:.4f}")

Best parameters for XGBClassifier:
{'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 300}
Best parameters for KNeighborsClassifier:
{'n_neighbors': 5, 'p': 2, 'weights': 'distance'}


In [None]:
!pip install git+https://github.com/hyperopt/hyperopt-sklearn

Collecting git+https://github.com/hyperopt/hyperopt-sklearn
  Cloning https://github.com/hyperopt/hyperopt-sklearn to /tmp/pip-req-build-x11q0b45
  Running command git clone --filter=blob:none --quiet https://github.com/hyperopt/hyperopt-sklearn /tmp/pip-req-build-x11q0b45
  Resolved https://github.com/hyperopt/hyperopt-sklearn to commit 4bc286479677a0bfd2178dac4546ea268b3f3b77
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting numpy>=1.26.0 (from hpsklearn==1.0.3)
  Downloading numpy-1.26.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m42.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>=1.3.0 (from hpsklearn==1.0.3)
  Downloading scikit_learn-1.4.0-1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.1 MB)
[2K     [90m━━━

In [None]:
!pip install --upgrade hpsklearn



In [None]:
!pip show hpsklearn

Name: hpsklearn
Version: 1.0.3
Summary: Hyperparameter Optimization for sklearn
Home-page: http://hyperopt.github.com/hyperopt-sklearn/
Author: James Bergstra
Author-email: anon@anon.com
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: hyperopt, numpy, scikit-learn, scipy
Required-by: 


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from hyperopt import tpe
from hpsklearn import HyperoptEstimator, any_preprocessing, all_classifiers

# Define HyperoptEstimator with all classifiers
estim = HyperoptEstimator(
    classifier=all_classifiers('my_classifier'),
    preprocessing=any_preprocessing('my_pre'),
    algo=tpe.suggest,
    max_evals=100,
    trial_timeout=120,
)

# Fit the estimator
estim.fit(x_std_train, y_train)

# Get the best model
best_model = estim.best_model()

# Evaluate on the test set
y_pred = best_model.predict(x_std_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f2_score = fbeta_score(y_test, y_pred, beta=2)

# Print results
print(f"Best Model: {best_model}")
print(f"Test Set Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F2 Score: {f2_score}")


100%|██████████| 1/1 [00:00<00:00,  1.19trial/s, best loss: 0.3014583333333334]
100%|██████████| 2/2 [01:50<00:00, 110.43s/trial, best loss: 0.2789583333333333]
100%|██████████| 3/3 [00:00<00:00,  1.83trial/s, best loss: 0.2789583333333333]
100%|██████████| 4/4 [00:05<00:00,  5.32s/trial, best loss: 0.24965277777777772]
100%|██████████| 5/5 [00:31<00:00, 31.52s/trial, best loss: 0.24965277777777772]
100%|██████████| 6/6 [00:01<00:00,  1.95s/trial, best loss: 0.24965277777777772]
100%|██████████| 7/7 [00:07<00:00,  7.52s/trial, best loss: 0.24965277777777772]
100%|██████████| 8/8 [00:00<00:00,  2.87trial/s, best loss: 0.24965277777777772]
100%|██████████| 9/9 [00:00<00:00,  5.22trial/s, best loss: 0.24965277777777772]
100%|██████████| 10/10 [00:00<00:00,  2.98trial/s, best loss: 0.24965277777777772]
100%|██████████| 11/11 [01:22<00:00, 82.08s/trial, best loss: 0.24965277777777772]
100%|██████████| 12/12 [00:00<00:00,  2.32trial/s, best loss: 0.24965277777777772]
100%|██████████| 13/13 [




100%|██████████| 14/14 [00:01<00:00,  1.18s/trial, best loss: 0.14500000000000002]
 93%|█████████▎| 14/15 [00:00<?, ?trial/s, best loss=?]

ERROR:hyperopt.fmin:job exception: Negative values in data passed to CategoricalNB (input X)


 93%|█████████▎| 14/15 [00:00<?, ?trial/s, best loss=?]


ValueError: Negative values in data passed to CategoricalNB (input X)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from hyperopt import tpe
from hpsklearn import HyperoptEstimator,any_preprocessing,random_forest_classifier,extra_tree_classifier,bagging_classifier,ada_boost_classifier,gradient_boosting_classifier,hist_gradient_boosting_classifier,ridge_classifier_cv,perceptron,decision_tree_classifier,k_neighbors_classifier,xgboost_classification

# Define classifiers
hypertopt_classification_estimators = [
    random_forest_classifier('my_rf'),
    extra_tree_classifier('my_et'),
    bagging_classifier('my_bag'),
    ada_boost_classifier('my_ada'),
    gradient_boosting_classifier('my_gb'),
    hist_gradient_boosting_classifier('my_hgb'),
    ridge_classifier_cv('my_ridge'),
    perceptron('my_perceptron'),
    decision_tree_classifier('my_dt'),
    k_neighbors_classifier('my_knn'),
    xgboost_classification('my_xgb'),
]

# Dictionary to store results
results = {}

# Loop through each classifier
for classifier in hypertopt_classification_estimators:
    print(f"\nClassifier: {classifier}")

    # Create HyperoptEstimator
    estim = HyperoptEstimator(
        classifier=classifier,
        preprocessing=any_preprocessing('my_pre'),
        algo=tpe.suggest,
        max_evals=100,
        trial_timeout=120,
    )

    # Fit the estimator
    estim.fit(x_std_train, y_train)

    # Get the best parameters and score
    best_params = estim._best_learner
    best_score = estim._best_loss

    # Evaluate on the test set
    y_pred = estim.predict(x_std_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    # Store results in the dictionary
    results[classifier] = {
        'best_params': best_params,
        'best_score': best_score,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f2_score': f2_score,
    }

    # Print results
    print(f"Best Parameters: {best_params}")
    print(f"Best Score: {best_score}")
    print(f"Test Set Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F2 Score: {f2_score}")

# Find the best model based on test accuracy
best_model = max(results, key=lambda x: results[x]['accuracy'])

# Print overall best model
print(f"\nBest Model Overall: {best_model}")



Classifier: 0 sklearn_RandomForestClassifier
1  bootstrap =
2   switch
3     hyperopt_param
4       Literal{my_rf.rfc_bootstrap}
5       randint
6         Literal{2}
7     Literal{True}
8     Literal{False}
9  ccp_alpha =
10   Literal{0.0}
11  class_weight =
12   switch
13     hyperopt_param
14       Literal{my_rf.rfc_class_weight}
15       randint
16         Literal{3}
17     Literal{balanced}
18     Literal{balanced_subsample}
19     Literal{None}
20  criterion =
21   switch
22     hyperopt_param
23       Literal{my_rf.rfc_criterion}
24       randint
25         Literal{2}
26     Literal{gini}
27     Literal{entropy}
28  max_depth =
29   switch
30     hyperopt_param
31       Literal{my_rf.rfc_max_depth}
32       categorical
33         pos_args
34           Literal{0.7}
35           Literal{0.1}
36           Literal{0.1}
37           Literal{0.1}
38     Literal{None}
39     Literal{2}
40     Literal{3}
41     Literal{4}
42  max_features =
43   switch
44     hyperopt_param
45       Lit




100%|██████████| 2/2 [00:02<00:00,  2.64s/trial, best loss: 0.29923611111111115]
 67%|██████▋   | 2/3 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 3/3 [00:02<00:00,  2.51s/trial, best loss: 0.29798611111111106]
100%|██████████| 4/4 [00:00<00:00,  1.50trial/s, best loss: 0.29798611111111106]
100%|██████████| 5/5 [00:01<00:00,  1.39s/trial, best loss: 0.29798611111111106]
 83%|████████▎ | 5/6 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 6/6 [00:09<00:00,  9.70s/trial, best loss: 0.29798611111111106]
100%|██████████| 7/7 [00:08<00:00,  8.06s/trial, best loss: 0.2885416666666667]
 88%|████████▊ | 7/8 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 8/8 [00:03<00:00,  3.45s/trial, best loss: 0.26555555555555554]
100%|██████████| 9/9 [00:06<00:00,  6.63s/trial, best loss: 0.26555555555555554]
100%|██████████| 10/10 [00:06<00:00,  6.87s/trial, best loss: 0.26555555555555554]
100%|██████████| 11/11 [00:00<00:00,  1.23trial/s, best loss: 0.26555555555555554]
 92%|█████████▏| 11/12 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 12/12 [00:12<00:00, 12.87s/trial, best loss: 0.26555555555555554]
 92%|█████████▏| 12/13 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 13/13 [00:03<00:00,  3.27s/trial, best loss: 0.26555555555555554]
 93%|█████████▎| 13/14 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 14/14 [00:01<00:00,  1.60s/trial, best loss: 0.26555555555555554]
 93%|█████████▎| 14/15 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 15/15 [00:04<00:00,  4.72s/trial, best loss: 0.26555555555555554]
100%|██████████| 16/16 [00:03<00:00,  3.84s/trial, best loss: 0.26555555555555554]
 94%|█████████▍| 16/17 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 17/17 [00:03<00:00,  3.49s/trial, best loss: 0.26555555555555554]
 94%|█████████▍| 17/18 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 18/18 [00:03<00:00,  3.75s/trial, best loss: 0.26555555555555554]
 95%|█████████▍| 18/19 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 19/19 [00:00<00:00,  1.33trial/s, best loss: 0.26555555555555554]
 95%|█████████▌| 19/20 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 20/20 [00:15<00:00, 15.42s/trial, best loss: 0.26555555555555554]
 95%|█████████▌| 20/21 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 21/21 [00:01<00:00,  1.44s/trial, best loss: 0.26555555555555554]
 95%|█████████▌| 21/22 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 22/22 [00:01<00:00,  1.40s/trial, best loss: 0.26555555555555554]
 96%|█████████▌| 22/23 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 23/23 [00:01<00:00,  1.19s/trial, best loss: 0.26555555555555554]
 96%|█████████▌| 23/24 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 24/24 [00:01<00:00,  1.04s/trial, best loss: 0.26555555555555554]
 96%|█████████▌| 24/25 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 25/25 [00:33<00:00, 33.99s/trial, best loss: 0.24236111111111114]
 96%|█████████▌| 25/26 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 26/26 [00:44<00:00, 44.76s/trial, best loss: 0.24236111111111114]
 96%|█████████▋| 26/27 [00:00<?, ?trial/s, best loss=?]


  sample_weight *= np.exp(

  return fit_method(estimator, *args, **kwargs)



100%|██████████| 27/27 [00:00<00:00,  3.48trial/s, best loss: 0.24236111111111114]
 96%|█████████▋| 27/28 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 28/28 [00:35<00:00, 35.43s/trial, best loss: 0.24236111111111114]
 97%|█████████▋| 28/29 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 29/29 [00:23<00:00, 23.64s/trial, best loss: 0.24236111111111114]
 97%|█████████▋| 29/30 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 30/30 [00:05<00:00,  5.71s/trial, best loss: 0.24236111111111114]
 97%|█████████▋| 30/31 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 31/31 [00:26<00:00, 26.63s/trial, best loss: 0.24236111111111114]
 97%|█████████▋| 31/32 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 32/32 [00:05<00:00,  5.96s/trial, best loss: 0.24236111111111114]
 97%|█████████▋| 32/33 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 33/33 [00:45<00:00, 45.46s/trial, best loss: 0.2188888888888889]
100%|██████████| 34/34 [00:39<00:00, 39.89s/trial, best loss: 0.2188888888888889]
 97%|█████████▋| 34/35 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 35/35 [00:28<00:00, 28.98s/trial, best loss: 0.2188888888888889]
 97%|█████████▋| 35/36 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 36/36 [00:24<00:00, 24.28s/trial, best loss: 0.2188888888888889]
100%|██████████| 37/37 [00:20<00:00, 20.23s/trial, best loss: 0.2188888888888889]
 97%|█████████▋| 37/38 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 38/38 [00:02<00:00,  2.10s/trial, best loss: 0.2188888888888889]
100%|██████████| 39/39 [00:11<00:00, 11.20s/trial, best loss: 0.2188888888888889]
 98%|█████████▊| 39/40 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 40/40 [00:00<00:00,  1.75trial/s, best loss: 0.2188888888888889]
 98%|█████████▊| 40/41 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 41/41 [00:22<00:00, 22.37s/trial, best loss: 0.2188888888888889]
100%|██████████| 42/42 [00:42<00:00, 42.98s/trial, best loss: 0.2188888888888889]
 98%|█████████▊| 42/43 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 43/43 [00:09<00:00,  9.98s/trial, best loss: 0.2188888888888889]
100%|██████████| 44/44 [00:16<00:00, 16.98s/trial, best loss: 0.2188888888888889]
 98%|█████████▊| 44/45 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 45/45 [00:36<00:00, 36.09s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 45/46 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 46/46 [00:34<00:00, 34.03s/trial, best loss: 0.21625000000000005]
100%|██████████| 47/47 [00:06<00:00,  6.77s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 47/48 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 48/48 [00:14<00:00, 14.41s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 48/49 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 49/49 [00:08<00:00,  8.59s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 49/50 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 50/50 [00:02<00:00,  2.12s/trial, best loss: 0.21625000000000005]
100%|██████████| 51/51 [00:18<00:00, 18.03s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 51/52 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 52/52 [00:12<00:00, 12.05s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 52/53 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 53/53 [00:27<00:00, 27.09s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 53/54 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 54/54 [00:38<00:00, 38.56s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 54/55 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 55/55 [00:05<00:00,  5.82s/trial, best loss: 0.21625000000000005]
100%|██████████| 56/56 [00:09<00:00,  9.42s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 56/57 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 57/57 [00:26<00:00, 26.77s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 57/58 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 58/58 [00:03<00:00,  3.80s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 58/59 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 59/59 [00:14<00:00, 14.20s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 59/60 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 60/60 [00:01<00:00,  1.85s/trial, best loss: 0.21625000000000005]
100%|██████████| 61/61 [00:02<00:00,  2.78s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 61/62 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 62/62 [00:42<00:00, 42.02s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 62/63 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 63/63 [00:34<00:00, 34.69s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 63/64 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 64/64 [00:08<00:00,  8.54s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 64/65 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 65/65 [00:16<00:00, 16.48s/trial, best loss: 0.21625000000000005]
 98%|█████████▊| 65/66 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 66/66 [00:23<00:00, 23.06s/trial, best loss: 0.21625000000000005]
 99%|█████████▊| 66/67 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 67/67 [00:26<00:00, 26.22s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 67/68 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 68/68 [00:30<00:00, 30.79s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 68/69 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 69/69 [00:39<00:00, 39.19s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 69/70 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 70/70 [00:15<00:00, 15.86s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 70/71 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 71/71 [00:36<00:00, 36.89s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 71/72 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 72/72 [00:06<00:00,  6.99s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 72/73 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 73/73 [00:25<00:00, 25.47s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 73/74 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 74/74 [00:31<00:00, 31.21s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 74/75 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 75/75 [00:13<00:00, 13.72s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 75/76 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 76/76 [00:32<00:00, 32.97s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 76/77 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 77/77 [00:17<00:00, 17.07s/trial, best loss: 0.2146527777777778]
100%|██████████| 78/78 [00:08<00:00,  8.94s/trial, best loss: 0.2146527777777778]
 99%|█████████▊| 78/79 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 79/79 [00:21<00:00, 21.19s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 79/80 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 80/80 [00:03<00:00,  3.33s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 80/81 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 81/81 [00:04<00:00,  4.77s/trial, best loss: 0.2146527777777778]
100%|██████████| 82/82 [00:22<00:00, 22.51s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 82/83 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 83/83 [00:06<00:00,  6.62s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 83/84 [00:00<?, ?trial/s, best loss=?]


  sample_weight *= np.exp(

  return fit_method(estimator, *args, **kwargs)



100%|██████████| 84/84 [00:00<00:00,  4.40trial/s, best loss: 0.2146527777777778]
 99%|█████████▉| 84/85 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 85/85 [00:25<00:00, 25.34s/trial, best loss: 0.2146527777777778]
100%|██████████| 86/86 [00:06<00:00,  6.65s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 86/87 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 87/87 [00:11<00:00, 11.25s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 87/88 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 88/88 [00:42<00:00, 42.54s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 88/89 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 89/89 [00:01<00:00,  1.01s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 89/90 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 90/90 [00:16<00:00, 16.26s/trial, best loss: 0.2146527777777778]
100%|██████████| 91/91 [00:12<00:00, 12.30s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 91/92 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 92/92 [00:19<00:00, 19.99s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 92/93 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 93/93 [00:34<00:00, 34.10s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 93/94 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 94/94 [00:09<00:00,  9.24s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 94/95 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 95/95 [00:04<00:00,  4.15s/trial, best loss: 0.2146527777777778]
100%|██████████| 96/96 [00:15<00:00, 15.76s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 96/97 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 97/97 [00:12<00:00, 12.62s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 97/98 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 98/98 [00:28<00:00, 28.40s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 98/99 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 99/99 [00:21<00:00, 21.85s/trial, best loss: 0.2146527777777778]
 99%|█████████▉| 99/100 [00:00<?, ?trial/s, best loss=?]




100%|██████████| 100/100 [00:08<00:00,  8.60s/trial, best loss: 0.2146527777777778]




Best Parameters: AdaBoostClassifier(learning_rate=1.8070483079192097, n_estimators=587,
                   random_state=1)
Best Score: 0.2146527777777778
Test Set Accuracy: 0.7858333333333334
Precision: 0.7855565371024735
Recall: 0.7880802038329456
F2 Score: 0.7875741741209813

Classifier: 0 sklearn_GradientBoostingClassifier
1  ccp_alpha =
2   Literal{0.0}
3  criterion =
4   switch
5     hyperopt_param
6       Literal{my_gb.gbc_criterion}
7       randint
8         Literal{2}
9     Literal{friedman_mse}
10     Literal{squared_error}
11  init =
12   Literal{None}
13  learning_rate =
14   float
15     hyperopt_param
16       Literal{my_gb.gbc_learning_rate}
17       lognormal
18         Literal{-4.605170185988091}
19         Literal{2.3025850929940455}
20  loss =
21   switch
22     hyperopt_param
23       Literal{my_gb.gbc_loss}
24       randint
25         Literal{2}
26     Literal{log_loss}
27     Literal{exponential}
28  max_depth =
29   switch
30     hyperopt_param
31       Literal{my

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, fbeta_score
from hyperopt import tpe
from hpsklearn import HyperoptEstimator,any_preprocessing,random_forest_classifier,extra_tree_classifier,bagging_classifier,ada_boost_classifier,gradient_boosting_classifier,hist_gradient_boosting_classifier,ridge_classifier_cv,perceptron,decision_tree_classifier,k_neighbors_classifier,xgboost_classification

# Define classifiers
hypertopt_classification_estimators = [
    #random_forest_classifier('my_rf'),
    #extra_tree_classifier('my_et'),
    #bagging_classifier('my_bag'),
    #ada_boost_classifier('my_ada'),
    #gradient_boosting_classifier('my_gb'),
    #hist_gradient_boosting_classifier('my_hgb'),
    #ridge_classifier_cv('my_ridge'),
    perceptron('my_perceptron'),
    #decision_tree_classifier('my_dt'),
    k_neighbors_classifier('my_knn'),
    xgboost_classification('my_xgb'),
]

# Dictionary to store results
results = {}

# Loop through each classifier
for classifier in hypertopt_classification_estimators:
    print(f"\nClassifier: {classifier}")

    # Create HyperoptEstimator
    estim = HyperoptEstimator(
        classifier=classifier,
        preprocessing=any_preprocessing('my_pre'),
        algo=tpe.suggest,
        max_evals=100,
        trial_timeout=120,
    )

    # Fit the estimator
    estim.fit(x_std_train, y_train)

    # Get the best parameters and score
    best_params = estim._best_learner
    best_score = estim._best_loss

    # Evaluate on the test set
    y_pred = estim.predict(x_std_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f2_score = fbeta_score(y_test, y_pred, beta=2)

    # Store results in the dictionary
    results[classifier] = {
        'best_params': best_params,
        'best_score': best_score,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f2_score': f2_score,
    }

    # Print results
    print(f"Best Parameters: {best_params}")
    print(f"Best Score: {best_score}")
    print(f"Test Set Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F2 Score: {f2_score}")

# Find the best model based on test accuracy
best_model = max(results, key=lambda x: results[x]['accuracy'])

# Print overall best model
print(f"\nBest Model Overall: {best_model}")



Classifier: 0 sklearn_Perceptron
1  alpha =
2   float
3     hyperopt_param
4       Literal{my_perceptron.perceptron_alpha}
5       loguniform
6         Literal{-13.815510557964274}
7         Literal{-2.3025850929940455}
8  class_weight =
9   Literal{None}
10  early_stopping =
11   Literal{False}
12  eta0 =
13   float
14     hyperopt_param
15       Literal{my_perceptron.perceptron_eta0}
16       normal
17        mu =
18         Literal{1.0}
19        sigma =
20         Literal{0.1}
21  fit_intercept =
22   Literal{True}
23  l1_ratio =
24   float
25     hyperopt_param
26       Literal{my_perceptron.perceptron_l1_ratio}
27       loguniform
28         Literal{-16.11809565095832}
29         Literal{0.0}
30  max_iter =
31   int
32     float
33       hyperopt_param
34         Literal{my_perceptron.perceptron_max_iter}
35         uniform
36           Literal{750}
37           Literal{1250}
38  n_iter_no_change =
39   Literal{5}
40  n_jobs =
41   Literal{1}
42  penalty =
43   switch
44     hyp

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(bootstrap=False, class_weight='balanced',
                       max_features=0.2013603334303137, n_estimators=473,
                       n_jobs=1, random_state=4, verbose=False)

rf_model.fit(x_std_train, y_train)

#Train details
y_pred = rf_model.predict(x_std_train)
accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
f2_scoree = fbeta_score(y_train,y_pred,beta=2)

print(f'Train data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

#Test details
y_pred = rf_model.predict(x_std_test)
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f2_scoree = fbeta_score(y_test,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')


Train data Details : accuracy = 0.9767638888888889 , precision = 0.9921338960181437 , recall = 0.9610934979698538 , f2_scoree = 0.9671452159068649
Test data Details : accuracy = 0.9636666666666667 , precision = 0.9809633027522936 , recall = 0.9460296394602964 , f2_scoree = 0.952815897344502


In [None]:
df_small = pd.read_csv('/content/dataset_small.csv')
df_small = df_small.drop(skipped_features,axis=1)
df_small = df_small.drop(columns=drop_columns)
x_t = df_small.drop('phishing',axis=1)
y_t = df_small['phishing']

pca_xt = pca.transform(x_t)
std_xt = std.transform(pca_xt)

#Validate details
y_pred = rf_model.predict(std_xt)
accuracy = accuracy_score(y_t,y_pred)
precision = precision_score(y_t,y_pred)
recall = recall_score(y_t,y_pred)
f2_scoree = fbeta_score(y_t,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

Test data Details : accuracy = 0.918663142637906 , precision = 0.9388163472952349 , recall = 0.9032205436094887 , f2_scoree = 0.9101221124065416


In [None]:
from xgboost import XGBClassifier

xgb_model = XGBClassifier(base_score=0.5, booster=None, callbacks=None,
              colsample_bylevel=0.7411992060554791, colsample_bynode=None,
              colsample_bytree=0.6644703736569552, device=None,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0.0295687727673445,
              grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.08628276003151057,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=0, max_depth=9, max_leaves=None,
              min_child_weight=1, monotone_constraints=None,
              multi_strategy=None, n_estimators=2800, n_jobs=1,
              num_parallel_tree=None, random_state=None)

xgb_model.fit(x_std_train, y_train)

#Train details
y_pred = xgb_model.predict(x_std_train)
accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
f2_scoree = fbeta_score(y_train,y_pred,beta=2)

print(f'Train data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

#Test details
y_pred = xgb_model.predict(x_std_test)
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f2_scoree = fbeta_score(y_test,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

df_small = pd.read_csv('/content/dataset_small.csv')
df_small = df_small.drop(skipped_features,axis=1)
df_small = df_small.drop(columns=drop_columns)
x_t = df_small.drop('phishing',axis=1)
y_t = df_small['phishing']

pca_xt = pca.transform(x_t)
std_xt = std.transform(pca_xt)

#Validate details
y_pred = rf_model.predict(std_xt)
accuracy = accuracy_score(y_t,y_pred)
precision = precision_score(y_t,y_pred)
recall = recall_score(y_t,y_pred)
f2_scoree = fbeta_score(y_t,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')


Train data Details : accuracy = 0.9753194444444444 , precision = 0.9965137561372418 , recall = 0.9539184604260527 , f2_scoree = 0.9621436946362752
Test data Details : accuracy = 0.9625555555555556 , precision = 0.9905041031652989 , recall = 0.9344171643441717 , f2_scoree = 0.9451205870508748
Test data Details : accuracy = 0.918663142637906 , precision = 0.9388163472952349 , recall = 0.9032205436094887 , f2_scoree = 0.9101221124065416


In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier(algorithm='kd_tree', leaf_size=22, n_jobs=1, n_neighbors=2,
                     p=2.372428226692859, weights='distance')

knn_model.fit(x_std_train, y_train)

#Train details
y_pred = knn_model.predict(x_std_train)
accuracy = accuracy_score(y_train,y_pred)
precision = precision_score(y_train,y_pred)
recall = recall_score(y_train,y_pred)
f2_scoree = fbeta_score(y_train,y_pred,beta=2)

print(f'Train data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

#Test details
y_pred = knn_model.predict(x_std_test)
accuracy = accuracy_score(y_test,y_pred)
precision = precision_score(y_test,y_pred)
recall = recall_score(y_test,y_pred)
f2_scoree = fbeta_score(y_test,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')

df_small = pd.read_csv('/content/dataset_small.csv')
df_small = df_small.drop(skipped_features,axis=1)
df_small = df_small.drop(columns=drop_columns)
x_t = df_small.drop('phishing',axis=1)
y_t = df_small['phishing']

pca_xt = pca.transform(x_t)
std_xt = std.transform(pca_xt)

#Validate details
y_pred = knn_model.predict(std_xt)
accuracy = accuracy_score(y_t,y_pred)
precision = precision_score(y_t,y_pred)
recall = recall_score(y_t,y_pred)
f2_scoree = fbeta_score(y_t,y_pred,beta=2)
print(f'Test data Details : accuracy = {accuracy} , precision = {precision} , recall = {recall} , f2_scoree = {f2_scoree}')


Train data Details : accuracy = 0.9753194444444444 , precision = 0.9942164772562968 , recall = 0.9561432782690917 , f2_scoree = 0.9635228374613958
Test data Details : accuracy = 0.9592222222222222 , precision = 0.9882463563704749 , recall = 0.9298827692988277 , f2_scoree = 0.9409974035276211
Test data Details : accuracy = 0.9308892488703214 , precision = 0.981391644341467 , recall = 0.8845237706790224 , f2_scoree = 0.902336728579988


In [30]:
import numpy as np
from urllib.parse import urlparse, parse_qs
import joblib
import socket

def is_ip_address(domain):
    try:
        socket.inet_aton(domain)
        return True
    except socket.error:
        return False

def extract_additional_url_features(url):
    parsed_url = urlparse(url)

    return {
        'qty_dot_url': url.count('.'),
        'qty_hyphen_url': url.count('-'),
        'qty_underline_url': url.count('_'),
        'qty_slash_url': url.count('/'),
        'qty_questionmark_url': url.count('?'),
        'qty_equal_url': url.count('='),
        'qty_at_url': url.count('@'),
        'qty_exclamation_url': url.count('!'),
        'qty_space_url': url.count(' '),
        'qty_tilde_url': url.count('~'),
        'qty_comma_url': url.count(','),
        'qty_plus_url': url.count('+'),
        'qty_asterisk_url': url.count('*'),
        'qty_hashtag_url': url.count('#'),
        'qty_dollar_url': url.count('$'),
        'qty_percent_url': url.count('%'),
        'qty_tld_url': len(parsed_url.netloc.split('.')[-1]),
        'length_url': len(url)
    }

def extract_additional_domain_features(url):
    # Parse the URL to get the domain
    domain = urlparse(url).netloc

    if not domain:
        return {
            'qty_dot_domain': -1,
            'qty_hyphen_domain': -1,
            'qty_underline_domain': -1,
            'qty_at_domain': -1,
            'qty_vowels_domain': -1,
            'domain_in_ip': -1,
            'server_client_domain': -1
        }

    return {
        'qty_dot_domain': domain.count('.'),
        'qty_hyphen_domain': domain.count('-'),
        'qty_underline_domain': domain.count('_'),
        'qty_at_domain': domain.count('@'),
        'qty_vowels_domain': sum(1 for char in domain if char.lower() in "aeiou"),
        'domain_in_ip': 1 if is_ip_address(domain) else 0,
        'server_client_domain': 1 if domain.startswith("www.") else 0
    }

def extract_additional_path_features(url):
    # Parse the URL to get the path
    path = urlparse(url).path

    if not path:
        return {
            'qty_dot_directory': -1,
            'qty_hyphen_directory': -1,
            'qty_underline_directory': -1,
            'qty_percent_directory': -1,
            'directory_length': -1
        }

    return {
        'qty_dot_directory': path.count('.'),
        'qty_hyphen_directory': path.count('-'),
        'qty_underline_directory': path.count('_'),
        'qty_percent_directory': path.count('%'),
        'directory_length': len(path)
    }

def extract_file_features(url):
    # Parse the URL to get the path
    path = urlparse(url).path

    if not path:
        return {
            'file_length': -1
        }

    # Extract the file name from the path
    file_name = path.split('/')[-1]

    # Attribute: Length of the file name
    file_length = len(file_name)

    return {
        'file_length': file_length
    }

def extract_additional_params_features(url):
    # Parse the URL to get the query parameters
    query_params = urlparse(url).query

    if not query_params:
        return {
            'qty_dot_params': -1,
            'qty_hyphen_params': -1,
            'qty_underline_params': -1,
            'qty_slash_params': -1,
            'qty_questionmark_params': -1,
            'qty_percent_params': -1
        }

    # Extract parameter names from the query string
    param_names = parse_qs(query_params).keys()

    return {
        'qty_dot_params': sum(param.count('.') for param in param_names),
        'qty_hyphen_params': sum(param.count('-') for param in param_names),
        'qty_underline_params': sum(param.count('_') for param in param_names),
        'qty_slash_params': sum(param.count('/') for param in param_names),
        'qty_questionmark_params': sum(param.count('?') for param in param_names),
        'qty_percent_params': sum(param.count('%') for param in param_names)
    }

def email_urlshorten(url):
    # Parse the URL
    parsed_url = urlparse(url)

    # Extract the domain from the URL
    domain = parsed_url.netloc

    if not domain:
        return {
            'email_in_url': -1,
            'tls_ssl_certificate' : -1,
            'url_shortened': -1
        }

    return {
        'email_in_url': 1 if '@' in url else 0,
        'tls_ssl_certificate' : 1 if url.startswith("https://") else 0,
        'url_shortened': 1 if domain in ['bit.ly', 'goo.gl', 'tinyurl.com', 'ow.ly'] else 0
    }

def extract_all_features(url):
    # Extract URL-based features
    url_features = extract_additional_url_features(url)

    # Extract Domain-based features
    domain_features = extract_additional_domain_features(url)

    # Extract Page-based features
    path_features = extract_additional_path_features(url)

    # Extract File-based feature
    file_feature = extract_file_features(url)

    # Extract Params-based features
    params_features = extract_additional_params_features(url)

    # Extract Additional Features
    additional_features = email_urlshorten(url)

    # Combine all features
    all_features = {**url_features, **domain_features, **path_features, **file_feature, **params_features, **additional_features}

    return all_features


In [32]:
url = "https://tinyurl.com/SHEIN-420"
# https://google.com
# http://app.validchk.com/visitqr.aspx?vid=1073653
# https://platform.openai.com/docs/overview
# https://bard.google.com/chat/504c612c047ad681
# https://chat.openai.com/c/e6ed6f21-d91a-45e8-b1ab-526589713026
extracted_features = extract_all_features(url)

# Extract features and reshape into a 2D array
data = np.array(list(extracted_features.values())).reshape(1, -1)

# Assuming you have a PCA object
pca_transformed_data = pca.transform(data)

# Assuming you have a scaler object
scaled_data = std.transform(pca_transformed_data)


# Use the trained XGBBoost for prediction
#prediction = rf_model.predict(scaled_data)
pd2 = knn_model.predict(scaled_data)
pd3 = xgb_model.predict(scaled_data)
print(prediction,pd2,pd3)



NameError: name 'knn_model' is not defined

In [None]:
from hpsklearn import HyperoptEstimator,any_preprocessing,random_forest_classifier,extra_tree_classifier,bagging_classifier,ada_boost_classifier,gradient_boosting_classifier,hist_gradient_boosting_classifier,ridge_classifier_cv,perceptron,decision_tree_classifier,k_neighbors_classifier,xgboost_classification
from hyperopt import tpe

estim = HyperoptEstimator(
            classifier=k_neighbors_classifier('my_knn'),
            preprocessing=any_preprocessing('my_pre'),
            algo=tpe.suggest,
            max_evals=100,
            trial_timeout=120,
        )

        # Fit the estimator
estim.fit(x_std_train, y_train)

# Get the best parameters and score
best_params = estim._best_learner
best_score = estim._best_loss

# Evaluate on the test set
y_pred = estim.predict(x_std_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f2_score = fbeta_score(y_test, y_pred, beta=2)

100%|██████████| 1/1 [00:14<00:00, 14.61s/trial, best loss: 0.050694444444444486]
100%|██████████| 2/2 [01:21<00:00, 81.81s/trial, best loss: 0.050694444444444486]
100%|██████████| 3/3 [00:12<00:00, 12.46s/trial, best loss: 0.050694444444444486]
100%|██████████| 4/4 [00:07<00:00,  7.40s/trial, best loss: 0.050694444444444486]
100%|██████████| 5/5 [00:00<00:00,  1.81trial/s, best loss: 0.050694444444444486]
100%|██████████| 6/6 [00:06<00:00,  6.33s/trial, best loss: 0.050694444444444486]
100%|██████████| 7/7 [00:13<00:00, 13.40s/trial, best loss: 0.050694444444444486]
100%|██████████| 8/8 [00:00<00:00,  1.73trial/s, best loss: 0.0500694444444445]
100%|██████████| 9/9 [00:01<00:00,  1.10s/trial, best loss: 0.0500694444444445]
100%|██████████| 10/10 [00:00<00:00,  2.21trial/s, best loss: 0.04861111111111116]
100%|██████████| 11/11 [00:01<00:00,  1.56s/trial, best loss: 0.04861111111111116]
100%|██████████| 12/12 [00:04<00:00,  4.75s/trial, best loss: 0.04861111111111116]
100%|██████████| 

In [None]:
best_params

In [28]:
print(best_params)

KNeighborsClassifier(algorithm='kd_tree', leaf_size=25, metric='cityblock',
                     n_jobs=1, n_neighbors=2, p=2.2992555541721913,
                     weights='distance')


TypeError: 'KNeighborsClassifier' object is not subscriptable