In [None]:

import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split

from tensorflow.keras.models import Model

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

%matplotlib inline

import pandas as pd


In [None]:
# Feature columns fed to the models. The order is significant:
# grid_hyperparameter_tuning slices prefixes of this list
# (columns_of_interest[:n]), so reordering entries changes which features a
# given run sees.
# NOTE(review): presumably ranked by feature importance — confirm with the
# notebook that produced this ranking.
columns_of_interest = [
    "qty_slash_url",
    "time_domain_activation",
    "length_url",
    "qty_mx_servers",
    "qty_dot_directory",
    "qty_dot_domain",
    "url_shortened",
    "directory_length",
    "file_length",
    "tls_ssl_certificate",
    "qty_nameservers",
    "qty_at_params",
    "qty_ip_resolved",
    "tld_present_params",
    "qty_hyphen_domain",
    "qty_at_url",
    "qty_vowels_domain",
    "qty_hyphen_url",
    "time_domain_expiration",
    "domain_spf",
]

# Label of the model family; not read by any other cell visible in this notebook.
best_model = 'feed_forward'

In [None]:
def calculate_accur(predictions, actual_results):
    """Return the percentage (0-100) of predictions that equal the actual results.

    Parameters
    ----------
    predictions : sized iterable
        Predicted labels. Elements may be scalars or length-1 arrays (e.g. the
        rows of an ``(n, 1)`` NumPy array); each element is compared to its
        counterpart with ``==``.
    actual_results : iterable
        Ground-truth labels, paired positionally with ``predictions``.

    Returns
    -------
    float
        ``100 * matches / len(predictions)``. Returns ``0.0`` when
        ``predictions`` is empty instead of raising ``ZeroDivisionError``.

    Notes
    -----
    Pairs are formed with ``zip``, so if ``actual_results`` is shorter than
    ``predictions`` the unpaired predictions simply count as incorrect.
    """
    total = len(predictions)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0

    # Count with a filter rather than summing booleans so that length-1 array
    # elements (whose `==` yields an array) are still handled via truthiness.
    correct_predictions = sum(
        1 for pred, actual in zip(predictions, actual_results) if pred == actual
    )
    return (correct_predictions / total) * 100


In [2]:
from itertools import product

import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras import backend as keras_backend
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import SGD, Adam
from keras.regularizers import l1, l2

def create_feedforward_neural_network(input_dim, num_hidden_layers, num_neurons, activation, dropout_rate, l1_reg, l2_reg, learning_rate):
    """Build and compile a feed-forward binary classifier.

    Architecture: one input Dense layer (L1 kernel regularisation), followed by
    ``num_hidden_layers`` blocks of Dense (L2 kernel regularisation) + Dropout,
    and a single sigmoid output unit.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    num_hidden_layers : int
        Number of Dense+Dropout blocks added after the input layer.
    num_neurons : int
        Units in the input layer and in every hidden layer.
    activation : str
        Activation used by the input and hidden layers.
    dropout_rate : float
        Dropout rate applied after each hidden layer.
    l1_reg : float
        L1 factor for the input layer's kernel.
    l2_reg : float
        L2 factor for each hidden layer's kernel.
    learning_rate : float
        Learning rate for the Adam optimizer.

    Returns
    -------
    keras.models.Sequential
        Model compiled with binary cross-entropy loss and the accuracy metric.
    """
    # 'glorot_uniform' is passed as a string identifier: the GlorotUniform
    # class is not imported in this notebook, so referencing it raised
    # NameError. The string matches what train_and_test_model uses.
    model = Sequential()
    model.add(Dense(num_neurons, input_dim=input_dim, activation=activation, kernel_regularizer=l1(l1_reg), kernel_initializer='glorot_uniform'))
    for _ in range(num_hidden_layers):
        model.add(Dense(num_neurons, activation=activation, kernel_regularizer=l2(l2_reg), kernel_initializer='glorot_uniform'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    # Define optimizer with custom learning rate
    optimizer = Adam(learning_rate=learning_rate)

    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

def preprocess_data(csv_path, columns_of_interest, test_size):
    """Load a dataset, impute and scale the selected features, and split it.

    Only the columns in ``columns_of_interest`` are processed; other feature
    columns in the CSV are ignored rather than being imputed and scaled and
    then thrown away. The ``'phishing'`` label column is returned as-is.

    Steps, per selected column:
      1. Values equal to -1 are replaced by the integer-truncated mean of the
         column's other values.
      2. The column is min-max scaled to [0, 1].

    Parameters
    ----------
    csv_path : str
        Path of the CSV file; must contain every selected column plus a
        ``'phishing'`` column.
    columns_of_interest : sequence of str
        Feature columns to keep, in the order they should appear in the output.
    test_size : float or int
        Passed through to ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as NumPy arrays, split with
        ``random_state=42``.

    Raises
    ------
    KeyError
        If a selected column or the ``'phishing'`` column is missing.

    Notes
    -----
    NOTE(review): the scaler is fitted on all rows before the split, so the
    test partition influences the scaling range. This is kept because
    grid_hyperparameter_tuning scales its hold-out data with full-dataset
    min/max as well; change both together if this is tightened.
    """
    target_column = 'phishing'
    selected = list(columns_of_interest)

    df = pd.read_csv(csv_path)

    missing = [col for col in selected + [target_column] if col not in df.columns]
    if missing:
        raise KeyError(f"{csv_path} is missing required columns: {missing}")

    features = df[selected].copy()

    # Replace the -1 placeholder with the (truncated) mean of the other values
    for col in selected:
        is_placeholder = features[col] == -1
        if is_placeholder.any():
            mean_value = int(features.loc[~is_placeholder, col].mean())
            features[col] = features[col].replace(-1, mean_value)

    # MinMaxScaler works column-wise, so one fit over the whole frame gives the
    # same values as fitting a separate scaler on each column.
    X = MinMaxScaler().fit_transform(features)
    y = df[target_column].values

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    return X_train, X_test, y_train, y_test

def train_and_test_model(X_train, y_train, X_test, y_test, epochs, input_dim, optimizer, learning_rate, num_hidden_layers, num_neurons, activation_function, dropout_rate, l1_reg, l2_reg, batch_size=64):
    """Build, train and evaluate a feed-forward binary classifier.

    Architecture: one input Dense layer (L1 kernel regularisation), followed by
    ``num_hidden_layers`` blocks of Dense (L2 kernel regularisation) + Dropout,
    and a single sigmoid output unit.

    Parameters
    ----------
    X_train, y_train : array-like
        Training features and labels.
    X_test, y_test : array-like
        Evaluation features and labels.
    epochs : int
        Number of training epochs.
    input_dim : int
        Number of input features.
    optimizer : str
        ``'SGD'`` selects SGD; any other value selects Adam.
    learning_rate : float
        Learning rate for the chosen optimizer.
    num_hidden_layers : int
        Number of Dense+Dropout blocks after the input layer.
    num_neurons : int
        Units in the input layer and in every hidden layer.
    activation_function : str
        Activation for the input and hidden layers.
    dropout_rate : float
        Dropout rate after each hidden layer.
    l1_reg, l2_reg : float
        L1 factor (input layer) and L2 factor (hidden layers).
    batch_size : int, default 64
        Mini-batch size used by ``model.fit``. Previously fixed at 64 inside
        the function; the default keeps that behaviour for existing callers.

    Returns
    -------
    tuple
        ``(accuracy, model)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` on the test data with predictions thresholded at
        0.5, and ``model`` is the trained Keras model.
    """
    model = Sequential()
    model.add(Dense(num_neurons, input_dim=input_dim, activation=activation_function, kernel_regularizer=l1(l1_reg), kernel_initializer='glorot_uniform'))
    for _ in range(num_hidden_layers):
        model.add(Dense(num_neurons, activation=activation_function, kernel_regularizer=l2(l2_reg), kernel_initializer='glorot_uniform'))
        model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    # Define optimizer with custom learning rate
    if optimizer == 'SGD':
        opt = SGD(learning_rate=learning_rate)
    else:
        opt = Adam(learning_rate=learning_rate)

    model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    y_pred = model.predict(X_test, verbose=0)
    threshold = 0.5

    # Flatten the (n, 1) sigmoid output so it lines up with the 1-D labels.
    y_pred_binary = (y_pred > threshold).astype(int).ravel()
    accuracy = accuracy_score(y_test, y_pred_binary)

    return accuracy, model


def predict_with_model(model, features, actual_results):
    """Score ``model`` on ``features`` and return its accuracy in percent.

    Parameters
    ----------
    model : object with ``predict(features, verbose=0)``
        Trained model returning one probability per sample.
    features : numpy.ndarray
        Either ``(n_samples, n_features)`` or the legacy
        ``(n_samples, 1, n_features)`` layout, which is squeezed to 2-D.
    actual_results : iterable
        Ground-truth labels, one per sample.

    Returns
    -------
    float
        Accuracy on a 0-100 scale, as computed by ``calculate_accur`` after
        thresholding the predicted probabilities at 0.5.
    """
    features = np.asarray(features)
    if features.ndim == 3:
        features = features.reshape((features.shape[0], features.shape[2]))

    y_pred_prob = model.predict(features, verbose=0)

    threshold = 0.5
    y_pred = (y_pred_prob > threshold).astype(int).ravel()

    return calculate_accur(y_pred, actual_results)


def _scale_holdout(training_features, holdout_features):
    """Impute and min-max scale hold-out features with training-set statistics.

    ``training_features`` is the raw (unimputed) training frame restricted to
    the selected columns; ``holdout_features`` is the hold-out frame with -1
    already replaced by NaN. Returns a 2-D array whose columns follow
    ``training_features.columns``.
    """
    columns = list(training_features.columns)
    observed = training_features.replace(-1, np.nan)
    mean_values = observed.mean(axis=0)
    min_values = observed.min(axis=0)
    max_values = training_features.max(axis=0)

    # Select by name so the hold-out columns always line up with the
    # statistics above, regardless of the column order in the hold-out file.
    # NOTE(review): the hold-out is imputed with the float mean, whereas
    # preprocess_data truncates the mean to int for the training data.
    imputed = holdout_features[columns].fillna(mean_values)

    # Fitting on the two rows [min, max] reproduces the training value range.
    scaler = MinMaxScaler()
    scaler.fit(np.vstack([min_values.to_numpy(), max_values.to_numpy()]))
    return scaler.transform(imputed.to_numpy())


def grid_hyperparameter_tuning(csv_paths, columns_of_interest_range, epochs_range, testing_sizes, learning_rates, batch_sizes, optimizers, num_hidden_layers_range, num_neurons_range, activation_functions, dropout_rates, l1_regs, l2_regs, feature_columns=None, holdout_csv_path='/kaggle/input/newnew/features_dataset_new.csv'):
    """Exhaustively train and score models over a hyperparameter grid.

    For every combination, a model is trained on a split of ``csv_path`` and
    then scored on the separate hold-out CSV. Each time the hold-out accuracy
    beats the best seen so far, the model is saved to
    ``/kaggle/working/<accuracy>.h5``. Every run's settings and scores are
    printed and collected.

    Parameters
    ----------
    csv_paths : iterable of str
        Training CSV files.
    columns_of_interest_range : sequence of two ints
        Inclusive ``[min, max]`` number of leading feature columns to use.
    epochs_range, testing_sizes, learning_rates, batch_sizes, optimizers,
    num_neurons_range, activation_functions, dropout_rates, l1_regs, l2_regs : iterable
        Candidate values for the corresponding setting.
    num_hidden_layers_range : sequence of two ints
        Inclusive ``[min, max]`` number of hidden layers.
    feature_columns : sequence of str, optional
        Ordered feature names to take prefixes from. Defaults to the
        module-level ``columns_of_interest``.
    holdout_csv_path : str, optional
        CSV with a ``'phishing'`` column and the selected feature columns,
        used only for scoring.

    Returns
    -------
    list of dict
        One entry per run, in iteration order. ``"accuracy_on_data"`` is a
        fraction (from ``accuracy_score``), whereas ``"accuracy_on_new_data"``
        is a percentage (from ``calculate_accur``).

    Raises
    ------
    NameError
        If ``feature_columns`` is omitted and ``columns_of_interest`` has not
        been defined (its cell was not run).
    """
    if feature_columns is None:
        try:
            feature_columns = columns_of_interest
        except NameError:
            raise NameError(
                "name 'columns_of_interest' is not defined; run the cell that "
                "defines it or pass feature_columns explicitly"
            ) from None

    # The hold-out file does not depend on any hyperparameter: read it once.
    holdout = pd.read_csv(holdout_csv_path)
    holdout_labels = holdout['phishing']
    holdout_features = holdout.drop('phishing', axis=1).replace(-1, np.nan)

    hidden_layer_counts = range(num_hidden_layers_range[0], num_hidden_layers_range[1] + 1)

    results = []
    best_accuracy = 0.0
    for csv_path in csv_paths:
        training_frame = pd.read_csv(csv_path)
        for columns_of_interest_number in range(columns_of_interest_range[0], columns_of_interest_range[1] + 1):
            selected = list(feature_columns[:columns_of_interest_number])

            # Depends only on the dataset and the selected columns.
            scaled_holdout = _scale_holdout(training_frame[selected], holdout_features)

            # preprocess_data is deterministic for a given test size, so each
            # split is computed once and reused by all runs that share it.
            splits = {}

            # product() varies its last argument fastest, i.e. the same
            # iteration order as the equivalent nested for-loops.
            grid = product(epochs_range, testing_sizes, learning_rates, batch_sizes, optimizers, hidden_layer_counts, num_neurons_range, activation_functions, dropout_rates, l1_regs, l2_regs)
            for epochs, test_size, learning_rate, batch_size, optimizer, num_hidden_layers, num_neurons, activation_function, dropout_rate, l1_reg, l2_reg in grid:
                if test_size not in splits:
                    splits[test_size] = preprocess_data(csv_path, selected, test_size)
                X_train, X_test, y_train, y_test = splits[test_size]

                accuracy, model = train_and_test_model(X_train, y_train, X_test, y_test, epochs, X_train.shape[1], optimizer, learning_rate, num_hidden_layers, num_neurons, activation_function, dropout_rate, l1_reg, l2_reg, batch_size=batch_size)

                accuracy_on_new_data = predict_with_model(model, scaled_holdout, holdout_labels)
                if accuracy_on_new_data > best_accuracy:
                    best_accuracy = accuracy_on_new_data
                    best_model_name = f"/kaggle/working/{best_accuracy}.h5"
                    model.save(best_model_name)
                parameters = {
                    "csv_path": csv_path,
                    "columns_of_interest": columns_of_interest_number,
                    "epochs": epochs,
                    "test_size": test_size,
                    "learning_rate": learning_rate,
                    "batch_size": batch_size,
                    "optimizer": optimizer,
                    "num_hidden_layers": num_hidden_layers,
                    "num_neurons": num_neurons,
                    "activation_function": activation_function,
                    "dropout_rate": dropout_rate,
                    "l1_reg": l1_reg,
                    "l2_reg": l2_reg,
                    "accuracy_on_data": accuracy,
                    "accuracy_on_new_data": accuracy_on_new_data
                }
                print(parameters)
                results.append(parameters)

                # Release the global state Keras keeps for each model so that
                # memory does not grow with the number of runs.
                del model
                keras_backend.clear_session()
    return results

# Search space for the grid search.
# NOTE(review): the Cartesian product of the values below is
# 2 * 7 * 5 * 4 * 3 * 3 * 2 * 3 * 3 * 2 * 3 * 3 * 3 = 2,449,440 training runs;
# trim the lists before launching a full run.
csv_paths = [
    '/kaggle/input/zzzzzz/mendeley_dataset_full.csv',
    '/kaggle/input/ssssssss/dataset_small.csv',
]
columns_of_interest_range = [14, 20]  # inclusive [min, max] number of leading feature columns
epochs_range = [10, 20, 30, 40, 50]
testing_sizes = [0.5, 0.4, 0.3, 0.2]
learning_rates = [0.001, 0.01, 0.1]
batch_sizes = [32, 64, 128]
optimizers = ['SGD', 'Adam']
num_hidden_layers_range = [1, 3]  # inclusive [min, max] number of hidden layers
num_neurons_range = [64, 128, 256]
activation_functions = ['relu', 'tanh']
dropout_rates = [0.0, 0.1, 0.2]
l1_regs = [0.0, 0.01, 0.1]
l2_regs = [0.0, 0.01, 0.1]

# Run the grid search; every run is also printed by the function as it finishes.
results = grid_hyperparameter_tuning(
    csv_paths=csv_paths,
    columns_of_interest_range=columns_of_interest_range,
    epochs_range=epochs_range,
    testing_sizes=testing_sizes,
    learning_rates=learning_rates,
    batch_sizes=batch_sizes,
    optimizers=optimizers,
    num_hidden_layers_range=num_hidden_layers_range,
    num_neurons_range=num_neurons_range,
    activation_functions=activation_functions,
    dropout_rates=dropout_rates,
    l1_regs=l1_regs,
    l2_regs=l2_regs,
)

# Dump every collected result once the search has completed.
for result in results:
    print(result)


NameError: name 'columns_of_interest' is not defined

In [None]:
# Select the run with the highest hold-out accuracy; on ties, max() keeps the
# earliest such entry in `results`.
best_result = max(results, key=lambda entry: entry['accuracy_on_new_data'])

# Report every setting and score recorded for that run.
print("Best Result:")
for name in best_result:
    print(f"{name}: {best_result[name]}")