In [1]:
import os

In [2]:
os.chdir("../")

In [3]:
%pwd

'c:\\Users\\HP\\OneDrive\\Desktop\\pishing'

In [4]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir:Path
    file_path:Path
    models_path:Path

In [5]:
from phishingdetection.constants import *
from phishingdetection.utils.common import read_yaml, create_directories
from phishingdetection.logging import logger
import pandas as pd


In [6]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])


    
    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation

        create_directories([config.root_dir])
        

        model_evaluation_config = ModelEvaluationConfig(
              root_dir=config.root_dir,
              file_path= config.file_path,
              models_path=config.models_path
            
           
        )

        return model_evaluation_config


In [7]:
from phishingdetection.constants import *
from phishingdetection.utils.common import read_yaml, create_directories, load_object
from phishingdetection.logging import logger
import pandas as pd
from phishingdetection.logging import logger
import pickle

In [8]:
import pickle
import logging



class ModelEvaluation:
    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def select_best_model(self):
        # Load the model selection data from the file
        model_selection_file_path = 'artifacts/model_trainer/model_selection.pkl'
        with open(model_selection_file_path, 'rb') as f:
            loaded_data = pickle.load(f)

        # Check if the loaded data contains the necessary information
        if isinstance(loaded_data, tuple) and len(loaded_data) == 2:
            model_dataframe, model_info_dict = loaded_data
            
            # Process DataFrame
            best_model = None
            best_accuracy = 0
            best_precision = None
            
            for index, row in model_dataframe.iterrows():
                accuracy = row['accuracy']
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_model_name = row['models']
                    best_precision = row.get('precision', None)  # Precision may not always be present

            if best_model_name in model_info_dict:
                best_model_info = model_info_dict[best_model_name]
                best_model = best_model_info['model']
                
                logging.info(f"Best model with accuracy {best_accuracy} and precision {best_precision} loaded successfully.")
                
                # Save the best model to another file
                best_model_path = 'best_models.pkl'  # Choose a file path for the best model
                with open(best_model_path, 'wb') as f:
                    pickle.dump(best_model, f)
                logging.info(f"Best model saved to {best_model_path}")
            else:
                logging.error("No model found with valid accuracy.")
        else:
            logging.error("Loaded data does not contain the necessary information.")


    def save_models(self, best_model):
        logging.info("Saving models to file")
        with open(self.config.file_path, 'wb') as f:
            pickle.dump(best_model, f)
        logging.info("Models saved successfully.")

In [12]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    model_evaluation_config = ModelEvaluation(config=model_evaluation_config)
    best_model=model_evaluation_config.select_best_model()
    model_evaluation_config.save_models(best_model)
except Exception as e:
    raise e

[2024-04-09 15:08:36,364: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-09 15:08:36,364: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-09 15:08:36,370: INFO: common: created directory at: artifacts]
[2024-04-09 15:08:36,373: INFO: common: created directory at: artifacts/model_evaluation]
[2024-04-09 15:08:36,534: INFO: 936551666: Best model with accuracy 0.9537507050197406 and precision 0.9537507050197406 loaded successfully.]
[2024-04-09 15:08:36,689: INFO: 936551666: Best model saved to best_models.pkl]
[2024-04-09 15:08:36,710: INFO: 936551666: Saving models to file]
[2024-04-09 15:08:36,715: INFO: 936551666: Models saved successfully.]


In [10]:
def predict(self,features):
        try:
            
            model=load_object(file_path='artifacts.model_evalution.pkl')
            preprocessor=load_object(file_path='scaler.pkl')
            print("After Loading")
            data_scaled=preprocessor.transform(features)
            preds=model.predict(data_scaled)
            return preds
        
        except Exception as e:
            raise e

In [11]:
import pickle
import pandas as pd

# Load the new data

# Preprocess the new data (if needed)
# Make sure it has the same format as the data used for training the models

# Load the models from the saved file
with open('artifacts.model_evalution.pkl', 'rb') as f:
    loaded_data = pickle.load(f)
    models_dict = loaded_data[1]  # Assuming the models are saved in the second element of the tuple

# Make predictions using the loaded models

    model = models_dict['model']
    y_pred = model.predict(X_test)  # Assuming new_data has the same features as X_train
    print(f"Model: {models_dict}, Predictions: {y_pred}")


FileNotFoundError: [Errno 2] No such file or directory: 'artifacts.model_evalution.pkl'

In [None]:
import re
import urllib.parse
import socket
import whois
import time

def extract_features(url):
    features = {}
    
    # URL parsing
    parsed_url = urllib.parse.urlparse(url)
    features['length_url'] = len(url)
    features['qty_dot_url'] = url.count('.')
    features['qty_hyphen_url'] = url.count('-')
    features['qty_underline_url'] = url.count('_')
    features['qty_slash_url'] = url.count('/')
    features['qty_questionmark_url'] = url.count('?')
    features['qty_equal_url'] = url.count('=')
    features['qty_at_url'] = url.count('@')
    features['qty_and_url'] = url.count('&')
    features['qty_exclamation_url'] = url.count('!')
    features['qty_space_url'] = url.count(' ')
    features['qty_tilde_url'] = url.count('~')
    features['qty_comma_url'] = url.count(',')
    features['qty_plus_url'] = url.count('+')
    features['qty_asterisk_url'] = url.count('*')
    features['qty_hashtag_url'] = url.count('#')
    features['qty_dollar_url'] = url.count('$')
    features['qty_percent_url'] = url.count('%')
    
    # Domain parsing
    domain = parsed_url.netloc
    features['qty_dot_domain'] = domain.count('.')
    features['qty_hyphen_domain'] = domain.count('-')
    features['qty_underline_domain'] = domain.count('_')
    features['qty_slash_domain'] = domain.count('/')
    features['qty_questionmark_domain'] = domain.count('?')
    features['qty_equal_domain'] = domain.count('=')
    features['qty_at_domain'] = domain.count('@')
    features['qty_and_domain'] = domain.count('&')
    features['qty_exclamation_domain'] = domain.count('!')
    features['qty_space_domain'] = domain.count(' ')
    features['qty_tilde_domain'] = domain.count('~')
    features['qty_comma_domain'] = domain.count(',')
    features['qty_plus_domain'] = domain.count('+')
    features['qty_asterisk_domain'] = domain.count('*')
    features['qty_hashtag_domain'] = domain.count('#')
    features['qty_dollar_domain'] = domain.count('$')
    features['qty_percent_domain'] = domain.count('%')
    features['domain_length'] = len(domain)
    
    # Check if domain resolves to an IP
    try:
        ip_address = socket.gethostbyname(domain)
        features['domain_in_ip'] = 1
    except socket.error:
        ip_address = None
        features['domain_in_ip'] = 0
        
    # Check if domain is present in URL
    features['server_client_domain'] = 1 if domain in url else 0
    
    # Extract TLD
    tld = domain.split('.')[-1]
    features['qty_tld_url'] = 1 if tld else 0
    
    # Extract directory
    directory = parsed_url.path
    features['qty_dot_directory'] = directory.count('.')
    features['qty_hyphen_directory'] = directory.count('-')
    features['qty_underline_directory'] = directory.count('_')
    features['qty_slash_directory'] = directory.count('/')
    features['qty_questionmark_directory'] = directory.count('?')
    features['qty_equal_directory'] = directory.count('=')
    features['qty_at_directory'] = directory.count('@')
    features['qty_and_directory'] = directory.count('&')
    features['qty_exclamation_directory'] = directory.count('!')
    features['qty_space_directory'] = directory.count(' ')
    features['qty_tilde_directory'] = directory.count('~')
    features['qty_comma_directory'] = directory.count(',')
    features['qty_plus_directory'] = directory.count('+')
    features['qty_asterisk_directory'] = directory.count('*')
    features['qty_hashtag_directory'] = directory.count('#')
    features['qty_dollar_directory'] = directory.count('$')
    features['qty_percent_directory'] = directory.count('%')
    features['directory_length'] = len(directory)
    
    # Extract filename
    filename = parsed_url.path.split('/')[-1]
    features['qty_dot_file'] = filename.count('.')
    features['qty_hyphen_file'] = filename.count('-')
    features['qty_underline_file'] = filename.count('_')
    features['qty_slash_file'] = filename.count('/')
    features['qty_questionmark_file'] = filename.count('?')
    features['qty_equal_file'] = filename.count('=')
    features['qty_at_file'] = filename.count('@')
    features['qty_and_file'] = filename.count('&')
    features['qty_exclamation_file'] = filename.count('!')
    features['qty_space_file'] = filename.count(' ')
    features['qty_tilde_file'] = filename.count('~')
    features['qty_comma_file'] = filename.count(',')
    features['qty_plus_file'] = filename.count('+')
    features['qty_asterisk_file'] = filename.count('*')
    features['qty_hashtag_file'] = filename.count('#')
    features['qty_dollar_file'] = filename.count('$')
    features['qty_percent_file'] = filename.count('%')
    features['file_length'] = len(filename)
    
    # Extract parameters
    parameters = parsed_url.query
    features['qty_dot_params'] = parameters.count('.')
    features['qty_hyphen_params'] = parameters.count('-')
    features['qty_underline_params'] = parameters.count('_')
    features['qty_slash_params'] = parameters.count('/')
    features['qty_questionmark_params'] = parameters.count('?')
    features['qty_equal_params'] = parameters.count('=')
    features['qty_at_params'] = parameters.count('@')
    features['qty_and_params'] = parameters.count('&')
    features['qty_exclamation_params'] = parameters.count('!')
    features['qty_space_params'] = parameters.count(' ')
    features['qty_tilde_params'] = parameters.count('~')
    features['qty_comma_params'] = parameters.count(',')
    features['qty_plus_params'] = parameters.count('+')
    features['qty_asterisk_params'] = parameters.count('*')
    features['qty_hashtag_params'] = parameters.count('#')
    features['qty_dollar_params'] = parameters.count('$')
    features['qty_percent_params'] = parameters.count('%')
    features['params_length'] = len(parameters)
    features['tld_present_params'] = 1 if tld in parameters else 0
    
    # Check if email present in URL
    features['qty_params'] = parameters.count('@')
    features['email_in_url'] = 1 if '@' in url else 0
    
    # Response time
    start_time = time.time()
    response = urllib.request.urlopen(url)
    end_time = time.time()
    features['time_response'] = end_time - start_time
    
    # SPF record of domain
    try:
        spf_record = whois.whois(domain).get('spf', None)
        features['domain_spf'] = 1 if spf_record else 0
    except Exception:
        features['domain_spf'] = 0
    
    return features




In [None]:
features=extract_features("https://pypi.org/project/python-whois/")

In [None]:
import pickle

# Load the data from the pickle file
with open('models.pkl', 'rb') as f:
    data = pickle.load(f)

# Step 1: Check the type of loaded data
print("Type of loaded data:", type(data))

# Step 2: Inspect the loaded data
print("Loaded data:", data)

# Step 3: Access the 'models' dictionary if present
if isinstance(data, dict):
    models_dict = data.get('models')
    if models_dict is not None:
        # Now you can proceed to check the types of values associated with each key in models_dict
        for key, value in models_dict.items():
            print("Key:", key, "Value type:", type(value))
    else:
        print("The loaded data does not contain a 'models' dictionary.")
else:
    print("The loaded data is not a dictionary.")



Type of loaded data: <class 'dict'>
Loaded data: {'model': RandomForestClassifier(), 'accuracy': 0.9574078912986673, 'precision': 0.9574078912986673}
The loaded data does not contain a 'models' dictionary.


In [None]:
import pickle

# Load the pickle file
with open('models.pkl', 'rb') as f:
    data = pickle.load(f)


# Access the models dictionary
models_dict = data['model']

# Assuming index 5 contains the trained DecisionTreeClassifier model
best_model = models_dict

# Now you can use the best_model for prediction
# For example, if you have some new data 'X_test', you can predict using:
y_pred = best_model.predict(X_test)



NameError: name 'X_test' is not defined

In [None]:
import pickle
import logging

# Load the model selection data from the file
model_selection_file_path = 'saved_models.pkl'
with open(model_selection_file_path, 'rb') as f:
    loaded_data = pickle.load(f)

# Check if the loaded data contains the necessary information
if 'model' in loaded_data and 'accuracy' in loaded_data:
    best_model = loaded_data['model']
    best_accuracy = loaded_data['accuracy']
    best_precision = loaded_data.get('precision', None)  # Precision may not always be present
    logging.info(f"Best model with accuracy {best_accuracy} and precision {best_precision} loaded successfully.")

    # Save the best model to another file
    best_model_path = 'best_model.pkl'  # Choose a file path for the best model
    with open(best_model_path, 'wb') as f:
        pickle.dump(best_model, f)
    logging.info(f"Best model saved to {best_model_path}")
else:
    logging.error("Loaded data does not contain the necessary information.")


ValueError: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier  # Assuming RandomForestClassifier is the model you're using

# Load the model from the file
model_file_path = 'best_model.pkl'
with open(model_file_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Check if the loaded object is an instance of RandomForestClassifier
if isinstance(loaded_model, RandomForestClassifier):
    print("Model loaded successfully and is an instance of RandomForestClassifier.")
else:
    print("Error: The loaded object is not an instance of RandomForestClassifier.")


Model loaded successfully and is an instance of RandomForestClassifier.


In [None]:
from phishingdetection.utils.common import expected_features, extract_features
from sklearn.ensemble import RandomForestClassifier
import pickle

In [None]:
import pickle
import pandas as pd
from phishingdetection.utils.common import expected_features, extract_features

# Load the model
model_file_path = 'best_model.pkl'
with open(model_file_path, 'rb') as f:
    model = pickle.load(f)

# Load the scaler
scaler_file_path = 'scalers.pkl'
with open(scaler_file_path, 'rb') as f:
    scaler = pickle.load(f)

# Assuming you have a URL for which you want to make a prediction
url = "https://in-a-flask-application"

# Extract features from the URL (assuming this function exists)
features = extract_features(url)

# Match expected features
features = {feature: features.get(feature, 0) for feature in expected_features}

# Sort features based on the expected feature order
features = [x for _, x in sorted(zip(expected_features, features.values()))]

# Reshape features to match the input shape expected by the model
feature_values = [features]

# Scale features
scaled_feature_values = scaler.transform(feature_values)

# Predict using the loaded machine learning model
prediction = model.predict(scaled_feature_values)

print(prediction)


[1]




In [None]:
import pickle
import pandas as pd
from phishingdetection.utils.common import expected_features, extract_features
import numpy as np


# Load the model
model_file_path = 'best_model.pkl'
with open(model_file_path, 'rb') as f:
    model = pickle.load(f)

# Load the scaler
scaler_file_path = 'scaler.pkl'
with open(scaler_file_path, 'rb') as f:
    scaler = pickle.load(f)

# Assuming you have a URL for which you want to make a prediction
url = "https://www.digitalocean.com/community/tutorials/how-to-handle-errors-in-a-flask-application"

# Extract features from the URL (assuming this function exists)
features = extract_features(url)
features = extract_features(url)

# Arrange dict_two based on the order of expected_features
arranged_dict_two = {feature: features.get(feature, 0) for feature in expected_features}
feature_values= [value for value in arranged_dict_two.values()]
feature_values = np.array(feature_values)
feature_values = feature_values.reshape(-1, 1)

scaled_feature_values = scaler.transform(feature_values)

# Predict using the loaded machine learning model
prediction = model.predict(scaled_feature_values)

print(prediction)



# Print or use arranged_dict_two



ValueError: X has 1 features, but MinMaxScaler is expecting 111 features as input.

In [None]:
from phishingdetection.utils.common import expected_features, extract_features
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.metrics import accuracy_score, precision_score
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Load the model from the file
model_file_path = 'best_model.pkl'
with open(model_file_path, 'rb') as f:
    model = pickle.load(f)
    scaler=MinMaxScaler()
    # Load test data
    test_data = pd.read_csv(r"artifacts\data_ingestion\test.csv")
    x_test = test_data.iloc[:, :-1]  # Features
    y_test = test_data.iloc[:, -1]   # Target
    x_test= scaler.fit_transform(x_test)


    # Predict using the loaded machine learning model
    y_pred = model.predict(x_test)

    # Evaluate model
    precision = precision_score(y_test, y_pred, average='micro')
    accuracy = accuracy_score(y_test, y_pred)

print("Precision:", precision)
print("Accuracy:", accuracy)




Precision: 0.8397352147025521
Accuracy: 0.8397352147025521


In [None]:
x_test.shape ,y_test.shape

((11481, 111), (11481,))

In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier

# Load the model from the file
model_file_path = 'best_model.pkl'
with open(model_file_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Check if the loaded object is an instance of RandomForestClassifier
if isinstance(loaded_model, RandomForestClassifier):
    print("Model loaded successfully and is an instance of RandomForestClassifier.")
else:
    print("Error: The loaded object is not an instance of RandomForestClassifier.")


Model loaded successfully and is an instance of RandomForestClassifier.


In [None]:
features = extract_features("https://www.digitalocean.com/community/tutorials/how-to-handle-errors-in-a-flask-application")

# Count how many expected features are present in the features dictionary
matching_count = sum(1 for feature in expected_features if feature in features)

print("Number of matching features:", matching_count)

Number of matching features: 98


In [None]:
import pickle

# Load the data from the pickle file
with open('models.pkl', 'rb') as f:
    data = pickle.load(f)

# Step 1: Check the type of loaded data
print("Type of loaded data:", type(data))

# Step 2: Inspect the loaded data
print("Loaded data:", data)

# Step 3: Access the 'models' dictionary if present
if isinstance(data, dict):
    models_dict = data.get('models')
    if models_dict is not None:
        # Now you can proceed to check the types of values associated with each key in models_dict
        for key, value in models_dict.items():
            print("Key:", key, "Value type:", type(value))
    else:
        print("The loaded data does not contain a 'models' dictionary.")
else:
    print("The loaded data is not a dictionary.")



Type of loaded data: <class 'dict'>
Loaded data: {'model': RandomForestClassifier(), 'accuracy': 0.9574078912986673, 'precision': 0.9574078912986673}
The loaded data does not contain a 'models' dictionary.


In [None]:
from phishingdetection.utils.common import expected_features, extract_features
extracted_features = extract_features("https://www.digitalocean.com/community/tutorials/how-to-handle-errors-in-a-flask-application")

# Combine expected features with extracted features
sorted_extracted_features = [x for _, x in sorted(zip(expected_features, extracted_features))]

print(sorted_extracted_features)



['qty_dot_file', 'qty_tld_url', 'server_client_domain', 'time_response', 'qty_dot_params', 'qty_dot_domain', 'tld_present_params', 'qty_exclamation_directory', 'qty_exclamation_domain', 'qty_exclamation_file', 'qty_exclamation_params', 'qty_at_url', 'qty_hashtag_directory', 'qty_hashtag_domain', 'qty_hashtag_file', 'qty_hashtag_params', 'qty_plus_url', 'qty_and_directory', 'qty_and_domain', 'qty_and_file', 'qty_and_params', 'qty_equal_url', 'qty_plus_directory', 'qty_plus_domain', 'qty_plus_file', 'qty_plus_params', 'qty_tilde_url', 'qty_percent_directory', 'qty_percent_domain', 'qty_percent_file', 'qty_percent_params', 'qty_hashtag_url', 'qty_hyphen_directory', 'qty_hyphen_domain', 'qty_hyphen_file', 'qty_hyphen_params', 'length_url', 'qty_at_directory', 'qty_at_domain', 'qty_at_file', 'qty_at_params', 'qty_questionmark_url', 'qty_space_directory', 'qty_space_domain', 'qty_space_file', 'qty_space_params', 'qty_and_url', 'qty_dollar_directory', 'qty_dollar_domain', 'qty_dollar_file', '