In [2]:
import os
os.chdir("../")
%pwd

'c:\\Users\\HP\\OneDrive\\Desktop\\pishing'

In [3]:
from dataclasses import dataclass
from pathlib import Path


@dataclass(frozen=True)
class ModelTrainerConfig:
    root_dir:Path
    file_path:Path

In [4]:
from phishingdetection.constants import *
from phishingdetection.utils.common import read_yaml, create_directories
from phishingdetection.logging import logger
import pandas as pd

In [5]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    
    def get_model_trainer_config(self) -> ModelTrainerConfig:
        config = self.config.model_trainer
        
        create_directories([config.root_dir])

        model_trainer_config = ModelTrainerConfig(
            root_dir=config.root_dir,
            file_path=config.file_path,
            # Add other attributes as needed from your ModelTrainerConfig class
        )

        return model_trainer_config


In [6]:

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier



In [7]:
import logging
import pandas as pd
import pickle
from sklearn.metrics import accuracy_score, precision_score



class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def input_data(self):
        logging.info("Splitting training and test input data")
        train_data = pd.read_csv(r"artifacts\data_transformation\train_scaled.csv")
        test_data = pd.read_csv(r"artifacts\data_transformation\test_scaled.csv")
        logging.info("Split training and test input data")
        X_train, y_train, X_test, y_test = (
            train_data.iloc[:, :-1],
            train_data.iloc[:, -1],
            test_data.iloc[:, :-1],
            test_data.iloc[:, -1]
        )
        
        models = [
            LogisticRegression(max_iter=1000),
            RidgeClassifier(alpha=0.005),
            LinearSVC(dual=False),
            SVC(),
            KNeighborsClassifier(n_neighbors=5),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            MLPClassifier()
        ]
            
        
        return X_train, y_train, X_test, y_test, models

    

In [8]:
class ModelTrainer:
    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def input_data(self):
        logging.info("Splitting training and test input data")
        train_data_scaled = pd.read_csv(r"artifacts\data_transformation\train_scaled.csv")
        test_data_scaled = pd.read_csv(r"artifacts\data_transformation\test_scaled.csv")
        train_data=pd.read_csv(r"artifacts\data_transformation\train_y_scaled.csv")
        test_data=pd.read_csv(r"artifacts\data_transformation\test_y_scaled.csv")
        logging.info("Split training and test input data")
        X_train, y_train, X_test, y_test = (
            train_data_scaled.iloc[:, :],
            train_data.iloc[:,:],
            test_data_scaled.iloc[:, :],
            test_data.iloc[:, :]
        )
        
        models = [
            LogisticRegression(max_iter=1000),
            RidgeClassifier(alpha=0.005),
            LinearSVC(dual=False),
            SVC(),
            KNeighborsClassifier(n_neighbors=5),
            DecisionTreeClassifier(),
            RandomForestClassifier(),
            AdaBoostClassifier(),
            MLPClassifier()
        ]

        return X_train, y_train, X_test, y_test, models

    def model_selection(self, X_train, y_train, X_test, y_test, models):
        accuracy_result = []
        precision_result = []
        models_dict = {}

        for model in models:
            model.fit(X_train, y_train.values.ravel())
            y_pred = model.predict(X_test)
            precision = precision_score(y_test, y_pred, average='micro')
            accuracy = accuracy_score(y_test, y_pred)
            accuracy_result.append(accuracy)
            precision_result.append(precision)
            models_dict[str(model)] = {'model': model, 'accuracy': accuracy, 'precision': precision}

        return pd.DataFrame({'models': list(models_dict.keys()), 'accuracy': accuracy_result, 'precision': precision_result}), models_dict

    def save_models(self, models_dict):
        logging.info("Saving models to file")
        with open(self.config.file_path, 'wb') as f:
            pickle.dump(models_dict, f)
        logging.info("Models saved successfully.")

In [9]:
try:
    config = ConfigurationManager()
    model_trainer_config = config.get_model_trainer_config()
    model_trainer_config = ModelTrainer(config=model_trainer_config)
    X_train, y_train, X_test, y_test,models = model_trainer_config.input_data()
    models_dict= model_trainer_config.model_selection(X_train, y_train, X_test, y_test, models)
    model_trainer_config.save_models(models_dict)
except Exception as e:
    raise e


[2024-04-09 14:57:08,216: INFO: common: yaml file: config\config.yaml loaded successfully]
[2024-04-09 14:57:08,218: INFO: common: yaml file: params.yaml loaded successfully]
[2024-04-09 14:57:08,221: INFO: common: created directory at: artifacts]
[2024-04-09 14:57:08,223: INFO: common: created directory at: artifacts/model_trainer]
[2024-04-09 14:57:08,224: INFO: 3652498402: Splitting training and test input data]
[2024-04-09 14:57:08,532: INFO: 3652498402: Split training and test input data]
[2024-04-09 15:03:15,555: INFO: 3652498402: Saving models to file]
[2024-04-09 15:03:15,655: INFO: 3652498402: Models saved successfully.]




In [10]:
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape) 

(70917, 22) (70917, 1) (17730, 22) (17730, 1)


In [11]:
y_train

Unnamed: 0,phishing
0,0
1,0
2,0
3,1
4,1
...,...
70912,1
70913,0
70914,1
70915,0


In [12]:
import pickle

# Load the .pkl file
with open('artifacts\model_trainer\model_selection.pkl', 'rb') as f:
    data = pickle.load(f)

# Check what keys are present in the loaded data
print("Keys in the pickle file:", data.keys())

# Assuming the models are stored under a key named 'models'
if 'models' in data:
    # Print or iterate through the list of models
    print("List of models:")
    for model in data['models']:
        print(model)
else:
    print("No 'models' key found in the pickle file.")


AttributeError: 'tuple' object has no attribute 'keys'

In [None]:

import pickle
from sklearn.tree import DecisionTreeClassifier

# Load the pickle file
with open('saved_models.pkl', 'rb') as f:
    data = pickle.load(f)

data



FileNotFoundError: [Errno 2] No such file or directory: 'saved_models.pkl'

In [None]:
import pickle

# Load the data from the pickle file
with open('artifacts\model_trainer\model_selection.pkl', 'rb') as f:
    data = pickle.load(f)

# Step 1: Check the type of loaded data
print("Type of loaded data:", type(data))

# Step 2: Inspect the loaded data
print("Loaded data:", data)

# Step 3: Access the 'models' dictionary if present
if isinstance(data, dict):
    models_dict = data.get('models')
    if models_dict is not None:
        # Now you can proceed to check the types of values associated with each key in models_dict
        for key, value in models_dict.items():
            print("Key:", key, "Value type:", type(value))
    else:
        print("The loaded data does not contain a 'models' dictionary.")
else:
    print("The loaded data is not a dictionary.")



FileNotFoundError: [Errno 2] No such file or directory: 'artifacts\\model_trainer\\model_selection.pkl'

In [None]:
import pickle

# Load the pickle file
with open('models.pkl', 'rb') as f:
    data = pickle.load(f)


# Access the models dictionary
models_dict = data['model']

# Assuming index 5 contains the trained DecisionTreeClassifier model
best_model = models_dict

# Now you can use the best_model for prediction
# For example, if you have some new data 'X_test', you can predict using:
y_pred = best_model.predict(X_test)



FileNotFoundError: [Errno 2] No such file or directory: 'models.pkl'

In [None]:
# Check if the second element of the loaded data is a dictionary
if isinstance(data[1], dict):
    print("Models are saved in the file.")
else:
    print("Models are not saved in the file.")


Models are saved in the file.


In [None]:
# Access the models dictionary from the loaded data
models_dict = data[1]

# Now you can access each model object along with its associated metrics
for model_name, model_data in models_dict.items():
    model = model_data['model']
    accuracy = model_data['accuracy']
    precision = model_data['precision']
    print(f"Model: {model_name}, Accuracy: {accuracy}, Precision: {precision}")

# You can also access a specific model by its name
specific_model_name = 'LogisticRegression(max_iter=1000)'  # Example model name
specific_model_data = models_dict.get(specific_model_name)
if specific_model_data:
    specific_model = specific_model_data['model']
    specific_accuracy = specific_model_data['accuracy']
    specific_precision = specific_model_data['precision']
    print(f"Specific Model: {specific_model}, Accuracy: {specific_accuracy}, Precision: {specific_precision}")
else:
    print(f"Model '{specific_model_name}' not found.")


Model: LogisticRegression(max_iter=1000), Accuracy: 0.8932148767528961, Precision: 0.8932148767528961
Model: RidgeClassifier(alpha=0.005), Accuracy: 0.8767528960891908, Precision: 0.8767528960891908
Model: LinearSVC(dual=False), Accuracy: 0.8966117933977876, Precision: 0.8966117933977876
Model: SVC(), Accuracy: 0.8927793746189356, Precision: 0.8927793746189356
Model: KNeighborsClassifier(), Accuracy: 0.9324971692361292, Precision: 0.9324971692361292
Model: DecisionTreeClassifier(), Accuracy: 0.9324971692361292, Precision: 0.9324971692361292
Model: RandomForestClassifier(), Accuracy: 0.9574078912986673, Precision: 0.9574078912986673
Model: AdaBoostClassifier(), Accuracy: 0.9043637313822838, Precision: 0.9043637313822838
Model: MLPClassifier(), Accuracy: 0.9383328978311993, Precision: 0.9383328978311993
Specific Model: LogisticRegression(max_iter=1000), Accuracy: 0.8932148767528961, Precision: 0.8932148767528961


In [None]:
import pickle
import pandas as pd

# Load the new data

# Preprocess the new data (if needed)
# Make sure it has the same format as the data used for training the models

# Load the models from the saved file
with open('saved_models.pkl', 'rb') as f:
    loaded_data = pickle.load(f)
    models_dict = loaded_data[1]  # Assuming the models are saved in the second element of the tuple

# Make predictions using the loaded models
for model_name, model_data in models_dict.items():
    model = model_data['model']
    y_pred = model.predict(X_test)  # Assuming new_data has the same features as X_train
    print(f"Model: {model_name}, Predictions: {y_pred}")


Model: LogisticRegression(max_iter=1000), Predictions: [0 0 0 ... 1 0 1]
Model: RidgeClassifier(alpha=0.005), Predictions: [0 0 0 ... 1 0 1]
Model: LinearSVC(dual=False), Predictions: [0 0 0 ... 1 0 1]
Model: SVC(), Predictions: [0 0 0 ... 1 0 1]
Model: KNeighborsClassifier(), Predictions: [0 0 1 ... 1 0 1]
Model: DecisionTreeClassifier(), Predictions: [0 0 1 ... 1 0 1]
Model: RandomForestClassifier(), Predictions: [0 0 1 ... 1 0 1]
Model: AdaBoostClassifier(), Predictions: [0 0 1 ... 1 0 1]
Model: MLPClassifier(), Predictions: [0 0 1 ... 1 0 1]


In [None]:
import pickle

# Load the pickle file
with open('models.pkl', 'rb') as f:
    data = pickle.load(f)


# Access the models dictionary
models_dict = data['model']

# Assuming index 5 contains the trained DecisionTreeClassifier model
best_model = models_dict

# Now you can use the best_model for prediction
# For example, if you have some new data 'X_test', you can predict using:
y_pred = best_model.predict(X_test)
y_pred



FileNotFoundError: [Errno 2] No such file or directory: 'models.pkl'

In [None]:
import pickle
from sklearn.ensemble import RandomForestClassifier  # Assuming RandomForestClassifier is the model you're using

# Load the model from the file
model_file_path = 'models.pkl'  # Update with your file path
with open(model_file_path, 'rb') as f:
    loaded_model = pickle.load(f)

# Check if the loaded object is an instance of RandomForestClassifier
if isinstance(loaded_model, RandomForestClassifier):
    print("Model loaded successfully and is an instance of RandomForestClassifier.")
else:
    print("Error: The loaded object is not an instance of RandomForestClassifier.")


FileNotFoundError: [Errno 2] No such file or directory: 'models.pkl'