In [1]:
import os
import torch
import torch.nn as nn
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import numpy as np
from PIL import Image

In [2]:
# Hyperparameters
BATCH_SIZE = 4
EPOCHS = 30
LEARNING_RATE = 0.001
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Dataset paths
DATA_DIR = "/home/mahdi/Phishing_Project/datasets"  # Dataset with logo folders
OUTPUT_DIR = "/home/mahdi/Phishing_Project/trained_model"

In [4]:
# Data Transforms
data_transforms = {
    "train": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.RandomRotation(10),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
    "val": transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

In [5]:
#Load Dataset
dataset = datasets.ImageFolder(DATA_DIR, transform=data_transforms["train"])
train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=0.2, random_state=42)
train_dataset = torch.utils.data.Subset(dataset, train_idx)
val_dataset = torch.utils.data.Subset(dataset, val_idx)

dataloaders = {
    "train": DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True),
    "val": DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False),
}
dataset_sizes = {"train": len(train_dataset), "val": len(val_dataset)}
class_names = dataset.classes


In [6]:
# Load Pretrained EfficientNet
model = models.efficientnet_b0(weights=models.EfficientNet_B0_Weights.IMAGENET1K_V1)
num_features = model.classifier[1].in_features
model.classifier[1] = nn.Linear(num_features, len(class_names))  # Replace final layer for custom classes
model = model.to(DEVICE)


In [7]:
 #Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

In [8]:
# Training Function
def train_model(model, dataloaders, criterion, optimizer, num_epochs=EPOCHS):
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")
        print("-" * 10)

        for phase in ["train", "val"]:
            if phase == "train":
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs, labels = inputs.to(DEVICE), labels.to(DEVICE)

                optimizer.zero_grad()

                with torch.set_grad_enabled(phase == "train"):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if phase == "train":
                        loss.backward()
                        optimizer.step()

                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f"{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}")

        print()

    return model

In [9]:
# Train and Save Model
model = train_model(model, dataloaders, criterion, optimizer)
os.makedirs(OUTPUT_DIR, exist_ok=True)
torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "efficientnet_finetuned_1.pth"))
print("Model training complete and saved.")


Epoch 1/30
----------




train Loss: 0.5512 Acc: 0.7901
val Loss: 0.2595 Acc: 0.9048

Epoch 2/30
----------
train Loss: 0.4071 Acc: 0.8395
val Loss: 0.4424 Acc: 0.7143

Epoch 3/30
----------
train Loss: 0.3725 Acc: 0.8642
val Loss: 0.8003 Acc: 0.7619

Epoch 4/30
----------
train Loss: 0.4172 Acc: 0.8272
val Loss: 0.2804 Acc: 0.9048

Epoch 5/30
----------
train Loss: 0.2839 Acc: 0.9136
val Loss: 0.2161 Acc: 0.9048

Epoch 6/30
----------
train Loss: 0.0868 Acc: 0.9877
val Loss: 0.0405 Acc: 1.0000

Epoch 7/30
----------
train Loss: 0.1625 Acc: 0.9753
val Loss: 0.4959 Acc: 0.9048

Epoch 8/30
----------
train Loss: 0.2670 Acc: 0.9012
val Loss: 0.2600 Acc: 0.9524

Epoch 9/30
----------
train Loss: 0.1446 Acc: 0.9383
val Loss: 0.3306 Acc: 0.8095

Epoch 10/30
----------
train Loss: 0.1349 Acc: 0.9630
val Loss: 0.3232 Acc: 0.8571

Epoch 11/30
----------
train Loss: 0.2246 Acc: 0.9506
val Loss: 0.6845 Acc: 0.8571

Epoch 12/30
----------
train Loss: 0.5143 Acc: 0.8395
val Loss: 0.1790 Acc: 0.9048

Epoch 13/30
----------


In [10]:
# Similarity Function
def get_image_embedding(image_path, model, transform):
    model.eval()
    img = Image.open(image_path).convert("RGB")
    img = transform(img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        features = model.extract_features(img)
    return features.squeeze().cpu().numpy()

In [11]:
def find_most_similar(input_image_path, valid_images_folder, model, transform):
    input_embedding = get_image_embedding(input_image_path, model, transform)

    most_similar_image = None
    highest_similarity = -1

    for valid_image_name in os.listdir(valid_images_folder):
        valid_image_path = os.path.join(valid_images_folder, valid_image_name)
        valid_embedding = get_image_embedding(valid_image_path, model, transform)
        similarity = cosine_similarity(
            [input_embedding], [valid_embedding]
        )[0][0]

        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_image = valid_image_name

    return most_similar_image, highest_similarity

In [12]:
# Example: Find most similar logo
# 'BM_LOGO-04.png'
img1 = 'test_2/309_21.jpg'
valid_img = ['BM_LOGO-00.png' , 'BM_LOGO-01.png' ,  'BM_LOGO-02.png' , 'BM_LOGO-03.png' , 'BM_LOGO-04.png','BM_LOGO-05.png']
input_image_path = f"/home/mahdi/Phishing_Project/images/{img1}"

# valid_img_path = '/home/mahdi/Phishing_Project/Valid_images/'

# input_image_path = "/path/to/input/logo.jpg"
valid_images_folder = '/home/mahdi/Phishing_Project/Valid_images/'
most_similar, similarity_score = find_most_similar(input_image_path, valid_images_folder, model, data_transforms["val"])
print(f"Most similar image: {most_similar} with similarity score: {similarity_score:.4f}")

AttributeError: 'EfficientNet' object has no attribute 'extract_features'

# Test Model

In [13]:
# Define the model architecture with a matching classifier
model = models.efficientnet_b0(weights=None)  # Initialize without pre-trained weights
num_classes = 2  # Replace with the number of classes you trained on
model.classifier[1] = torch.nn.Linear(model.classifier[1].in_features, num_classes)

# Load the saved weights
model_path = "/home/mahdi/Phishing_Project/trained_model/efficientnet_finetuned.pth"
model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))
model.eval()  # Set model to evaluation mode
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))


  model.load_state_dict(torch.load(model_path, map_location=torch.device("cpu")))


In [14]:
state_dict = torch.load(model_path, map_location=torch.device("cpu"))
print(state_dict.keys())

odict_keys(['features.0.0.weight', 'features.0.1.weight', 'features.0.1.bias', 'features.0.1.running_mean', 'features.0.1.running_var', 'features.0.1.num_batches_tracked', 'features.1.0.block.0.0.weight', 'features.1.0.block.0.1.weight', 'features.1.0.block.0.1.bias', 'features.1.0.block.0.1.running_mean', 'features.1.0.block.0.1.running_var', 'features.1.0.block.0.1.num_batches_tracked', 'features.1.0.block.1.fc1.weight', 'features.1.0.block.1.fc1.bias', 'features.1.0.block.1.fc2.weight', 'features.1.0.block.1.fc2.bias', 'features.1.0.block.2.0.weight', 'features.1.0.block.2.1.weight', 'features.1.0.block.2.1.bias', 'features.1.0.block.2.1.running_mean', 'features.1.0.block.2.1.running_var', 'features.1.0.block.2.1.num_batches_tracked', 'features.2.0.block.0.0.weight', 'features.2.0.block.0.1.weight', 'features.2.0.block.0.1.bias', 'features.2.0.block.0.1.running_mean', 'features.2.0.block.0.1.running_var', 'features.2.0.block.0.1.num_batches_tracked', 'features.2.0.block.1.0.weight',

  state_dict = torch.load(model_path, map_location=torch.device("cpu"))


In [15]:
import numpy as np
from PIL import Image
from torchvision import transforms

# Define transform for preprocessing
data_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

def extract_features(image_path, model, transform):
    """
    Extract feature embeddings from an image using the saved model.

    Args:
        image_path (str): Path to the image.
        model (torch.nn.Module): Loaded model for feature extraction.
        transform (callable): Data transform for preprocessing.

    Returns:
        numpy.ndarray: Feature embedding of the image.
    """
    device = next(model.parameters()).device
    image = Image.open(image_path).convert("RGB")
    image = transform(image).unsqueeze(0).to(device)  # Add batch dimension

    with torch.no_grad():
        features = model(image).squeeze().cpu().numpy()

    return features


In [16]:
from sklearn.metrics.pairwise import cosine_similarity

def find_most_similar(input_image_path, valid_images_folder, model, transform, threshold=0.8):
    """
    Compare an input image against valid images to find the most similar one.

    Args:
        input_image_path (str): Path to the input image.
        valid_images_folder (str): Path to the folder containing valid images.
        model (torch.nn.Module): Loaded model for feature extraction.
        transform (callable): Data transform for preprocessing.
        threshold (float): Similarity threshold for classification.

    Returns:
        str, float: Most similar valid image and similarity score, or "not similar".
    """
    # Extract features for the input image
    input_features = extract_features(input_image_path, model, transform)

    most_similar_image = None
    highest_similarity = -1

    # Loop through valid images
    for valid_image_name in os.listdir(valid_images_folder):
        valid_image_path = os.path.join(valid_images_folder, valid_image_name)
        
        # Skip non-image files
        if not valid_image_name.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.svg')):
            continue
        
        # Extract features for the valid image
        valid_features = extract_features(valid_image_path, model, transform)

        # Compute cosine similarity
        similarity = cosine_similarity([input_features], [valid_features])[0][0]

        # Update the most similar image if the current similarity is higher
        if similarity > highest_similarity:
            highest_similarity = similarity
            most_similar_image = valid_image_name

    # Decision based on threshold
    if highest_similarity >= threshold:
        return most_similar_image, highest_similarity
    else:
        return "not similar", highest_similarity


In [24]:
input_image_path = "/path/to/input/image.jpg"
# valid_images_folder = "/path/to/valid/images"
threshold = 0.8

img1 = 'test_2/302_29.jpg'

# valid_img = ['BM_LOGO-00.png' , 'BM_LOGO-01.png' ,  'BM_LOGO-02.png' , 'BM_LOGO-03.png' , 'BM_LOGO-04.png','BM_LOGO-05.png']
input_image_path = f"/home/mahdi/Phishing_Project/images/{img1}"

# valid_img_path = '/home/mahdi/Phishing_Project/Valid_images/'

# input_image_path = "/path/to/input/logo.jpg"
valid_images_folder = '/home/mahdi/Phishing_Project/Valid_images'



result, similarity_score = find_most_similar(input_image_path, valid_images_folder, model, data_transform, threshold)
if result != "not similar":
    print(f"Input image is similar to {result} with similarity score: {similarity_score:.4f}")
else:
    print(f"Input image is not similar to any valid image with similarity score: {similarity_score:.4f}.")
    

Input image is similar to BM_LOGO-03.png with similarity score: 0.9995
