In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
from PIL import Image
from torchvision import transforms
import torch.optim as optim
from torch.optim.lr_scheduler import ExponentialLR
import datetime
import matplotlib.pyplot as plt

# Load the labelled URL table. This cell relies on two columns: 'url' (the raw
# URL text) and 'status' (the integer label shown in the preview below).
txtData = pd.read_csv('phishing_urls.csv')
print(txtData.head())
print("\nShape:", txtData.shape)

urls = txtData['url']

# Convert through NumPy before building the tensor. Passing the Series itself
# makes torch walk it item by item through label-based indexing, which is slow
# and can fail once the index is no longer 0..n-1 (e.g. after filtering rows).
statuses = torch.tensor(txtData['status'].to_numpy())

                                                 url  status
0                mp3raid.com/music/krizz_kaliko.html       1
1                    bopsecrets.org/rexroth/cr/1.htm       1
2  http://buzzfil.net/m/show-art/ils-etaient-loin...       1
3      espn.go.com/nba/player/_/id/3457/brandon-rush       1
4     yourbittorrent.com/?q=anthony-hamilton-soulife       1

Shape: (188222, 2)


In [2]:
# Character vocabulary for the one-hot encoding: every character of this string
# is one column of an encoded URL. It holds 64 entries, which is the width of
# the 1 x 256 x 64 tensors printed further down.
# NOTE(review): '’', '|' and '^' each occur twice, so those characters set two
# columns at once, and there are no uppercase letters, so any character that is
# not listed here becomes an all-zero row. De-duplicating the string would
# change the tensor width and therefore the hard-coded input size of the CNN's
# first Linear layer -- change both together or not at all.
dictionary = 'abcdefghijklmnopqrstuvwxyz_0123456789-;.!?:/\\|#$%^&~’+=<>(),"’|^'

# Transform applied to the PIL image built from each encoded URL.
image_transform = transforms.Compose([
        transforms.ToTensor()  # PIL image -> float tensor scaled to [0, 1], channel first
    ])

def one_hot_encode(text, dictionary, target_length):
    """One-hot encode ``text`` into a fixed-size, single-channel float tensor.

    Row ``i`` describes ``text[i]``; column ``j`` is 1.0 when that character
    equals ``dictionary[j]`` and 0.0 otherwise. A character that appears more
    than once in ``dictionary`` sets every matching column, and a character
    that is absent from it yields an all-zero row.

    Args:
        text: String to encode (a URL in this notebook).
        dictionary: String (or sequence of single characters) defining the
            columns of the encoding.
        target_length: Number of rows in the result. Longer inputs are
            truncated to this many characters; shorter inputs (including the
            empty string) are zero-padded at the end.

    Returns:
        A ``torch.float32`` tensor of shape
        ``(1, target_length, len(dictionary))`` holding only 0.0 and 1.0.
    """
    width = len(dictionary)
    encoded = np.zeros((target_length, width), dtype=np.float32)

    # Truncate along the *length* axis only; the dictionary axis always keeps
    # its full width regardless of target_length.
    clipped = text[:target_length]
    if clipped and width:
        chars = np.array(list(clipped))
        vocab = np.array(list(dictionary))
        # Broadcasting compares every character with every dictionary entry,
        # giving a (len(clipped), width) boolean mask that becomes 0.0 / 1.0.
        encoded[:len(clipped)] = chars[:, None] == vocab[None, :]

    # Built directly instead of via an 8-bit PIL image: a 0/255 greyscale image
    # passed through ToTensor() yields exactly these values, dtype and
    # (channel, height, width) layout.
    return torch.from_numpy(encoded).unsqueeze(0)

# Encode every URL up front as a fixed-length (256-character) one-hot tensor.
# NOTE(review): all encodings are held in memory at the same time -- confirm
# that this fits in RAM for the full dataset.
one_hot_urls = [one_hot_encode(raw_url, dictionary, 256) for raw_url in urls]

# Pair each encoded URL with its label to form (input, target) samples.
url_set = list(zip(one_hot_urls, statuses))

In [3]:
# Sanity check: show one encoded sample, its label, and the encoded shape.
print("URL: ", url_set[1][0])
print("Status: ", url_set[1][1])
print(url_set[0][0].shape)

batch_size = 128

# 85% train / 5% test / remaining ~10% validation.
train_size = int(0.85 * len(url_set))
test_size = int(0.05 * len(url_set))
valid_size = len(url_set) - train_size - test_size

# Seed the split so the same samples land in each subset on every run.
# Without this, restarting the kernel and evaluating previously saved weights
# would score them on a "test" subset that overlaps the old training data.
split_generator = torch.Generator().manual_seed(42)
train_set, test_set, valid_set = torch.utils.data.random_split(
    url_set, [train_size, test_size, valid_size], generator=split_generator)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
# Evaluation loaders are not shuffled: ordering adds nothing when only
# measuring loss/accuracy, and a fixed order keeps evaluation deterministic.
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=False)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=False)
print("# of batches:", len(train_loader))

URL:  tensor([[[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
Status:  tensor(1)
torch.Size([1, 256, 64])
# of batches: 1250


In [4]:
import torch.nn as nn

class CNN(nn.Module):
    """Small convolutional binary classifier over one-hot encoded URLs.

    The default ``encoder`` is three Conv2d -> ReLU -> MaxPool2d stages
    (1 -> 8 -> 16 -> 32 channels). The default ``classifier`` flattens nothing
    itself; ``forward`` flattens the encoder output and the classifier maps the
    32*26*2 features through Linear(…, 2) -> Linear(2, 1) -> Sigmoid, so the
    model outputs one probability per sample. The 32*26*2 input size matches
    the encoder output for the 1 x 256 x 64 inputs used in this notebook.

    Args:
        encoder: Optional replacement feature extractor. When given it is used
            as-is (its weights are left untouched).
        classifier: Optional replacement head. When given it is used as-is and
            must accept the flattened output of the encoder.
    """

    def __init__(self, encoder=None, classifier=None):
        super().__init__()

        # Honour caller-supplied modules; fall back to the default architecture.
        self.encoder = encoder if encoder is not None else self._default_encoder()
        self.classifier = classifier if classifier is not None else self._default_classifier()

        # Only re-initialise modules built here, so that a supplied (possibly
        # pretrained) module keeps the weights it arrived with.
        if encoder is None:
            self.init_encoder_weights(mean=0.0, std=0.01)
        if classifier is None:
            self.init_classifier_weights(mean=0.0, std=0.01)

    @staticmethod
    def _default_encoder():
        """Build the default convolutional feature extractor."""
        return nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=(4, 4), stride=(2, 2)),
            nn.ReLU(),
            nn.MaxPool2d(2, 1),
            nn.Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2)),
            nn.ReLU(),
            nn.MaxPool2d(2, 1),
            nn.Conv2d(16, 32, kernel_size=(2, 2), stride=(2, 2), padding=0),
            nn.ReLU(),
            nn.MaxPool2d(5, 1),
        )

    @staticmethod
    def _default_classifier():
        """Build the default head: flattened features -> one sigmoid probability."""
        return nn.Sequential(
            nn.Linear(32*26*2, 2),
            nn.Linear(2, 1),
            nn.Sigmoid(),
        )

    def init_encoder_weights(self, mean, std):
        """Draw every encoder parameter (weights and biases) from N(mean, std)."""
        for param in self.encoder.parameters():
            nn.init.normal_(param, mean=mean, std=std)

    def init_classifier_weights(self, mean, std):
        """Draw every classifier parameter (weights and biases) from N(mean, std)."""
        for param in self.classifier.parameters():
            nn.init.normal_(param, mean=mean, std=std)

    def forward(self, x):
        """Return the classifier output for a batch ``x``, shape (batch, 1) by default."""
        x = self.encoder(x)
        # Flatten everything except the batch dimension; reshape (unlike view)
        # also copes with a non-contiguous output from a custom encoder.
        x = x.reshape(x.size(0), -1)
        return self.classifier(x)

In [5]:
def train(n_epochs, optimizer, model, scheduler, loss_fn, train_loader, device,
          save_classifier_path, save_encoder_path, save_plot_path, valid_loader=None):
    """Train ``model``, validate after every epoch, then save weights and a loss plot.

    Args:
        n_epochs: Number of passes over ``train_loader``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        model: Module exposing ``encoder`` and ``classifier`` sub-modules (their
            state dicts are saved separately). Its output is squeezed on dim 1
            before being handed to ``loss_fn``.
        scheduler: Learning-rate scheduler, stepped once per epoch.
        loss_fn: Callable ``loss_fn(predictions, float_targets)`` returning a
            scalar tensor.
        train_loader: Iterable of ``(inputs, labels)`` batches supporting ``len()``.
        device: Device the batches are moved to.
        save_classifier_path: Destination of ``model.classifier``'s state dict.
        save_encoder_path: Destination of ``model.encoder``'s state dict.
        save_plot_path: Destination of the training/validation loss figure.
        valid_loader: Iterable of validation batches supporting ``len()``. When
            omitted, the module-level ``valid_loader`` is used, which is what
            this function always read before the parameter existed.

    Raises:
        ValueError: If no validation loader is passed and none exists at
            module level.
    """
    if valid_loader is None:
        # Backward compatibility with calls that rely on the notebook-level
        # variable. The parameter shadows that name, hence the globals() lookup.
        valid_loader = globals().get('valid_loader')
        if valid_loader is None:
            raise ValueError(
                "No validation data: pass valid_loader=... or define a "
                "module-level 'valid_loader' before calling train().")

    print("training...")

    avg_loss = []       # mean training loss per epoch
    losses_valid = []   # mean validation loss per epoch
    epochs = []         # x-axis values for the plot

    for epoch in range(1, n_epochs + 1):
        print('Epoch', epoch)

        # Training mode (affects layers such as dropout / batch norm).
        model.train()
        loss_train = 0.0

        for urls, labels in train_loader:
            urls = urls.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            outputs = model(urls)
            # Model output is (batch, 1); the loss expects (batch,) float targets.
            loss = loss_fn(outputs.squeeze(1), labels.float())
            loss.backward()
            optimizer.step()
            loss_train += loss.item()

        epoch_train_loss = loss_train / len(train_loader)
        avg_loss.append(epoch_train_loss)

        # Validation pass: evaluation mode, no gradient tracking.
        model.eval()
        loss_valid = 0.0
        with torch.no_grad():
            for urls, labels in valid_loader:
                urls = urls.to(device)
                labels = labels.to(device)
                outputs = model(urls)
                loss_valid += loss_fn(outputs.squeeze(1), labels.float()).item()

        epoch_valid_loss = loss_valid / len(valid_loader)
        losses_valid.append(epoch_valid_loss)

        scheduler.step()
        epochs.append(epoch)

        print('{} Epoch {}, Training loss {}, Validation loss {}'.format(datetime.datetime.now(), epoch,
                                                                         epoch_train_loss,
                                                                         epoch_valid_loss))

    # The two halves are stored separately so they can be reloaded independently.
    torch.save(model.classifier.state_dict(), save_classifier_path)
    torch.save(model.encoder.state_dict(), save_encoder_path)

    # Plot training and validation loss over epochs.
    plt.plot(epochs, avg_loss, label='Training Loss', color='blue')
    plt.plot(epochs, losses_valid, label='Validation Loss', color='red')
    plt.legend()
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.title('Training and Validation Loss')
    plt.savefig(save_plot_path)
    plt.show()
    plt.close()

In [None]:
# Training configuration (the data loaders were built with a batch size of 128).
n_epochs = 60
loss_fn = nn.BCELoss()
save_classifier_path = 'CNNclassifier.pth'
save_encoder_path = 'CNNencoder.pth'
save_plot_path = 'loss.CNN.png'

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

model = CNN().to(device)

# Adam with a per-epoch exponential learning-rate decay.
# Alternative kept for reference: SGD(lr=0.0005, weight_decay=0.0005, momentum=0.9).
optimizer = optim.Adam(model.parameters(), lr=0.001)
scheduler = ExponentialLR(optimizer=optimizer, gamma=0.9)

train(
    n_epochs=n_epochs,
    optimizer=optimizer,
    model=model,
    scheduler=scheduler,
    loss_fn=loss_fn,
    train_loader=train_loader,
    device=device,
    save_classifier_path=save_classifier_path,
    save_encoder_path=save_encoder_path,
    save_plot_path=save_plot_path,
)

In [72]:
from sklearn.metrics import confusion_matrix

def test(model, test_loader, device, loss_fn):
    print("testing...")

    model.eval()  # Set the model to evaluation mode
    loss_test = 0.0
    correct_predictions = 0
    total_samples = 0
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for urls, labels in test_loader:
            urls = urls.to(device)
            labels = labels.to(device)
            outputs = model(urls)
            outputs = outputs.squeeze(1)
            loss = loss_fn(outputs, labels.float())
            loss_test += loss.item()

            # Calculate accuracy
            predictions = torch.round(outputs)
            correct_predictions += (predictions == labels).sum().item()
            total_samples += labels.size(0)

            # Collect predictions and targets
            all_predictions.extend(predictions.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_test_loss = loss_test / len(test_loader)
    accuracy = correct_predictions / total_samples

    cm = confusion_matrix(all_labels, all_predictions)
    print("Confusion Matrix:")
    print(cm)

    print('{} Test Loss: {}, Accuracy: {:.2%}'.format(datetime.datetime.now(), avg_test_loss, accuracy))
    
    return all_predictions, all_labels



In [73]:
# Load the weights from the same relative locations the training cell saves
# them to, rather than an absolute Kaggle-only directory, so the notebook also
# runs outside /kaggle/working.
encoder_path = 'CNNencoder.pth'
classifier_path = 'CNNclassifier.pth'

test_model = CNN()

# map_location places the tensors on the active device even if the weights
# were saved from a different one (e.g. trained on GPU, evaluated on CPU).
encoder_weights = torch.load(encoder_path, map_location=device)
classifier_weights = torch.load(classifier_path, map_location=device)

test_model.to(device)

test_model.encoder.load_state_dict(encoder_weights)
test_model.classifier.load_state_dict(classifier_weights)

preds, labels = test(test_model, test_loader, device, loss_fn)

testing...
Confusion Matrix:
[[4416  394]
 [ 178 4423]]
2023-11-30 02:32:58.110859 Test Loss: 0.1669419023233491, Accuracy: 93.92%


In [71]:
# Eyeball the first 20 (prediction, label) pairs from the test run.
for row in range(20):
    prediction, target = preds[row], labels[row]
    print(prediction, target)

0.0 0
0.0 0
0.0 0
1.0 1
1.0 1
0.0 0
1.0 1
1.0 1
0.0 0
0.0 0
0.0 0
1.0 1
1.0 1
1.0 1
1.0 1
1.0 1
0.0 0
0.0 0
1.0 1
1.0 1
