## Pretrain a model that classifies fraudulent emails

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn

In [14]:
# Load the saved Kaggle labels and hidden states from disk; `.float()` casts both
# tensors to float32, the dtype the classifier and BCELoss are fed with below.
labels_pt = torch.from_numpy(
    np.array(torch.load('kaggle_labels.pt'))
).float()
kaggle_pt = torch.load('kaggle_hidden_states.pt').float()
print(f"kaggle_pt.shape: {kaggle_pt.shape}, labels_pt.shape: {labels_pt.shape}")

kaggle_pt.shape: torch.Size([11928, 64, 768]), labels_pt.shape: torch.Size([11928])


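`kaggle_pt` holds the hidden states that the pre-trained BERT model produced for the Kaggle emails.

It has shape `[n, 64, 768]`: `n` emails, 64 token positions per email, and a 768-dimensional hidden vector per token.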

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

class MLP(nn.Module):
    """Binary classifier: one dilated Conv1d + max-pool, then a 3-layer MLP head.

    The input is a float tensor of shape ``(batch, in_channels, seq_len)``. With
    the defaults this matches the ``[n, 64, 768]`` hidden-state tensors used in
    this notebook (dim 1 is fed to the convolution as channels, dim 2 is the
    axis the convolution slides over).

    Args:
        dropout_rate: Dropout probability applied after each hidden linear layer.
        in_channels: Size of dim 1 of the input (default 64).
        seq_len: Size of dim 2 of the input (default 768).

    Raises:
        ValueError: If ``seq_len`` is too short to survive the convolution and
            the pooling step.

    ``forward`` returns a tensor of shape ``(batch,)`` holding sigmoid
    probabilities, i.e. values suitable for ``nn.BCELoss``.
    """

    def __init__(self, dropout_rate=0.2, in_channels=64, seq_len=768):
        super().__init__()
        kernel_size, dilation, pool_size = 3, 2, 2
        # Unpadded, stride-1 convolution shortens the axis by dilation * (kernel - 1);
        # max-pooling with kernel == stride == 2 then halves it (rounding down).
        pooled_len = (seq_len - dilation * (kernel_size - 1)) // pool_size
        if pooled_len < 1:
            raise ValueError(
                f"seq_len={seq_len} is too short for the convolution and pooling layers"
            )

        self.conv1 = nn.Conv1d(
            in_channels=in_channels,
            out_channels=32,
            kernel_size=kernel_size,
            dilation=dilation,
        )
        # With the default seq_len=768 this is 32 * 382 input features.
        self.fc1 = nn.Linear(32 * pooled_len, 512)
        self.bn1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.fc2 = nn.Linear(512, 64)
        self.bn2 = nn.BatchNorm1d(64)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.fc3 = nn.Linear(64, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map ``(batch, in_channels, seq_len)`` inputs to ``(batch,)`` probabilities."""
        x = self.conv1(x)
        x = F.relu(x)
        x = F.max_pool1d(x, kernel_size=2)
        # Flatten everything except the batch axis. Unlike ``view(-1, features)``,
        # this can never fold extra features into the batch dimension: an input
        # of the wrong length fails loudly in ``fc1`` instead of silently
        # producing a different number of predictions.
        x = x.flatten(start_dim=1)
        x = self.fc1(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.dropout1(x)
        x = self.fc2(x)
        x = self.bn2(x)
        x = F.relu(x)
        x = self.dropout2(x)
        output = self.fc3(x)
        output = self.sigmoid(output)
        return output.squeeze(dim=1)

In [10]:
# Prefer the GPU used for the original experiments, but fall back to the CPU when
# that device index does not exist so the notebook still runs on other machines.
# `torch.cuda.device_count()` is 0 when CUDA is unavailable.
cuda_device = "cuda:3" if torch.cuda.device_count() > 3 else "cpu"
# define your model
model = MLP().to(cuda_device)
# define your loss function
# BCELoss expects probabilities; MLP.forward already applies the sigmoid.
criterion = nn.BCELoss()
# define your optimizer
optimizer = optim.Adam(model.parameters(), lr=1e-4)

In [26]:
# kaggle_pt[:32].shape, labels_pt[:32].shape, model(kaggle_pt[:32]).shape
# loss = criterion(model(kaggle_pt[:32]), labels_pt[:32].float())
# loss.item()

In [29]:
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
def _evaluate(model, inputs, targets, criterion, batch_size, device):
    """Score ``inputs`` in eval mode; return ``(mean loss, predictions as a NumPy array)``.

    The loss is averaged per sample, which assumes ``criterion`` returns the mean
    over the batch it is given (true for the default ``nn.BCELoss()``).
    """
    model.eval()  # disable Dropout and use BatchNorm running statistics
    total_loss = 0.0
    batch_preds = []
    with torch.no_grad():
        for start in range(0, inputs.shape[0], batch_size):
            batch_inputs = inputs[start:start + batch_size].to(device)
            batch_targets = targets[start:start + batch_size].to(device)
            outputs = model(batch_inputs)
            total_loss += criterion(outputs, batch_targets).item() * batch_inputs.shape[0]
            batch_preds.append(outputs.cpu())
    return total_loss / inputs.shape[0], torch.cat(batch_preds).numpy()


def train(model, inputs, labels, optimizer, criterion, epochs, batch_size,
          test_size=0.7, random_state=12):
    """Train ``model`` and print loss / F1 / AUC metrics after every epoch.

    Args:
        model: Module mapping a batch of ``inputs`` to per-sample probabilities.
        inputs: Feature tensor; the first dimension indexes samples.
        labels: Tensor of binary targets aligned with ``inputs``.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss taking ``(outputs, targets)``, e.g. ``nn.BCELoss()``.
        epochs: Number of passes over the training split.
        batch_size: Mini-batch size used for both training and evaluation.
        test_size: Fraction of the data held out for validation. The default of
            0.7 keeps the notebook's original split, which trains on only 30%
            of the samples.
        random_state: Seed for the train/validation split.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Data is moved to the device that holds the model's parameters. The model is
    left in eval mode when the function returns. Nothing is returned; metrics
    are written through ``tqdm.write``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # split data into training and validation sets
    X_train, X_val, y_train, y_val = train_test_split(
        inputs, labels, test_size=test_size, random_state=random_state
    )

    # Follow the model's own device instead of relying on a notebook-level global.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    y_train_np = y_train.cpu().numpy()
    y_val_np = y_val.cpu().numpy()
    num_train_samples = X_train.shape[0]

    for epoch in tqdm(range(epochs), desc="Epochs", ascii=True):
        model.train()  # enable Dropout / BatchNorm batch statistics for the update steps
        running_loss = 0.0
        samples_seen = 0

        # loop over training batches (the final, possibly smaller, batch is included)
        for start in range(0, num_train_samples, batch_size):
            batch_inputs = X_train[start:start + batch_size].to(device)
            batch_labels = y_train[start:start + batch_size].to(device)
            current_size = batch_inputs.shape[0]
            if current_size < 2:
                # BatchNorm1d cannot compute batch statistics from a single
                # sample in training mode, so a one-sample tail batch is skipped.
                continue

            optimizer.zero_grad()
            outputs = model(batch_inputs)
            loss = criterion(outputs, batch_labels)
            # A fresh graph is built on every forward pass, so there is nothing to retain.
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * current_size
            samples_seen += current_size

        train_loss = running_loss / samples_seen if samples_seen else float("nan")

        # Evaluate both splits in eval mode, batch by batch, so that neither split
        # has to fit on the device at once and BatchNorm statistics stay untouched.
        val_loss, val_preds = _evaluate(model, X_val, y_val, criterion, batch_size, device)
        _, train_preds = _evaluate(model, X_train, y_train, criterion, batch_size, device)

        train_f1 = f1_score(y_train_np, (train_preds >= 0.5).astype(int))
        val_f1 = f1_score(y_val_np, (val_preds >= 0.5).astype(int))
        val_auc_roc = roc_auc_score(y_val_np, val_preds)
        val_auc_pr = average_precision_score(y_val_np, val_preds)

        tqdm.write(
            f"Epoch: {epoch+1}, Train BCE Loss: {train_loss:.2f}, "
            + f"Val BCE Loss: {val_loss:.2f}, F1 Score: {train_f1:.2f}/{val_f1:.2f}, "
            + f"AUC ROC: {val_auc_roc:.2f}, AUC PR: {val_auc_pr:.2f}"
        )


In [None]:
# train your model for 100 epochs
# NOTE(review): the fine-tuning section below loads 'pre-trained model with kaggle.pt',
# but no visible cell saves `model` after this call — confirm where that file is written.
train(model, kaggle_pt, labels_pt, optimizer, criterion, epochs=100, batch_size=256)

In [10]:
# Sanity check: features and labels should agree on the first (sample) dimension.
kaggle_pt.shape, labels_pt.shape

(torch.Size([11928, 64, 768]), (11928,))

## Now we fine-tune the pre-trained model on the personal emails

In [15]:
import torch
# Prefer the GPU used for the original experiments, but fall back to the CPU when
# that device index does not exist (`device_count()` is 0 without CUDA).
cuda_device = "cuda:3" if torch.cuda.device_count() > 3 else "cpu"
# SECURITY NOTE: this file is used below as a whole pickled module, and unpickling
# can execute arbitrary code — only load checkpoints from a trusted source.
# NOTE(review): presumably this is the `MLP` defined above, whose class must exist in
# the session before loading; recent PyTorch releases may also require
# `weights_only=False` for whole-model pickles — confirm against the installed version.
# `map_location` remaps tensors saved on another device onto `cuda_device`.
pre_trained_model = torch.load('pre-trained model with kaggle.pt', map_location=cuda_device)
# define your model
pre_trained_model = pre_trained_model.to(cuda_device)
# define your loss function
criterion = nn.BCELoss()
# define your optimizer
optimizer = optim.Adam(pre_trained_model.parameters(), lr=1e-4)

In [16]:
# Load the saved personal-email labels and hidden states, cast to float32.
# Note that `labels_pt` is rebound here: from this cell on it refers to the
# personal labels, no longer to the Kaggle labels loaded earlier.
labels_pt = torch.from_numpy(
    np.array(torch.load('personal_labels.pt'))
).float()
personal_pt = torch.load('personal_hidden_states.pt').float()
print(f"personal_pt.shape: {personal_pt.shape}, labels_pt.shape: {labels_pt.shape}")

personal_pt.shape: torch.Size([251, 64, 768]), labels_pt.shape: torch.Size([251])


In [1]:
from transformers import BertTokenizer
import numpy as np
import torch
from transformers import BertModel

def encode_text(text_np, max_length=64, batch_size=32):
    """Encode sentences into ``bert-base-uncased`` last-layer hidden states.

    Args:
        text_np: Iterable of strings (e.g. a NumPy array or list of sentences).
        max_length: Every sentence is padded or truncated to this many tokens,
            including the '[CLS]' and '[SEP]' special tokens.
        batch_size: Number of sentences pushed through BERT per forward pass;
            bounds peak memory without changing the result.

    Returns:
        A tensor of shape ``(num_sentences, max_length, hidden_size)`` that does
        not track gradients. An empty input yields a tensor with zero rows.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    sentences = list(text_np)

    # Load the BERT tokenizer and model
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    model.eval()  # feature extraction only: make sure dropout is disabled

    if not sentences:
        return torch.empty(0, max_length, model.config.hidden_size)

    hidden_states = []
    # BERT is used as a frozen feature extractor, so no autograd graph is needed.
    # Without no_grad() the returned features would keep the whole BERT graph
    # alive and carry it into any downstream training loop.
    with torch.no_grad():
        for start in range(0, len(sentences), batch_size):
            encoded = tokenizer(
                sentences[start:start + batch_size],
                add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
                max_length=max_length,         # Pad & truncate all sentences.
                padding='max_length',
                truncation=True,
                return_attention_mask=True,    # Differentiates padding from non-padding.
                return_tensors='pt',           # Return pytorch tensors.
            )
            outputs = model(
                input_ids=encoded['input_ids'],
                attention_mask=encoded['attention_mask'],
            )
            hidden_states.append(outputs.last_hidden_state)

    return torch.cat(hidden_states, dim=0)

In [30]:
# Fine-tune the loaded model on the personal data; `optimizer` and `criterion` are the
# notebook-level objects created for `pre_trained_model` in the setup cell above.
# NOTE(review): the recorded log shows perfect validation metrics from the very first
# epoch on a 251-sample dataset — worth verifying that this is not an artifact.
train(pre_trained_model, personal_pt, labels_pt, optimizer, criterion, epochs=15, batch_size=8)

Epochs:   0%|          | 0/15 [00:00<?, ?it/s]

Epoch: 1, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 2, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 3, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 4, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 5, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 6, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 7, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 8, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 9, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AUC PR: 1.00
Epoch: 10, Train BCE Loss: 0.00, Val BCE Loss: 0.00, F1 Score: 1.00/1.00, AUC ROC: 1.00, AU

In [19]:
# Reload the raw Kaggle labels (the name `labels_pt` now holds the personal labels)
# so their class balance can be inspected in the next cell.
asd = torch.load('kaggle_labels.pt')

In [21]:
# (label sum, sample count, label mean) for the Kaggle set — assuming 0/1 labels,
# these are the positive count and the positive fraction.
sum(asd), len(asd), sum(asd)/len(asd)

(5186, 11928, 0.4347753185781355)

In [31]:
# (label sum, sample count, label mean) for the labels currently bound to `labels_pt`,
# computed with the tensor's own reduction instead of element-wise Python summation.
labels_pt.sum(), len(labels_pt), labels_pt.sum() / len(labels_pt)

(tensor(135.), 251, tensor(0.5378))