In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split

import re
from sklearn.utils import shuffle
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/drive')

# Loading Dataset
# Rows that pandas cannot parse are skipped instead of aborting the load.
# NOTE(review): the saved cell output shows this path did not exist on the last run --
# confirm the folder/file name against your own Drive layout.
dataset_csv = '/content/drive/MyDrive/NMA Deep learning Project/fake_job_postings_processed.csv'
df = pd.read_csv(dataset_csv, on_bad_lines='skip')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/NMA Deep learning Project/fake_job_postings_processed.csv'

In [None]:
# Split the postings by label so each class can be sampled independently.
fake = df[df["fraudulent"] == 1]
real = df[df["fraudulent"] == 0]

# Fixed seed: without it every re-run draws a different subset, so the embeddings,
# the train/validation split and all reported metrics would change between runs.
SAMPLE_SEED = 0

# We undersample the majority class, this gives us less data but we don't have that much computational power anyways
# Use the size of whichever class is smaller, so the cell also works if the class
# balance of the input file is ever reversed.
n_minority = min(len(fake), len(real))
fake_sample = fake[["fraudulent", "text_processed"]].sample(n_minority, random_state=SAMPLE_SEED)
real_sample = real[["fraudulent", "text_processed"]].sample(n_minority, random_state=SAMPLE_SEED)

# Balanced dataset: n_minority rows of each class (fake rows first, then real rows).
sample = pd.concat((fake_sample, real_sample))

# Parallel containers consumed by the embedding and training cells:
# `text` is a plain list of documents, `labels` an integer numpy array (1 = fraudulent).
text = sample["text_processed"].tolist()
labels = sample["fraudulent"].astype(int).to_numpy()

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

In [None]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# The encoder gets its own name: a later cell binds `model` to the LSTM classifier,
# and `generate_embeddings` would silently pick that one up if it read `model`.
# If GPU memory is tight, `del bert_model` once the embeddings have been computed.
bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
bert_model.eval()  # inference only: make sure dropout is switched off


def generate_embeddings(text: str, max_length: int = 300) -> torch.Tensor:
    """Return per-token BERT embeddings for one document, with padding zeroed out.

    Args:
        text: The document to embed.
        max_length: Number of tokens the input is truncated/padded to.

    Returns:
        The encoder's last hidden state with the leading batch dimension squeezed
        away, i.e. one row per token position (``max_length`` rows for a single
        string). Rows belonging to padding tokens are multiplied by 0.
        The tensor lives on ``device``.
    """
    # Tokenize the text; every document is padded/truncated to the same length so the
    # results can later be stacked into one array.
    inputs = tokenizer(
        text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Generate embeddings (no gradients needed: BERT is used as a frozen feature extractor)
    with torch.no_grad():
        outputs = bert_model(**inputs)

    token_embeddings = outputs.last_hidden_state
    # Broadcast the (batch, seq) mask to (batch, seq, hidden) so padded positions become 0.
    attention_mask = inputs['attention_mask'].unsqueeze(-1).expand(token_embeddings.size())
    return (token_embeddings * attention_mask).squeeze(0)

In [None]:
# Embed each document on its own, move the result to host memory, and stack everything
# into a single numpy array with one leading entry per document.
embeddings = np.array(
    list(map(lambda doc: generate_embeddings(doc).cpu().numpy(), text))
)

In [None]:
# Sanity check: the first dimension of both shapes should match (one label per document).
(embeddings.shape, labels.shape)

In [None]:
# Hold out 20% of the samples for validation; stratifying keeps the class ratio
# the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=0,
)

# Wrap the numpy splits as (features, label) tensor datasets.
train_data = TensorDataset(*(torch.from_numpy(part) for part in (X_train, y_train)))
val_data = TensorDataset(*(torch.from_numpy(part) for part in (X_test, y_test)))

# Batch size (this is an important hyperparameter)
batch_size = 40

# dataloaders
# Both loaders shuffle and drop the final incomplete batch, so every batch
# has exactly `batch_size` samples.
loader_options = dict(batch_size=batch_size, shuffle=True, drop_last=True)
train_loader = DataLoader(train_data, **loader_options)
val_loader = DataLoader(val_data, **loader_options)

In [None]:
class JobRNN(nn.Module):
    """LSTM classifier over sequences of embedding vectors.

    The input sequence is run through a (possibly stacked) LSTM, the LSTM outputs are
    averaged over the time dimension, and the pooled vector goes through dropout, a
    linear layer and a sigmoid, yielding values in [0, 1].

    Args:
        n_layers: Number of stacked LSTM layers.
        hidden_dim: Size of the LSTM hidden state.
        embedding_dim: Size of each input vector.
        output_dim: Number of sigmoid outputs per sample.
        drop_prob: Dropout probability, used both between LSTM layers and before
            the linear layer.
        device: Stored on ``self.device``. When omitted it resolves to ``'cuda'`` if
            available, else ``'cpu'`` -- the same value the notebook's global
            ``device`` holds -- without depending on that global existing when the
            class is defined.
    """

    def __init__(self, n_layers, hidden_dim, embedding_dim, output_dim, drop_prob=0.1, device=None):
        super().__init__()

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.drop_prob = drop_prob
        self.device = device

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_dim,
            num_layers=self.n_layers,
            batch_first=True,
            # nn.LSTM applies this dropout only *between* layers; with a single layer
            # it has no effect and just triggers a warning, so pass 0 in that case.
            dropout=self.drop_prob if self.n_layers > 1 else 0.0,
        )

        self.dropout = nn.Dropout(p=self.drop_prob)

        self.fc = nn.Linear(self.hidden_dim, self.output_dim)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden=None):
        """Run the classifier.

        Args:
            x: Either ``[batch, seq_len, embedding_dim]`` or ``[batch, embedding_dim]``
                (treated as a sequence of length 1).
            hidden: Optional ``(h0, c0)`` tuple as returned by ``init_hidden``;
                ``None`` lets the LSTM start from zeros.

        Returns:
            ``(out, hidden)`` where ``out`` is ``[batch, output_dim]`` with values in
            [0, 1] and ``hidden`` is the LSTM's final ``(h_n, c_n)``.
        """
        if x.dim() == 2:  # We must be working with single vector embeddings
            x = x.unsqueeze(1)
        # x: [batch_size, seq_len, embedding_dim]

        lstm_out, hidden = self.lstm(x, hidden)
        # lstm_out: [batch_size, seq_len, hidden_dim]

        # Mean-pool over the time dimension -> [batch_size, hidden_dim]
        pooled = lstm_out.mean(dim=1)

        out = self.dropout(pooled)
        out = self.fc(out)
        out = self.sig(out)

        return out, hidden

    def init_hidden(self, batch_size):
        """Return a zero ``(h0, c0)`` pair, each ``[n_layers, batch_size, hidden_dim]``.

        The tensors are created with the same device and dtype as the model's own
        parameters, so they stay valid after the model has been moved with ``.to(...)``.
        """
        reference = next(self.parameters())
        shape = (self.n_layers, batch_size, self.hidden_dim)
        h0 = reference.new_zeros(shape)
        c0 = reference.new_zeros(shape)

        return (h0, c0)

# Build the classifier: two stacked LSTM layers over 768-dimensional input vectors,
# a single sigmoid output, and fairly heavy dropout.
model = JobRNN(
    n_layers=2,
    hidden_dim=60,
    embedding_dim=768,
    output_dim=1,
    drop_prob=0.6,
)
model = model.to(device)  # nn.Module.to moves the parameters in place and returns the module
print(model)

In [None]:
# Binary cross-entropy on probabilities: the model already applies a sigmoid.
criterion = nn.BCELoss()

# Adam over all classifier parameters with a fixed learning rate.
lr = 0.01
optim = torch.optim.Adam(params=model.parameters(), lr=lr)



def acc(pred, label):
    """Count how many rounded predictions equal their labels.

    Both tensors are squeezed first, so a ``[batch, 1]`` prediction lines up with a
    ``[batch]`` label. Returns the number of matches as a Python number (a count,
    not a fraction).
    """
    hard_pred = pred.squeeze().round()
    matches = torch.eq(hard_pred, label.squeeze())
    return matches.sum().item()


epochs = 20
clip = 5  # maximum for |gradient|; only relevant if gradient clipping below is re-enabled
valid_loss_min = np.inf  # lowest validation loss seen so far

# Per-epoch history, one entry appended per epoch.
epoch_tr_loss, epoch_vl_loss = [], []
epoch_tr_acc, epoch_vl_acc = [], []


for epoch in range(epochs):
    # ---- training pass ----
    train_losses = []
    train_acc = 0.0
    train_seen = 0  # samples actually processed (the loader may drop a partial batch)
    model.train()

    # The batch variable is named `targets`, not `labels`, so the full label array
    # built earlier in the notebook is not overwritten by the last batch.
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        # Fresh zero state sized to this batch (robust to a smaller final batch).
        h = model.init_hidden(inputs.size(0))

        model.zero_grad()

        # Call the module itself (not .forward) so nn.Module hooks are honoured.
        output, h = model(inputs, h)
        # squeeze(-1) keeps the batch dimension even when the batch has one sample.
        loss = criterion(output.squeeze(-1), targets.float())

        loss.backward()
        train_losses.append(loss.item())

        train_acc += acc(output, targets)
        train_seen += inputs.size(0)

        # Gradient clipping is currently disabled; to enable it, uncomment:
        # nn.utils.clip_grad_norm_(model.parameters(), clip)
        optim.step()

    # ---- validation pass ----
    val_losses = []
    val_acc = 0.0
    val_seen = 0
    model.eval()

    # No gradients are needed for evaluation; this avoids building the autograd graph.
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            val_h = model.init_hidden(inputs.size(0))
            output, val_h = model(inputs, val_h)
            val_loss = criterion(output.squeeze(-1), targets.float())
            val_losses.append(val_loss.item())
            val_acc += acc(output, targets)
            val_seen += inputs.size(0)

    # ---- epoch summary ----
    epoch_train_loss = np.mean(train_losses)
    epoch_val_loss = np.mean(val_losses)
    # Divide by the number of samples that were actually evaluated; dividing by the
    # dataset size under-reports accuracy whenever the loader drops a partial batch.
    epoch_train_acc = train_acc / max(train_seen, 1)
    epoch_val_acc = val_acc / max(val_seen, 1)
    epoch_tr_loss.append(epoch_train_loss)
    epoch_vl_loss.append(epoch_val_loss)
    epoch_tr_acc.append(epoch_train_acc)
    epoch_vl_acc.append(epoch_val_acc)
    print(f"Epoch {epoch+1}")
    print(f"train_loss : {epoch_train_loss} val_loss : {epoch_val_loss}")
    print(f"train_accuracy : {epoch_train_acc*100} val_accuracy : {epoch_val_acc*100}")
    if epoch_val_loss <= valid_loss_min:
        print(
            "Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...".format(
                valid_loss_min, epoch_val_loss
            )
        )
        # Checkpointing is currently disabled; to persist the best weights, uncomment:
        # torch.save(model.state_dict(), '../working/state_dict.pt')
        # Only update the running minimum on an improvement, so it tracks the best
        # epoch rather than merely the previous one.
        valid_loss_min = epoch_val_loss

    print("=="*25)

In [None]:
# Summarise the run: accuracy averaged over all epochs (not the final-epoch accuracy).
mean_train_acc_pct = np.mean(epoch_tr_acc) * 100
mean_val_acc_pct = np.mean(epoch_vl_acc) * 100
print(f"\nAverage Training Accuracy over {epochs} epochs: {mean_train_acc_pct:.2f}%")
print(f"Average Validation Accuracy over {epochs} epochs: {mean_val_acc_pct:.2f}%")