<a href="https://colab.research.google.com/github/MPdeM/Final_Project/blob/master/nlp_bert_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Install PyTorch BERT Package / Import Dependencies**

In [None]:
pip install pytorch-pretrained-bert

In [1]:
# Dependencies
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings("ignore")

Using TensorFlow backend.


## **Load Data**

In [2]:
# Fetch the cleaned job-postings CSV over HTTPS and parse it into a DataFrame.
url = "https://job-postings-dataviz.s3.amazonaws.com/fake_jobs_clean.csv"

fake_jobs_df = pd.read_csv(
    url,
    sep=",",
    encoding="UTF-8",
)
# Bare last expression so the notebook renders the frame.
fake_jobs_df

Unnamed: 0,job_id,city,state/province,country,title,department,industry,function,salary_range,salary_provided,...,requirements,benefits,benefits_provided,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent
0,1,New York,NY,US,Marketing Intern,Marketing,,Marketing,,0,...,Experience with content management systems a m...,,0,0,1,0,Other,Internship,,0
1,2,Auckland,,NZ,Customer Service - Cloud Video Production,Success,Marketing and Advertising,Customer Service,,0,...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,1,0,1,0,Full-time,Not Applicable,,0
2,3,Wever,IA,US,Commissioning Machinery Assistant (CMA),,,,,0,...,Implement pre-commissioning and commissioning ...,,0,0,1,0,,,,0
3,4,Washington,DC,US,Account Executive - Washington DC,Sales,Computer Software,Sales,,0,...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,1,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,0
4,5,Fort Worth,FL,US,Bill Review Manager,,Hospital & Health Care,Health Care Provider,,0,...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Toronto,ON,CA,Account Director - Distribution,Sales,Computer Software,Sales,,0,...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,1,0,1,1,Full-time,Mid-Senior level,,0
17876,17877,Philadelphia,PA,US,Payroll Accountant,Accounting,Internet,Accounting/Auditing,,0,...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
17877,17878,Houston,TX,US,Project Cost Control Staff Engineer - Cost Con...,,,,,0,...,At least 12 years professional experience.Abil...,,0,0,0,0,Full-time,,,0
17878,17879,Lagos,LA,NG,Graphic Designer,,Graphic Design,Design,,0,...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,1,0,0,1,Contract,Not Applicable,Professional,0


## **Data Cleaning**

In [3]:
# Narrow the frame to the two columns used from here on:
# the posting text ('description') and its label ('fraudulent').
df = fake_jobs_df.loc[:, ['description', 'fraudulent']]
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0
17878,Nemsia Studios is looking for an experienced v...,0


In [4]:
# Tally the label values (0 = real, 1 = fraudulent) before any cleaning.
from collections import Counter

label_counts = Counter(df['fraudulent'].values)
print(label_counts)

Counter({0: 17014, 1: 866})


In [5]:
# Remove exact duplicate rows, keeping the first occurrence of each.
# No `subset` is given, so a row only counts as a duplicate when ALL of its
# columns match an earlier row (not the description alone).
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17873,RESPONSIBILITIES:Will facilitate the recruitin...,0
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0


In [6]:
# Re-tally the label values (0 = real, 1 = fraudulent) after de-duplication.
label_counts = Counter(df['fraudulent'].values)
print(label_counts)

Counter({0: 14171, 1: 632})


In [7]:
# Balance the two classes by UNDERsampling: keep every fraudulent posting and
# draw an equally sized random sample of the non-fraudulent ones.
df_fraud = df[df['fraudulent'] == 1]
df_normal = df[df['fraudulent'] == 0]

# A fixed random_state makes the sampled subset reproducible across re-runs.
df_normal = df_normal.sample(n=len(df_fraud), random_state=22)

# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat
# produces the same frame (sampled non-fraud rows first, fraud rows after,
# original index labels preserved).
df = pd.concat([df_normal, df_fraud])

df

Unnamed: 0,description,fraudulent
13898,Currently hiring per-diem caregivers to work o...,0
1688,"Μεγαλώνουμε την ομάδα μας,και χρειαζόμαστε ένα...",0
13100,PowToon is looking for a team-oriented Marketi...,0
5579,Wealth Management Advisor We are continuing ou...,0
15961,RECRUITING MILITARY VETERANSLooking to offer o...,0
...,...,...
17827,Student Positions Part-Time and Full-Time.You ...,1
17828,LEARN TO EARN AN EXECUTIVE LEVEL INCOMEFULL TR...,1
17829,inFullMobile Sp. z o.o. is a mobile software d...,1
17830,JOB DESCRIPTIONWe are seeking a full time payr...,1


In [8]:
# Interleave the classes: the balanced frame was built as non-fraud rows
# followed by fraud rows, so shuffle it with a fixed seed.
from sklearn.utils import shuffle

# Shuffle, then drop the old index labels so rows are renumbered from 0.
df = shuffle(df, random_state=22).reset_index(drop=True)
df

Unnamed: 0,description,fraudulent
0,We have an immediate opening for a receptionis...,1
1,6* Ultra Luxury American Cruise Company is urg...,1
2,"Our client, located in Buckhannon, WV, is acti...",0
3,Apply using below link#URL_3fd69c66d9c4b82a75b...,1
4,What we prefer in a candidate:Great attitudeCa...,1
...,...,...
1259,CTO for a tourism start-upWe are just now star...,0
1260,Responsible for all food production including ...,1
1261,We are looking for a full-time self-sufficient...,1
1262,Senior Java software engineer with previous on...,0


In [9]:
# Pin the column types in one call: text as str, label as int.
df = df.astype({'description': str, 'fraudulent': int})

In [10]:
# Display the dtype of each column to confirm the explicit casts took effect
df.dtypes

description    object
fraudulent      int64
dtype: object

In [11]:
# Final check of the label tally (0 = real, 1 = fraudulent) after balancing.
label_counts = Counter(df['fraudulent'].values)
print(label_counts)

Counter({1: 632, 0: 632})


## **Training/Testing Data**

In [12]:
# Positional hold-out split: the first `n_train` rows train the model and
# the remaining rows are kept for testing.
n_train = 885
train_data = df.head(n_train)
test_data = df.tail(len(df) - n_train)

print(train_data.shape, test_data.shape)

(885, 2) (379, 2)


In [13]:
# Convert each split into a list of dictionaries with 'description' and
# 'fraudulent' keys, one dictionary per row.
# Guidance provided by https://towardsdatascience.com/fake-job-classification-with-bert-6575c908a9aa
#
# zip() walks the two columns in lockstep so every description keeps its own
# label. Two separate `for` clauses in a single comprehension nest instead,
# emitting every description x label combination (N*N records, most of them
# carrying the wrong label).
train_data = [
    {'description': description, 'fraudulent': fraudulent}
    for description, fraudulent in zip(train_data['description'], train_data['fraudulent'])
]

test_data = [
    {'description': description, 'fraudulent': fraudulent}
    for description, fraudulent in zip(test_data['description'], test_data['fraudulent'])
]

In [14]:
# Unzip the record dictionaries into two parallel tuples per split:
# one of description texts and one of labels.
train_texts, train_labels = zip(*[(rec['description'], rec['fraudulent']) for rec in train_data])
test_texts, test_labels = zip(*[(rec['description'], rec['fraudulent']) for rec in test_data])

## **Pre-Processing**

In [15]:
# Generate tokens and token ids
# Bert Tokenizer: load the pretrained 'bert-base-uncased' tokenizer.
# do_lower_case=True is passed explicitly, in line with the "uncased" model name.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [16]:
# Tokenize every text, keep at most the first 511 tokens and prepend '[CLS]',
# so each sequence holds at most 512 tokens.
# NOTE(review): no '[SEP]' token is appended - confirm this is intended.
train_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:511] for text in train_texts]
test_tokens = [['[CLS]'] + tokenizer.tokenize(text)[:511] for text in test_texts]

In [17]:
# Map each token sequence to its sequence of vocabulary ids.
train_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in train_tokens]
test_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in test_tokens]

In [18]:
# Bring every id sequence to exactly 512 entries, truncating and padding at
# the end ("post"). The same options are applied to both splits.
# NOTE(review): the fill value is pad_sequences' default (presumably 0, which
# the attention-mask cell treats as padding) - confirm.
padding_options = dict(maxlen=512, truncating="post", padding="post", dtype="int")
train_tokens_ids = pad_sequences(train_tokens_ids, **padding_options)
test_tokens_ids = pad_sequences(test_tokens_ids, **padding_options)

In [19]:
# Turn the label tuples into boolean arrays: True where 'fraudulent' equals 1.
train_y = np.equal(np.array(train_labels), 1)
test_y = np.equal(np.array(test_labels), 1)

## **BERT Classifier**

In [20]:
# BERT classifier: pooled BERT output -> dropout -> linear layer -> sigmoid.
class BertBinaryClassifier(nn.Module):
    """Binary classifier built on the pretrained 'bert-base-uncased' model.

    The pooled output of BERT is passed through dropout, a 768 -> 1 linear
    layer and a sigmoid, so ``forward`` returns one value in (0, 1) per input
    sequence (not per token).

    Args:
        dropout: Dropout probability applied to the pooled output.
    """

    def __init__(self, dropout=0.1):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the sigmoid output of the classification head.

        Args:
            tokens: Token ids, forwarded to the BERT model unchanged.
            masks: Optional attention mask, forwarded as ``attention_mask``.

        Returns:
            Tensor produced by the 768 -> 1 linear layer followed by sigmoid.
        """
        # Only the pooled output is used; the encoded layers are discarded.
        _encoded, pooled = self.bert(
            tokens,
            attention_mask=masks,
            output_all_encoded_layers=False,
        )
        return self.sigmoid(self.linear(self.dropout(pooled)))

In [21]:
# Build attention masks: 1.0 where the token id is positive, 0.0 elsewhere
# (i.e. id 0 is masked out), then convert them to tensors.
train_masks = [[float(token_id > 0) for token_id in row] for row in train_tokens_ids]
test_masks = [[float(token_id > 0) for token_id in row] for row in test_tokens_ids]

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [22]:
# Token-id tensors for both splits.
train_tokens_tensor = torch.tensor(train_tokens_ids)
test_tokens_tensor = torch.tensor(test_tokens_ids)

# Label tensors: reshape to a single column and store as float.
train_y_tensor = torch.tensor(train_y.reshape(-1, 1), dtype=torch.float)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1), dtype=torch.float)

In [23]:
# Wrap the tensors in datasets and expose them through batched loaders.
BATCH_SIZE = 1

data_utils = torch.utils.data

# Training split: batches are drawn through a RandomSampler.
train_dataset = data_utils.TensorDataset(
    train_tokens_tensor, train_masks_tensor, train_y_tensor
)
train_sampler = data_utils.RandomSampler(train_dataset)
train_dataloader = data_utils.DataLoader(
    train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE
)

# Test split: batches are drawn through a SequentialSampler.
test_dataset = data_utils.TensorDataset(
    test_tokens_tensor, test_masks_tensor, test_y_tensor
)
test_sampler = data_utils.SequentialSampler(test_dataset)
test_dataloader = data_utils.DataLoader(
    test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE
)

## **Train Model**

In [None]:
# Fine-tune the classifier: Adam optimizer minimizing Binary Cross Entropy.
# The batch size actually used is the one train_dataloader was built with;
# BATCH_SIZE is re-stated here for reference only.
BATCH_SIZE = 1
EPOCHS = 1

bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

# The loss module does not depend on the batch, so build it once instead of
# once per step.
loss_func = nn.BCELoss()

# Batches per epoch, read from the loader itself so the progress denominator
# always matches what the loop iterates over.
num_batches = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0

    for step_num, (token_ids, masks, labels) in enumerate(train_dataloader):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        probas = bert_clf(token_ids, masks)
        batch_loss = loss_func(probas, labels)
        batch_loss.backward()
        optimizer.step()

        # Report the running mean loss over the steps completed so far.
        train_loss += batch_loss.item()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, num_batches, train_loss / (step_num + 1)))

## **Evaluate**

In [0]:
# Evaluate the trained model on the test loader and print a per-class report.
bert_clf.eval()
bert_predicted = []
# NOTE: despite the name, these are the model's sigmoid outputs, not raw logits.
all_logits = []
# Build the loss module once rather than once per batch.
loss_func = nn.BCELoss()

# No gradients are needed for inference.
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        # Per-batch loss; computed but not aggregated or reported here.
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()

        # Threshold the outputs at 0.5 to obtain the predicted class.
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])

print(classification_report(test_y, bert_predicted))