## **Install Pytorch/ Import Dependencies**

In [1]:
pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 23.3MB/s eta 0:00:01[K     |█████▎                          | 20kB 15.3MB/s eta 0:00:01[K     |████████                        | 30kB 13.4MB/s eta 0:00:01[K     |██████████▋                     | 40kB 12.4MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 8.2MB/s eta 0:00:01[K     |███████████████▉                | 61kB 8.7MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 8.7MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 9.6MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 9.3MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 7.9MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 7.9MB/s eta 0:00:01[K     |███████████████████

In [2]:
# Dependencies
import pandas as pd 
import numpy as np 

import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel

from keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

import re

import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

## **Load Data**

In [3]:
# Read in csv file
# url = "https://job-postings-dataviz.s3.amazonaws.com/fake_job_postings.csv"
path = "/content/fake_job_postings.csv" # if read from Drive

df = pd.read_csv(path, encoding = "UTF-8")
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [4]:
# remove the NaN (they tend to mess up)
df.fillna(" ", inplace=True)
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [15]:
# Check if there is any relationship between the target class and required_education
df.pivot_table(index=['fraudulent'], columns='required_education', aggfunc='size', fill_value=0)
# there is no relationship and can be removed

required_education,Unnamed: 1_level_0,Associate Degree,Bachelor's Degree,Certification,Doctorate,High School or equivalent,Master's Degree,Professional,Some College Coursework Completed,Some High School Coursework,Unspecified,Vocational,Vocational - Degree,Vocational - HS Diploma
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,7654,268,5045,151,25,1910,385,70,99,7,1336,49,6,9
1,451,6,100,19,1,170,31,4,3,20,61,0,0,0


In [16]:
# Check if there is any relationship between the target class and required_experience
df.pivot_table(index=['fraudulent'], columns='required_experience', aggfunc='size', fill_value=0)
# there is no relationship and can be removed

required_experience,Unnamed: 1_level_0,Associate,Director,Entry level,Executive,Internship,Mid-Senior level,Not Applicable
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,6615,2255,372,2518,131,371,3696,1056
1,435,42,17,179,10,10,113,60


In [17]:
# Check if there is any relationship between the target class and employment_type
df.pivot_table(index=['fraudulent'], columns='employment_type', aggfunc='size', fill_value=0)
# there is no relationship and can be removed

employment_type,Unnamed: 1_level_0,Contract,Full-time,Other,Part-time,Temporary
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,3230,1480,11130,212,723,239
1,241,44,490,15,74,2


In [5]:
# concatenate the data 

df['text'] = df['title']+ " " + df['department'] + " " + df['company_profile']+ " " + df['description']+ " " + df['requirements'] + " " + df['benefits'] + " "
 

In [6]:
df['text'] 

0        Marketing Intern Marketing We're Food52, and w...
1        Customer Service - Cloud Video Production Succ...
2        Commissioning Machinery Assistant (CMA)   Valo...
3        Account Executive - Washington DC Sales Our pa...
4        Bill Review Manager   SpotSource Solutions LLC...
                               ...                        
17875    Account Director - Distribution  Sales Vend is...
17876    Payroll Accountant Accounting WebLinc is the e...
17877    Project Cost Control Staff Engineer - Cost Con...
17878    Graphic Designer     Nemsia Studios is looking...
17879    Web Application Developers Engineering Vend is...
Name: text, Length: 17880, dtype: object

In [7]:
list(df.columns)


['job_id',
 'title',
 'location',
 'department',
 'salary_range',
 'company_profile',
 'description',
 'requirements',
 'benefits',
 'telecommuting',
 'has_company_logo',
 'has_questions',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function',
 'fraudulent',
 'text']

In [8]:
delete_list=['job_id',
 'title',
 'location',
 'department',
 'salary_range',
 'company_profile',
 'description',
 'requirements',
 'benefits',
 'telecommuting',
 'has_company_logo',
 'has_questions',
 'employment_type',
 'required_experience',
 'required_education',
 'industry',
 'function'
]

for column in delete_list:
  del df[column]

In [9]:
df.head()

Unnamed: 0,fraudulent,text
0,0,"Marketing Intern Marketing We're Food52, and w..."
1,0,Customer Service - Cloud Video Production Succ...
2,0,Commissioning Machinery Assistant (CMA) Valo...
3,0,Account Executive - Washington DC Sales Our pa...
4,0,Bill Review Manager SpotSource Solutions LLC...


## **Data Cleaning**

In [10]:
df['text']=df['text'].str.replace('\n','')
df['text']=df['text'].str.replace('\r','')
df['text']=df['text'].str.replace('\t','')

In [11]:
# remove numbers and simbols

df['text'] = df['text'].apply(lambda x: re.sub(r'[0-9]','',x))
df['text'] = df['text'].apply(lambda x: re.sub(r'[/(){}\[\]\|@,;.:-]',' ',x))

In [12]:
# convert all to lowers

df["text"] = df['text'].apply(lambda s: s.lower() if type(s) == str else s)

In [13]:
# remove unnecesary white spaces

df['text'] = df['text'].str.replace('  ', ' ')

## **Bert Pretrained layer**

In [25]:
# Using Google TensorFlow library for the Bert model 

# install tesorflow bert package
!pip install bert-for-tf2

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/a5/a1/acb891630749c56901e770a34d6bac8a509a367dd74a05daf7306952e910/bert-for-tf2-0.14.9.tar.gz (41kB)
[K     |████████                        | 10kB 25.6MB/s eta 0:00:01[K     |████████████████                | 20kB 17.2MB/s eta 0:00:01[K     |███████████████████████▉        | 30kB 15.1MB/s eta 0:00:01[K     |███████████████████████████████▉| 40kB 14.0MB/s eta 0:00:01[K     |████████████████████████████████| 51kB 5.7MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/aa/e0/4f663d8abf83c8084b75b995bd2ab3a9512ebc5b97206fde38cef906ab07/py-params-0.10.2.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... 

In [26]:
#Loding pretrained bert layer
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
                            trainable=False)

In [27]:
# Loading tokenizer from the bert layer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocab_file, do_lower_case)

## Bert Encoding
- each sentence is tolkenized into tolkens
- a [CLS] token is added at the beginning of the first sentence and [SEP] token at the end
- tokens that coply with fixed vocabulary are fetched and assigned with: 
  - token IDs, to assign Unique token-id from BERT's tokenizer
  - padding ID (Mask-Id), to indicate which elements in the sequence are tokens and which are padding elements
  - Segment IDs, to distinguish different sentences

In [32]:
text = " All I need is within me now"
# tokenize
tokens_list = tokenizer.tokenize(text)
print('Text after tokenization')
print(tokens_list)
# initilize dimension
max_len =25
text = tokens_list[:max_len-2]
input_sequence = ["[CLS]"] + text + ["[SEP]"]
print("After adding  flasges -[CLS] and [SEP]: ")
print(input_sequence)

tokens = tokenizer.convert_tokens_to_ids(input_sequence )
print("tokens to id ")
print(tokens)

pad_len = max_len -len(input_sequence)
tokens += [0] * pad_len
print("tokens: ")
print(tokens)

pad_masks = [1] * len(input_sequence) + [0] * pad_len
print("Pad Masking: ")
print(pad_masks)

segment_ids = [0] * max_len
print("Segment Ids: ")
print(segment_ids)

Text after tokenization
['all', 'i', 'need', 'is', 'within', 'me', 'now']
After adding  flasges -[CLS] and [SEP]: 
['[CLS]', 'all', 'i', 'need', 'is', 'within', 'me', 'now', '[SEP]']
tokens to id 
[101, 2035, 1045, 2342, 2003, 2306, 2033, 2085, 102]
tokens: 
[101, 2035, 1045, 2342, 2003, 2306, 2033, 2085, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Pad Masking: 
[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Segment Ids: 
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [33]:
# Make sure number of real vs fraudulent are more balanced
# Using UNDERsampling for balancing of classification groups
df_fake = df[df['fraudulent'] == 1] 
df_real = df[df['fraudulent'] == 0] 

n_f=len(df_fake)
n_n=len(df_real)
print(n_f)
print(n_n)

866
17014


In [34]:
# Create a dataset with a more balanced distribution df_under(5:1 real:fake posts)
df_new= df_real.sample( 5*len(df_fake), random_state=580)
df_unders = df_new.append(df_fake)
print(len(df_new))
print(len(df_unders))

4330
5196


In [35]:
# Shuffle the dataframe 
# from sklearn.utils import shuffle
# df_underst = shuffle(df_unders, random_state=22)
df_underst = df_unders.sample(len(df_unders), random_state=580) 
df = df_underst.reset_index(drop=True)
df.head()

Unnamed: 0,fraudulent,text
0,0,vp product strategy & analysis credit strategy...
1,0,digital analytics and strategy manager freque...
2,0,tech support specialist support hirehopes is t...
3,0,technical support engineer operations upstream...
4,0,senior network engineer engineering upstream’s...


In [37]:
X = df['text']        #Tokenize the dataset
Y = df['fraudulent']   #Assign the value of y  
print(Y.shape)

(5196,)


In [39]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X ,Y , test_size = 0.20,random_state=41)

Bert

In [42]:


def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence)
        tokens += [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


MAX_LEN = 64

# encode train set 
train_input = bert_encode(X_train, tokenizer, max_len=MAX_LEN)
# encode  test set 
test_input = bert_encode(X_test, tokenizer, max_len= MAX_LEN )
train_labels = y_train    

In [43]:
# lets see encoded train set 
train_input

(array([[  101,  2406,  3208, ...,  5198, 10095,   102],
        [  101,  2451,  1004, ...,  9273,  2040,   102],
        [  101,  3026,  8290, ...,  3265,  2007,   102],
        ...,
        [  101,  4748, 10020, ...,  2154,  2000,   102],
        [  101,  2393,  4624, ..., 24529,  2099,   102],
        [  101,  3208,  8013, ...,  1037, 13024,   102]]),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [45]:
def build_model(bert_layer, max_len=512):
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(clf_output)
    
    model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(Adam(lr=2e-6), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [46]:
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

In [38]:
df.head()

Unnamed: 0,fraudulent,text
0,0,marketing intern marketing we're food and we'v...
1,0,customer service cloud video production succes...
2,0,commissioning machinery assistant cma valor se...
3,0,account executive washington dc sales our pass...
4,0,bill review manager spotsource solutions llc i...


(17880,)


In [47]:
model = build_model(bert_layer, max_len=160)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

## Running the model

In [48]:


train_history = model.fit(
    train_input, train_labels,
    validation_split=0.2,
    epochs=3,
    batch_size=16
)


Epoch 1/3


ValueError: ignored

ModuleNotFoundError: ignored

In [42]:
from bert import bert_tokenization
BertTokenizer = bert_tokenization.FullTokenizer
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

ModuleNotFoundError: ignored

In [22]:

# verify GPU availability
import tensorflow as tf

device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [23]:
!pip install pytorch-pretrained-bert pytorch-nlp

# BERT imports
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline

# specify GPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

Collecting pytorch-nlp
[?25l  Downloading https://files.pythonhosted.org/packages/4f/51/f0ee1efb75f7cc2e3065c5da1363d6be2eec79691b2821594f3f2329528c/pytorch_nlp-0.5.0-py3-none-any.whl (90kB)
[K     |████████████████████████████████| 92kB 5.7MB/s 
Installing collected packages: pytorch-nlp
Successfully installed pytorch-nlp-0.5.0


'Tesla T4'

## **Training/Testing Data**

In [11]:
# Split the data into training and testing
train_data = df.head(885)
test_data = df.tail(885)

print(train_data.shape, test_data.shape)

(885, 2) (885, 2)


In [12]:
# Generate a list of dictionaries with ‘description’ and ‘fraudulent’ keys
# Guidance provided by https://towardsdatascience.com/fake-job-classification-with-bert-6575c908a9aa
train_data = [{'description': description, 'fraudulent': fraudulent } for description in list(train_data['description']) for fraudulent in list(train_data['fraudulent'])]

test_data = [{'description': description, 'fraudulent': fraudulent } for description in list(test_data['description'])for fraudulent in list(test_data['fraudulent'])]

In [13]:
# Generate a list of tuples from the list of dictionaries
train_texts, train_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), train_data)))


In [15]:
test_texts, test_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), test_data)))

## ** Bert pre-processing**


Reference: https://www.kaggle.com/ratan123/in-depth-guide-to-google-s-bert/notebook

In [16]:
# Generate tokens and token ids
# Bert Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [17]:
# Tokens
#train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:511], train_texts))
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:128], train_texts))


In [None]:
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:128], test_texts))

In [None]:
# Token Ids
train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))

In [None]:
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=512, truncating="post", padding="post", dtype="int")

In [None]:
# Generate a boolean array based on the value of ‘fraudulent’ for our testing and training sets
train_y = np.array(train_labels) == 1
test_y = np.array(test_labels) == 1

## **Building the BERT Classifier**

In [None]:
# Create BERT classifier - contains ‘initialization’ method and ‘forward’ method; returns token probabilities
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [None]:
# Generate training and testing masks
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [None]:
# Generate token tensors for training and testing
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

In [None]:
# Prepare data loaders
BATCH_SIZE = 1

train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)

## **Train Model**

In [None]:
# Use the Adam optimizer to minimize the Binary Cross Entropy loss 
# Train with a batch size of 1 for 1 EPOCHS
BATCH_SIZE = 1
EPOCHS = 1

bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)

for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0

    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))

## **Evaluate**

In [None]:
# Evaluate our model
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

token_ids, masks, labels = tuple(t for t in batch_data)

logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))