In [1]:
import pandas as pd 
import numpy as np 
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report


pd.set_option('display.max_columns', None)
# df = pd.read_csv("fake_job_postings.csv")
# # #print(df.head())


# df = df[['description', 'fraudulent']]
# print(df.head())

df = pd.read_csv("all.tsv", delimiter='\t')

df.rename(columns={'label':'fraudulent', 'tweet':'description'}, inplace=True)
df = df[['description', 'fraudulent']]
for i in range(len(df)):
    if(df.at[i,'fraudulent'] == 'real'):
        df.at[i,'fraudulent'] = 0
    else:
        df.at[i,'fraudulent'] = 1




##############################################################

from collections import Counter
print(Counter(df['fraudulent'].values))

df_fraudulent= df[df['fraudulent'] == 1] 
df_normal = df[df['fraudulent'] == 0] 
df_normal = df_normal.sample(n=len(df_fraudulent))
df = df_normal.append(df_fraudulent)
df = df.sample(frac=1, random_state = 24).reset_index(drop=True)

print(Counter(df['fraudulent'].values))


train_data = df.head(10)
print(train_data)
test_data = df.tail(10)
print(test_data)

train_data = [{'description': description, 'fraudulent': fraudulent } for description in list(train_data['description']) for fraudulent in list(train_data['fraudulent'])]
test_data = [{'description': description, 'fraudulent': fraudulent } for description in list(test_data['description']) for fraudulent in list(test_data['fraudulent'])]

train_texts, train_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), train_data)))
test_texts, test_labels = list(zip(*map(lambda d: (d['description'], d['fraudulent']), test_data)))

KeyError: "['description'] not in index"

In [2]:
print(train_data[0]['description'])

Our daily update is published. We’ve now tracked 5.4 million tests up 257k from yesterday. Another huge testing day. 4/22 was a clear inflection point. Note that we can only track tests that a state reports. For details see: https://t.co/PZrmH4bl5Y https://t.co/2PuyjEbdyA


In [3]:
print(len(train_data))

100


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:199], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:199], test_texts))
train_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, train_tokens))
test_tokens_ids = list(map(tokenizer.convert_tokens_to_ids, test_tokens))
train_tokens_ids = pad_sequences(train_tokens_ids, maxlen=200, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(test_tokens_ids, maxlen=200, truncating="post", padding="post", dtype="int")

train_y = np.array(train_labels) == 1
test_y = np.array(test_labels) == 1


class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba

In [5]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]
train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()


BATCH_SIZE = 1
train_dataset =  torch.utils.data.TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler =  torch.utils.data.RandomSampler(train_dataset)
train_dataloader =  torch.utils.data.DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataset =  torch.utils.data.TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler =  torch.utils.data.SequentialSampler(test_dataset)
test_dataloader =  torch.utils.data.DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


In [6]:
EPOCHS = 1
bert_clf = BertBinaryClassifier()
optimizer = torch.optim.Adam(bert_clf.parameters(), lr=3e-6)
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t for t in batch_data)
        probas = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        batch_loss = loss_func(probas, labels)
        train_loss += batch_loss.item()
        bert_clf.zero_grad()
        batch_loss.backward()
        optimizer.step()
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
        
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
        
print(classification_report(test_y, bert_predicted))

Epoch:  1
0/100.0 loss: 0.8984752893447876 
Epoch:  1
1/100.0 loss: 0.7197897136211395 
Epoch:  1
2/100.0 loss: 0.6767944494883219 
Epoch:  1
3/100.0 loss: 0.6325837001204491 
Epoch:  1
4/100.0 loss: 0.6079475462436676 
Epoch:  1
5/100.0 loss: 0.6056357771158218 
Epoch:  1
6/100.0 loss: 0.5756295280797141 
Epoch:  1
7/100.0 loss: 0.6322903521358967 
Epoch:  1
8/100.0 loss: 0.6649254328674741 
Epoch:  1
9/100.0 loss: 0.6496612280607224 
Epoch:  1
10/100.0 loss: 0.6337287371808832 
Epoch:  1
11/100.0 loss: 0.6145353019237518 
Epoch:  1
12/100.0 loss: 0.5952109946654394 
Epoch:  1
13/100.0 loss: 0.5860455334186554 
Epoch:  1
14/100.0 loss: 0.5821415026982625 
Epoch:  1
15/100.0 loss: 0.5712498743087053 
Epoch:  1
16/100.0 loss: 0.562231274212108 
Epoch:  1
17/100.0 loss: 0.5584121147791544 
Epoch:  1
18/100.0 loss: 0.5471213444283134 
Epoch:  1
19/100.0 loss: 0.5771749094128609 
Epoch:  1
20/100.0 loss: 0.5668790553297315 
Epoch:  1
21/100.0 loss: 0.5570424984801899 
Epoch:  1
22/100.0 lo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
bert_clf.summary()

AttributeError: 'BertBinaryClassifier' object has no attribute 'summary'