In [None]:
import torch
import pandas as pd
import numpy as np
from sklearn import metrics
# from utils import get_data

In [None]:

from google.colab import files

# Open Colab's upload dialog; the returned mapping is keyed by uploaded file name.
uploaded = files.upload()

# Report each received file together with the length of its content.
for fn, payload in uploaded.items():
    print('User uploaded file "{name}" with length {length} bytes'.format(
        name=fn, length=len(payload)))



In [None]:
def get_data(file_path):
    """Read a header-less, tab-separated file into a DataFrame.

    Args:
        file_path: Path (or any source accepted by ``pd.read_csv``) of the file.

    Returns:
        A DataFrame whose two columns are named ``SourceatBot7`` and ``Content``.
    """
    column_names = ['SourceatBot7', 'Content']
    frame = pd.read_csv(file_path, sep='\t', names=column_names)
    return frame

In [None]:
# Load the three splits.
# NOTE(review): validation and test both read "tweets_bert.test", so the
# validation accuracy is measured on the test set -- confirm this is intended.
train_data = get_data("tweets_bert.train")
val_data = get_data("tweets_bert.test")
test_data = get_data("tweets_bert.test")

# Extra alias of the raw text kept on the training frame only.
train_data['comment_without_stopwords'] = train_data['Content']

# Give every split the generic column names used by the rest of the notebook.
for split_frame in (train_data, val_data, test_data):
    split_frame['comment'] = split_frame['Content']
    split_frame['label'] = split_frame['SourceatBot7']

In [None]:
!pip install transformers 

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# transformers' own AdamW is deprecated in favour of torch.optim.AdamW and is
# not exported by newer releases; fall back so this cell imports either way.
# NOTE(review): torch.optim.AdamW defaults to weight_decay=0.01 whereas the
# transformers implementation defaulted to 0.0 -- pass weight_decay explicitly
# when constructing the optimizer if identical behaviour is required.
try:
    from transformers import AdamW
except ImportError:
    from torch.optim import AdamW

Tokenization: text => tokens
(a single word may be split into several sub-word tokens)

Vectorization (embedding): token => embedding vector

In [None]:
# Name of the pretrained checkpoint used to load the tokenizer.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)

train_comments, train_labels = train_data['comment'], train_data['label']

# Sanity check: show the first comment and the tokens it is encoded into.
first_comment = train_comments[0]
print(first_comment)
first_ids = tokenizer.encode(first_comment)
print(tokenizer.convert_ids_to_tokens(first_ids))

In [None]:
# Convert a sentence into a fixed-length list of token ids: longer encodings are
# truncated and shorter ones padded so the result always has limit_size + 2 ids
# (limit_size content ids plus the two special ids the tokenizer adds).
def convert_text_to_token(tokenizer, sentence, limit_size = 126):
    """Encode ``sentence`` into exactly ``limit_size + 2`` token ids.

    Truncation is applied to the token ids, not to the characters of the raw
    text, so up to ``limit_size`` tokens of content are kept. When truncating,
    the last id produced by the tokenizer (the trailing special token) is
    preserved as the final element.

    Args:
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of
            ids including its leading and trailing special ids.
        sentence: Text to encode.
        limit_size: Maximum number of ids kept between the two special ids.

    Returns:
        A list of ``limit_size + 2`` ids, right-padded with 0, or an empty list
        if the tokenizer raised while encoding (the error is printed).
    """
    target_len = limit_size + 2
    try:
        tokens = list(tokenizer.encode(sentence))
    except Exception as e:
        # Best effort: report the problem and let the caller skip this sentence.
        print(e)
        return []
    if len(tokens) > target_len:
        # Truncate on token ids, keeping the final (separator) id in place.
        tokens = tokens[:target_len - 1] + tokens[-1:]
    elif len(tokens) < target_len:
        # Pad with id 0, which downstream attention masks treat as padding.
        tokens.extend([0] * (target_len - len(tokens)))
    return tokens

# Tokenize every training comment exactly once, then drop the ones the
# tokenizer could not encode (convert_text_to_token returns [] for those).
encoded_comments = [convert_text_to_token(tokenizer, sen) for sen in train_comments]
input_ids = [ids for ids in encoded_comments if ids]

# Dropped rows make input_ids shorter than the label column, which would
# misalign inputs and labels later on -- surface that instead of hiding it.
dropped_count = len(encoded_comments) - len(input_ids)
if dropped_count:
    print('WARNING: {} training comments could not be tokenized and were dropped; '
          'labels must be filtered the same way.'.format(dropped_count))

input_tokens = torch.tensor(input_ids)
  

In [None]:
# Show the dimensions of the encoded training inputs.
print(input_tokens.size())

In [None]:
# Cast every training label to a plain Python int.
train_labels = list(map(int, train_data['label']))

In [None]:
# Attention masking
def attention_masks(input_ids):
    """Build one attention mask per sequence of token ids.

    Args:
        input_ids: Iterable of sequences of numeric token ids.

    Returns:
        A list of lists of floats with the same layout as ``input_ids``:
        1.0 where the id is greater than 0, 0.0 otherwise (id 0 = padding).
    """
    return [[float(token_id > 0) for token_id in row] for row in input_ids]

# Turn the training labels and the per-token masks into tensors.
train_labels = torch.tensor(train_labels)

atten_masks = attention_masks(input_ids)
attention_tokens = torch.tensor(atten_masks)

# Inspect the mask shape plus the first encoded example and its mask.
for shown in (attention_tokens.shape, input_tokens[0], attention_tokens[0]):
    print(shown)

In [None]:
import os
import sys

from google.colab import drive

drive.mount('/content/drive')

# One absolute path drives both the import path and the working directory, so
# the two cannot disagree and re-running this cell lands in the same place.
# NOTE(review): this follows the 'My Drive' folder name used by the original
# %cd line; adjust it if the mount exposes the drive under another name.
PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/proj/bert/BERT'
sys.path.insert(0, PROJECT_DIR)
os.chdir(PROJECT_DIR)

In [None]:
from config_edit import Config

# Instantiate the project's configuration object.
cfg = Config()

# Evaluated but not displayed: only a cell's last expression is echoed.
vars(cfg)

print(cfg.batch_size)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Training set: (token ids, attention mask, label) triples drawn in random order.
train_data = TensorDataset(input_tokens, attention_tokens, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=cfg.batch_size)

# Validation set: encoded the same way as the training comments.
val_ids = [convert_text_to_token(tokenizer, sen) for sen in val_data['comment']]
val_tokens = torch.tensor(val_ids)
val_masks = torch.tensor(attention_masks(val_ids))
# Cast labels to int exactly as is done for the training labels.
val_labels = torch.tensor([int(c) for c in val_data['label']])

val_data = TensorDataset(val_tokens, val_masks, val_labels)
# Evaluation gains nothing from shuffling; a fixed order keeps the batch
# composition, and therefore the reported metric, reproducible across runs.
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=cfg.batch_size)

In [None]:
# Peek at the first training batch only, printing the shape of each tensor.
for i, first_batch in enumerate(train_dataloader):
    train, mask, label = first_batch
    print(*(part.shape for part in (train, mask, label)))
    break

# Number of batches per epoch.
batch_count = len(train_dataloader)
print('len(train_dataloader) = ', batch_count)

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cpu' if not torch.cuda.is_available() else 'cuda')

# Binary classifier head: label 1 = bot, label 0 = human.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

In [None]:
# Optimizer over all model parameters; the learning rate comes from the config.
optimizer = AdamW(params=model.parameters(), lr=cfg.learning_rate)

In [None]:
from transformers import get_linear_schedule_with_warmup

# Linear learning-rate schedule. With num_warmup_steps=0 there is no warm-up
# phase; the schedule only covers the decay over the training steps.
epochs = cfg.epochs

# Total number of optimizer steps: [number of epochs] x [batches per epoch].
total_steps = epochs * len(train_dataloader)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def binary_acc(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    Args:
        preds: 2-D tensor of per-class scores, one row per example.
        labels: Tensor of class indices; it is flattened before comparison.

    Returns:
        Accuracy over the batch as a Python float.
    """
    # torch.max along dim=1 yields (max values, index of the max); keep the index.
    _, predicted = torch.max(preds, dim=1)
    hits = (predicted == labels.flatten()).float()
    return hits.sum().item() / len(hits)

In [None]:
from tqdm import tqdm
# Gradient clipping mitigates the exploding-gradient problem: during
# backpropagation the gradients can grow very large and destabilize training,
# so whenever the overall gradient norm exceeds a threshold the gradients are
# rescaled so that the norm equals that threshold before the optimizer step.
# (Clipping caps large gradients; it does not fix vanishing gradients.)
from torch.nn.utils import clip_grad_norm_

def train(model, optimizer):
    """Run one pass over ``train_dataloader`` and update the model.

    Uses the notebook-level ``train_dataloader``, ``device``, ``scheduler`` and
    ``binary_acc``.

    Args:
        model: Model called with input ids, attention mask and labels; its
            first output is used as the loss and its second as the logits.
        optimizer: Optimizer stepped once per batch.

    Returns:
        Tuple ``(mean batch loss, mean batch accuracy)`` for the pass.
    """
    batch_losses = []
    batch_accs = []

    model.train()
    for batch in tqdm(train_dataloader):
        # Each batch holds (input ids, attention mask, labels); cast and move them.
        b_input_ids, b_input_mask, b_labels = (
            part.long().to(device) for part in batch[:3]
        )

        output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = output[0]
        logits = output[1]

        batch_losses.append(loss.item())
        batch_accs.append(binary_acc(logits, b_labels))

        optimizer.zero_grad()
        loss.backward()
        # Cap the gradient norm at 1.0 to guard against exploding gradients.
        clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()   # update the model parameters
        scheduler.step()   # update the learning rate

    return np.mean(batch_losses), np.mean(batch_accs)

In [None]:
# Score the model on the validation loader; labels are not passed to the model.
def evaluate(model):
    """Return the mean per-batch accuracy over ``val_dataloader``.

    Uses the notebook-level ``val_dataloader``, ``device`` and ``binary_acc``.
    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: Model called with input ids and attention mask; its first
            output is used as the class scores.

    Returns:
        Mean of the batch accuracies.
    """
    batch_accs = []
    model.eval()

    with torch.no_grad():
        for batch in val_dataloader:
            b_input_ids, b_input_mask, b_labels = (
                part.long().to(device) for part in batch[:3]
            )
            output = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            batch_accs.append(binary_acc(output[0], b_labels))

    return np.mean(batch_accs)

In [None]:
# Message templates for the per-epoch report.
train_report = 'epoch={},training_accuracy={}，loss={}'
val_report = "epoch={},testing_accuracy={}"

# Alternate one training pass and one evaluation pass per epoch.
for epoch in range(epochs):
    train_loss, train_acc = train(model, optimizer)
    print(train_report.format(epoch, train_acc, train_loss))

    val_acc = evaluate(model)
    print(val_report.format(epoch, val_acc))

In [None]:
def predict(sen):
    """Classify a single sentence with the notebook-level ``model``.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``convert_text_to_token``.

    Args:
        sen: Text to classify.

    Returns:
        Index of the highest-scoring class as an int (the notebook prints
        1 as 'Bot' and anything else as 'Human').

    Raises:
        ValueError: If the sentence could not be tokenized.
    """
    input_id = convert_text_to_token(tokenizer, sen)
    if not input_id:
        # convert_text_to_token returns [] on failure; fail with a clear
        # message rather than feeding an empty tensor to the model.
        raise ValueError('predict() could not tokenize the given sentence: {!r}'.format(sen))
    input_token = torch.tensor(input_id).long().to(device)

    atten_mask = [float(i > 0) for i in input_id]   # padding: 0; otherwise: 1
    attention_token = torch.tensor(atten_mask).long().to(device)

    # Inference only: disable dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        # Add a batch dimension: [seq_len] -> [1, seq_len].
        output = model(input_token.view(1, -1), token_type_ids=None, attention_mask=attention_token.view(1, -1))
    res = int(torch.max(output[0], dim=1)[1])

    return res

# Example: classify one retweet and print the verdict.
label = predict('"RT @paigeleskin: late scoop: @Shopify is giving its employees �?5,000 worldwide �?each $1,000 to buy whatever equipment they need to work from home during the coronavirus outbreak: https://www.businessinsider.com/coronavirus-shopify-employees-work-from-home-employees-1000-bonus-office-2020-3"')
verdict = 'Human' if label != 1 else 'Bot'
print(verdict)


In [None]:
# Predict every test comment one at a time and report the metrics.
def evaluate_on_testset(model):
    """Print a classification report for the test set and return its accuracy.

    Reads the notebook-level ``test_data`` and predicts through ``predict``;
    the ``model`` argument is not used inside this function.

    Args:
        model: Accepted for call-site compatibility.

    Returns:
        Fraction of test comments whose predicted label equals the true label.
    """
    test_comments = test_data['comment']
    test_labels = test_data['label'].tolist()

    pred_labels = [predict(comment) for comment in tqdm(test_comments)]
    print(metrics.classification_report(test_labels, pred_labels))

    matches = sum(1 for guess, truth in zip(pred_labels, test_labels) if guess == truth)
    return matches / len(pred_labels)

# Evaluate on the held-out test set and report the accuracy.
test_accuracy = evaluate_on_testset(model)
print("testing_set_accuracy={}".format(test_accuracy))

In [None]:
# Example: classify a short plain sentence.
label = predict('we live in the world')
verdict = 'Human' if label != 1 else 'Bot'
print(verdict)

In [None]:
# Example: classify another retweet.
label = predict('RT @Jamierodr14: NEVER FORGET: Pelosi Held Up Vote on Coronavirus So Democrats Could Run Ads Against GOP on Super Tuesday! Now Democrats & the Fake Mainstream Media is blaming President @realDonaldTrump for the coronavirus! Disgraceful Liars! Agree🙋‍♀�?https://www.thegatewaypundit.com/2020/03/never-forget-pelosi-held-up-vote-on-coronavirus-so-democrats-could-run-ads-against-gop-on-super-tuesday/')
verdict = 'Human' if label != 1 else 'Bot'
print(verdict)