In [None]:
# Upgrade the `transformers` package to the newest available release.
# The version is not pinned, so the installed release can differ between runs.
!pip install transformers --upgrade

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import transformers
import pandas as pd
import numpy as np
import sys
import logging
from sklearn.model_selection import train_test_split
import time
import datetime
import random
import os
from os import listdir

In [None]:
from transformers import BertTokenizer
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
# Load the tokenizer that belongs to the pretrained 'bert-base-cased' checkpoint;
# it is used below to turn raw text into token IDs.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
input_path_formal_male = 'Gender tagged corpus English/male/'
input_path_formal_female = 'Gender tagged corpus English/female/'


def _read_corpus_dir(directory, limit=50000):
    """Return the UTF-8 contents of at most `limit` files found in `directory`.

    Args:
        directory: Path of the folder whose files are read.
        limit: Maximum number of directory entries to read.

    Returns:
        A list with one string per file that was read.

    Note:
        `listdir` returns entries in arbitrary order, so which files fall
        inside `limit` depends on the filesystem.
    """
    texts = []
    for file_name in listdir(directory)[:limit]:
        # `with` guarantees the handle is closed even if read() raises
        # (for example on a decoding error).
        with open(os.path.join(directory, file_name), 'r', encoding='utf-8') as handle:
            texts.append(handle.read())
    return texts


formal_male_data = _read_corpus_dir(input_path_formal_male)
formal_female_data = _read_corpus_dir(input_path_formal_female)

# The informal corpus is not loaded in this notebook; the lists stay empty.
informal_male_data = []
informal_female_data = []

# Label convention used throughout: 1 = male-authored text, 0 = female-authored text.
all_data = formal_male_data + formal_female_data
all_labels = [1] * len(formal_male_data) + [0] * len(formal_female_data)

# Shuffle texts and labels together so each text keeps its label.
zipped_list = list(zip(all_data, all_labels))
random.shuffle(zipped_list)

# Rebuild the two parallel lists; unlike `zip(*zipped_list)` this also works
# when no files were found.
all_data = [text for text, _ in zipped_list]
all_labels = [label for _, label in zipped_list]

In [None]:
# Hold out 20% of the shuffled corpus as the test set.
train_text, test_text, train_labels, test_labels = train_test_split(
    all_data,
    all_labels,
    test_size=0.2,
    shuffle=True,
)

# Report the size of every split (texts and labels must match pairwise).
for split_part in (train_text, train_labels, test_text, test_labels):
    print(len(split_part))

In [None]:
# Walk the first training example through the tokenizer step by step.
sample_text = train_text[0]
sample_tokens = tokenizer.tokenize(sample_text)
sample_token_ids = tokenizer.convert_tokens_to_ids(sample_tokens)

# Raw text together with its label.
print('Original: ', sample_text, train_labels[0])
print("len(Original) = ", len(sample_text))
print("\n")

# The text split into tokens.
print('Tokenized: ', sample_tokens)
print("len(Tokenized) = ", len(sample_tokens))
print("\n")

# The tokens mapped to their vocabulary IDs.
print('Token IDs: ', sample_token_ids)
print("len(Token IDs) = ", len(sample_token_ids))
print("\n")

In [None]:
def text_to_id(tokenizer, text_list):
    """
    Encode every text in `text_list` into a list of token IDs.

    Each text is passed to `tokenizer.encode(..., add_special_tokens=True)`,
    so the tokenizer is asked to add its special tokens to the IDs it returns.

    Args:
        tokenizer: Object exposing an `encode(text, add_special_tokens=...)` method.
        text_list: Iterable of texts to encode.

    Returns:
        A list holding one encoded ID sequence per input text, in input order.
    """
    return [
        tokenizer.encode(text, add_special_tokens=True)
        for text in text_list
    ]

In [None]:
# Encode both splits with the same tokenizer.
train_text_ids, test_text_ids = (
    text_to_id(tokenizer, texts) for texts in (train_text, test_text)
)

# Show the first training example as text and as IDs, then the split sizes.
print('Original: {}\n'.format(train_text[0]))
print('Token IDs: {}\n'.format(train_text_ids[0]))
print("len(train_text_ids) = {}\n".format(len(train_text_ids)))
print("len(test_text_ids) = {}".format(len(test_text_ids)))

In [None]:
# Token counts per encoded example, computed once per split.
train_lengths = [len(sen) for sen in train_text_ids]
test_lengths = [len(sen) for sen in test_text_ids]

print('Train: max sentence length: ', max(train_lengths))
print('Train: Min sentence length: ', min(train_lengths))
print('Test: max sentence length: ', max(test_lengths))
print('Test: Min sentence length: ', min(test_lengths))

# for aaa in train_text_ids:
#     if len(aaa) == 1558:
#         print(aaa)
#         break
# print('\n' * 20)
# for each in test_text_ids:
#     if len(each) == 1558:
#         print(each)
#         break

In [None]:
def padding_truncating(input_ids_list, max_length):
    """
    Bring every ID sequence to exactly `max_length` entries.

    Shorter sequences are right-padded with 0; longer ones keep only their
    first `max_length` IDs, so anything at the tail (including a trailing
    special token) is cut off.

    Args:
        input_ids_list: List of ID lists.
        max_length: Target length of every returned sequence.

    Returns:
        A new list of new ID lists; the inputs are not modified.
    """
    def fit(ids):
        missing = max_length - len(ids)
        if missing > 0:
            return ids + [0] * missing
        return ids[:max_length]

    return [fit(ids) for ids in input_ids_list]

def get_attention_masks(pad_input_ids_list):
    """
    Build one attention mask per padded ID sequence.

    A position gets mask value 1 when its token ID is greater than 0 and
    mask value 0 otherwise (ID 0 is what `padding_truncating` pads with).

    Args:
        pad_input_ids_list: List of ID lists.

    Returns:
        A list of 0/1 lists with the same shape as the input.
    """
    return [
        [1 if token_id > 0 else 0 for token_id in sequence]
        for sequence in pad_input_ids_list
    ]

In [None]:
# Use the mean tokenized length of the training examples as the common
# sequence length for both splits.
train_text_avg = int(np.mean([len(sen) for sen in train_text_ids]))
print("train_text_avg:", train_text_avg)

# Never exceed the longest input the tokenizer reports for this checkpoint:
# sequences longer than the model's position limit make the forward pass fail.
max_seq_length = min(train_text_avg, tokenizer.model_max_length)

# Pad/truncate train and test to the same length so they can be batched.
train_padding_list = padding_truncating(train_text_ids, max_length=max_seq_length)
test_padding_list = padding_truncating(test_text_ids, max_length=max_seq_length)

# Masks mark real tokens (1) versus padding (0).
train_attention_masks = get_attention_masks(train_padding_list)
test_attention_masks = get_attention_masks(test_padding_list)

# Every example must still have exactly one label, one mask and one ID list.
assert len(train_text) == len(train_labels) == len(train_attention_masks) == len(train_padding_list)
assert len(test_text) == len(test_labels) == len(test_attention_masks) == len(test_padding_list)

In [None]:
# Carve a validation set (25% of the current training data) out of the
# training split; IDs, labels and masks are split with the same permutation.
(
    train_padding_list, validation_padding_list,
    train_labels, validation_labels,
    train_attention_masks, validation_attention_masks,
) = train_test_split(
    train_padding_list,
    train_labels,
    train_attention_masks,
    test_size=0.25,
)

# train_padding_list = train_padding_list[:3]
# validation_padding_list = [train_padding_list[-1]]
# train_labels = train_labels[:3]
# validation_labels = [train_labels[-1]]
# train_attention_masks = train_attention_masks[:3]
# validation_attention_masks = [train_attention_masks[-1]]

# Labels, masks and padded IDs must line up one-to-one within every split.
for split_labels, split_masks, split_ids in (
    (train_labels, train_attention_masks, train_padding_list),
    (validation_labels, validation_attention_masks, validation_padding_list),
    (test_labels, test_attention_masks, test_padding_list),
):
    assert len(split_labels) == len(split_masks) == len(split_ids)

print("len(train_labels) = {}\nlen(validation_labels) = {}\nlen(test_labels) = {}".format(len(train_labels), len(validation_labels), len(test_labels)))

In [None]:
# Convert the Python lists of each split into tensors: padded IDs, attention
# masks and labels. The label names are rebound from lists to tensors here.
train_inputs, train_masks, train_labels = (
    torch.tensor(train_padding_list),
    torch.tensor(train_attention_masks),
    torch.tensor(train_labels),
)

validation_inputs, validation_masks, validation_labels = (
    torch.tensor(validation_padding_list),
    torch.tensor(validation_attention_masks),
    torch.tensor(validation_labels),
)

test_inputs, test_masks, test_labels = (
    torch.tensor(test_padding_list),
    torch.tensor(test_attention_masks),
    torch.tensor(test_labels),
)

In [None]:
batch_size = 64


def _build_loader(inputs, masks, labels, sampler_cls):
    """Wrap three aligned tensors in a dataset, a sampler and a DataLoader.

    Returns the `(dataset, sampler, dataloader)` triple; the loader yields
    batches of `batch_size` in the order chosen by `sampler_cls`.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training set: batches drawn in random order.
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler
)

# Validation set: also drawn in random order.
validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_masks, validation_labels, RandomSampler
)

# Test set: read sequentially.
test_data, test_sampler, test_dataloader = _build_loader(
    test_inputs, test_masks, test_labels, SequentialSampler
)

In [None]:
# Load the pretrained 'bert-base-cased' weights into a sequence classifier
# with two output labels; attentions and hidden states are not returned.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)
# Move the model to the selected device (left as the cell's last expression,
# so the notebook displays the model).
model.to(device)

In [None]:
# Optimizer over all model parameters with learning rate 2e-5 and epsilon 1e-8.
# NOTE(review): `AdamW` here is the class imported from `transformers`; newer
# releases may deprecate it in favour of `torch.optim.AdamW` — confirm against
# the installed version.
optimizer = AdamW(model.parameters(), lr = 2e-5, eps = 1e-8)

In [None]:
epochs = 3

# One scheduler step is taken per training batch, so the schedule spans
# (batches per epoch) x (epochs) steps.
total_steps = epochs * len(train_dataloader)
print("total_steps = {}".format(total_steps))

# Linear learning-rate schedule over `total_steps` with no warmup steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array of scores, one row per example; the predicted class
            is the argmax along axis 1.
        labels: Array of true class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    expected = labels.flatten()
    predicted = np.argmax(preds, axis=1).flatten()
    hits = np.sum(predicted == expected)
    return hits / len(expected)

def format_time(elapsed):
    """Format a duration given in seconds as a `timedelta` string (h:mm:ss).

    The value is rounded to the nearest whole second before formatting.
    """
    whole_seconds = int(round(elapsed))
    return str(datetime.timedelta(seconds=whole_seconds))

In [None]:
# seed_val = 12345
# random.seed(seed_val)
# np.random.seed(seed_val)
# torch.manual_seed(seed_val)
# Average training loss of every epoch, in epoch order.
loss_values = []

for epoch_i in range(epochs):
    # ---------------------------- Training ----------------------------
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    t0 = time.time()
    total_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress line for every batch except the first one.
        if step != 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # Each batch is (input ids, attention mask, labels).
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        # Labels are passed in, so the first output is the loss.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]
        total_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Advance the learning-rate schedule once per batch.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))

    # --------------------------- Validation ---------------------------
    print("")
    print("Running Validation...")
    t0 = time.time()
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        # No labels are passed, so the first output is the logits.
        with torch.no_grad():
            outputs = model(b_input_ids, attention_mask=b_input_mask)

        logits = outputs[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        # Weight each batch accuracy by its size: the last batch can be
        # smaller, and an unweighted mean of batch accuracies would give its
        # examples more influence than the others.
        n_examples = len(label_ids)
        eval_accuracy += flat_accuracy(logits, label_ids) * n_examples
        nb_eval_examples += n_examples
        nb_eval_steps += 1

    print("  Accuracy: {0:.2f}".format(eval_accuracy / nb_eval_examples))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))

print("")
print("Training complete!")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot styling: dark grid, enlarged fonts, wide default figure.
sns.set(style='darkgrid', font_scale=1.5)
plt.rcParams["figure.figsize"] = (12,6)

# One point per epoch: the average training loss collected during training.
fig, ax = plt.subplots()
ax.plot(loss_values, 'b-o')
ax.set_title("Training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
plt.show()

In [None]:
print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))
model.eval()
# These two lists are initialised but not filled in this cell.
predictions , true_labels = [], []
idx = 0      # number of the last processed batch (stays 0 if there are none)
correct = 0  # running count of correctly classified test examples
for idx, batch in enumerate(test_dataloader, start=1):
    print("Batch {}".format(idx))
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    # No labels are passed, so the first output is the logits.
    with torch.no_grad():
        outputs = model(b_input_ids, attention_mask=b_input_mask)
    logits = outputs[0]
    # Use torch.argmax so the prediction stays on the same device as the
    # labels; NumPy cannot read a tensor that lives on the GPU.
    pred = torch.argmax(logits, dim=1)
    correct += (pred == b_labels).sum().item()
    print("correct = {}\n".format(correct))

print('DONE.')
print("Total correct = ", correct)
print("Test accuracy = {0:.2f}".format(correct / len(test_inputs)))

In [None]:
saved_model_dir = "saved models/Pytorch"

# Create the target folder (and any missing parents) in one call; an
# already existing folder is not an error.
os.makedirs(saved_model_dir, exist_ok=True)

# Store the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same folder.
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

In [None]:
# Reload the model and tokenizer from the folder they were saved to.
# NOTE(review): the reloaded model is not moved to `device` in this cell.
load_model = BertForSequenceClassification.from_pretrained(saved_model_dir)       
load_tokenizer = BertTokenizer.from_pretrained(saved_model_dir)