In [44]:
from transformers import BertTokenizer, BertModel
from transformers import pipeline
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [45]:
# LOAD PROCESSED CORPUS INTO NOTEBOOK

import os

folder_path = './corpus-processed'
authors_data = {}

for filename in os.listdir(folder_path):
    args = filename.split(",")
    if args[0] not in authors_data:
        authors_data[args[0]] = []

    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            file_lines = [line.rstrip('\n') for line in file]
            authors_data[args[0]].extend(file_lines)

In [46]:
# EXTRACT AUTHOR NAMES

authors = list(authors_data.keys())
author_ids = {}
for i in range(len(authors)):
    author_ids[authors[i]] = i
print(author_ids)

{'dekker': 0, 'fletcher': 1, 'ford': 2, 'jonson': 3, 'massinger': 4, 'middleton': 5, 'rowley': 6, 'shakespeare': 7, 'webster': 8}


In [47]:
# TOKENIZE CORPUS PER AUTHOR

import json

data = []
author_data_tokenized = {}
for author in authors:
    author_data = authors_data[author]
    tokenized_sentences = [tokenizer.tokenize(s) for s in author_data]
    author_data_tokenized[author] = tokenized_sentences

with open('./torch-cache/tokenized_author_data.json', 'w') as file:
    json.dump(author_data_tokenized, file)

In [48]:
# GENERATE BERT INPUT SEQUENCES, MASKS, AND LABELS

MAX_SEQUENCE_LEN = 256  # can go up to 512
bert_inputs = []
bert_inputs_readable = []
bert_input_masks = []
data_labels = []

# TODO: missing period between sentences!! oof

for author in authors:
    sentences = author_data_tokenized[author]
    label = author_ids[author]

    current_input = ["CLS"]
    for s in sentences:
        if len(s) + len(current_input) <= MAX_SEQUENCE_LEN - 1:
            current_input.extend(s)
        else:
            current_input.append("[SEP]")
            mask = [1 for _ in range(len(current_input))]

            while len(current_input) != MAX_SEQUENCE_LEN:
                current_input.append("[PAD]")
                mask.append(0)

            bert_inputs.append(tokenizer.convert_tokens_to_ids(current_input))
            bert_inputs_readable.append(current_input)
            bert_input_masks.append(mask)
            data_labels.append(label)
            current_input = ["CLS"]

print(f"Total number of inputs: {len(bert_inputs)}")

Total number of inputs: 10058


In [49]:
# SAVE DATA TO DISK

import torch

x_inputs = torch.tensor(bert_inputs)
x_masks = torch.tensor(bert_input_masks)
y_labels = torch.tensor(data_labels)

torch.save(x_inputs, './torch-cache/x_inputs_256.pt')
torch.save(x_masks, './torch-cache/x_masks_256.pt')
torch.save(y_labels, './torch-cache/y_labels_256.pt')

with open('./torch-cache/authors.json', 'w') as file:
    json.dump(authors, file)