In [79]:
from transformers import BertTokenizer, BertModel
from transformers import pipeline
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [80]:
# LOAD PROCESSED CORPUS INTO NOTEBOOK

import os

folder_path = './corpus-processed'
authors_data = {}

for filename in os.listdir(folder_path):
    args = filename.split(",")
    if args[0] not in authors_data:
        authors_data[args[0]] = []

    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            file_lines = [line.rstrip('\n') for line in file]
            authors_data[args[0]].extend(file_lines)

In [81]:
# EXTRACT AUTHOR NAMES

authors = list(authors_data.keys())
author_ids = {}
for i in range(len(authors)):
    author_ids[authors[i]] = i
print(author_ids)
print(authors)

{'dekker': 0, 'fletcher': 1, 'ford': 2, 'jonson': 3, 'massinger': 4, 'middleton': 5, 'rowley': 6, 'shakespeare': 7, 'webster': 8}
['dekker', 'fletcher', 'ford', 'jonson', 'massinger', 'middleton', 'rowley', 'shakespeare', 'webster']


In [82]:
# TOKENIZE CORPUS PER AUTHOR

import json

data = []
author_data_tokenized = {}
for author in authors:
    author_data = authors_data[author]
    tokenized_sentences = [tokenizer.tokenize(s) for s in author_data]
    author_data_tokenized[author] = tokenized_sentences

with open('./torch-cache/tokenized_author_data.json', 'w') as file:
    json.dump(author_data_tokenized, file)

In [83]:
# GENERATE BERT INPUT SEQUENCES, MASKS, AND LABELS

MAX_SEQUENCE_LEN = 128  # can go up to 512
bert_inputs = []
bert_inputs_readable = []
bert_input_masks = []
data_labels = []

for author in authors:
    sentences = author_data_tokenized[author]
    label = author_ids[author]

    current_input = ["[CLS]"]
    for s in sentences:
        if len(s) + len(current_input) + 1 <= MAX_SEQUENCE_LEN - 1:
            current_input.extend(s)
            current_input.append(".")
        else:
            current_input.append("[SEP]")
            mask = [1 for _ in range(len(current_input))]

            while len(current_input) != MAX_SEQUENCE_LEN:
                current_input.append("[PAD]")
                mask.append(0)

            bert_inputs.append(tokenizer.convert_tokens_to_ids(current_input))
            bert_inputs_readable.append(current_input)
            bert_input_masks.append(mask)
            data_labels.append(label)
            current_input = ["[CLS]"]

print(f"Total number of inputs: {len(bert_inputs)}")

Total number of inputs: 20112


In [84]:
# SAVE DATA TO DISK

import torch

x_inputs = torch.tensor(bert_inputs)
x_masks = torch.tensor(bert_input_masks)
y_labels = torch.tensor(data_labels)

torch.save(x_inputs, f'./torch-cache/x_inputs_s{MAX_SEQUENCE_LEN}.pt')
torch.save(x_masks, f'./torch-cache/x_masks_s{MAX_SEQUENCE_LEN}.pt')
torch.save(y_labels, f'./torch-cache/y_labels_s{MAX_SEQUENCE_LEN}.pt')

with open('./torch-cache/authors.json', 'w') as file:
    json.dump(authors, file)

In [85]:
test = torch.load('./torch-cache/x_inputs.pt', weights_only=True)
test_2 = torch.load('./torch-cache/y_labels.pt', weights_only=True)
print(test[0], test_2[0])

tensor([  100,  1037,  2395,  4607,  2957,  3804,  1997, 13370,  2894,  2085,
         2003,  1996,  3467,  1997,  2256, 27648,  2081, 14013,  2621,  2011,
         2023,  2365,  1997,  2259,  1998,  2035,  1996,  8044,  2008, 10223,
         5596,  2588,  2256,  2160,  1999,  1996,  2784,  8945, 25426,  1997,
         1996,  4153,  3950,  2085,  2024,  2256, 11347,  5391,  2007, 13846,
        29586,  2015,  2256, 18618,  2608,  5112,  2039,  2005, 10490,  2256,
         8665, 21862,  6824,  2015,  2904,  2000, 12831,  6295,  2256, 21794,
        20691,  2000, 26380,  5761, 11844,  9425,  5999,  2162,  6045,  2232,
        17966,  2010, 15968,  2392,  1998,  2085,  2612,  1997, 15986, 25007,
        26261,  2098,  2015,  2000, 25966,  1996,  9293,  1997, 19725,  4748,
        14028, 12086,  2002,  4880,  2869,  9152, 14905,  2135,  1999,  1037,
         3203,  1005,  1055,  4574,  2000,  1996,  5869,  6895, 24918, 24820,
         1997,  1037, 11320,  2618,   102,     0,     0,     0])

In [86]:
print(x_inputs[0], y_labels[0])

tensor([  101,  4607,  2048, 24812,  2635,  9098,  2189,  9391,  2306,  1012,
         4607,  2007,  2048,  3564,  2006,  1037,  3242,  3402,  9891,  1996,
        11002,  1012,  2006,  2033,  2515,  2189,  5247,  2023,  2614,  1012,
         2006,  2033,  2008,  5223,  2035,  8499,  1012,  2026, 24665, 20113,
         2935,  1012,  2024,  2017,  2045,  2007,  2115, 26892, 17125,  1012,
         5292,  8024,  2017,  7179,  1012,  2106,  2025,  1045, 14187,  2017,
         2006,  2115,  3268,  2000,  3422,  2008,  3904, 22995,  1005,  1040,
         2149,  1012,  2045,  2125,  2003,  1005,  1056,  2017,  2008, 12419,
         2033,  2007,  2023,  5005,  1012,  4654, 13765,  3372,  2048, 24812,
         1012,  2339,  2003,  2026,  2293,  1005,  1055,  2004,  4502,  1000,
        14931,  2061, 11844,  1998,  7570, 18752,  2094,  1012,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])