In [2]:
import torch
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [217]:
import wandb

LEARNING_RATE = 0.005
BATCH_SIZE = 256
HIDDEN_SIZE = 16
NUM_EPOCHS = 4
CONTEXT_SIZE = 64

EXTRA_MODEL_PARAMS = dict(hidden_size=HIDDEN_SIZE, num_heads=4, embedding_size=32, num_outputs=2)

experiment_config = {
    "learning_rate": LEARNING_RATE,
    "architecture": "transformer",
    "dataset": "synthetic-wiki-one-meelyun-sentences",
    "epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "hidden_size": HIDDEN_SIZE,
    "context_size": CONTEXT_SIZE,
    "extra_model_params": EXTRA_MODEL_PARAMS
}

wandb.init(
    project="tiny_sentence_tokenizer",  # https://wandb.ai/josephc/tiny_sentence_tokenizer
    config=experiment_config
)

# Datasets:

In [196]:
import bisect
import bz2
import os
import random
from typing import List, Optional, Union

from torch.utils.data import Dataset

class SentenceSplitDataset:
    def __init__(self, path_or_sentences: Union[List, os.PathLike], context_size: int, sentence_breaks: List[str] = [" ", "  ", "\n", "\r\n", "\t", "", "\n\n\n\n", "    "]):
        self.context_size = context_size
        self.characters_read = 0
        self.sentence_offsets = list()
        self.sentences = list()
        self.sentence_breaks = sentence_breaks
        
        input_lines = None
        fin = None
        if isinstance(path_or_sentences, list):
            input_lines = path_or_sentences
        else:
            fin = bz2.open(path_or_sentences, 'rt')
            input_lines = fin
        for line in input_lines:
            self.sentence_offsets.append(self.characters_read)
            line = line.strip()
            self.sentences.append(line)
            self.characters_read += len(line)+1
        if fin is not None:
            fin.close()
    def __len__(self):
        #return len(self.sentences)
        return self.characters_read
    def get_sentence(self, idx: int) -> str:
        return self.sentences[idx]
    def get_sentence_idx_with_character(self, idx: int) -> int:
        return bisect.bisect_right(self.sentence_offsets, idx)-1
    def get_sentence_with_character(self, idx: int, context: Optional[int] = None) -> str:
        # Map the index to the sentence.
        sentence_idx = self.get_sentence_idx_with_character(idx)
        sentence = self.sentences[sentence_idx]
        position_in_sentence = idx - self.sentence_offsets[sentence_idx]
        assert position_in_sentence >= 0
        if context is None:
            return sentence
        return sentence[max(0, position_in_sentence-context):position_in_sentence], sentence[position_in_sentence:position_in_sentence+context]
    def __getitem__(self, idx: int):
        prefix, suffix = self.get_sentence_with_character(idx, context=self.context_size)
        end_of_sentence = len(suffix) == 0
        suffix += random.choice(self.sentence_breaks) + self.get_sentence(self.get_sentence_idx_with_character(idx+1))
        prefix = prefix.rjust(self.context_size)[-self.context_size:]
        suffix = suffix.ljust(self.context_size)[:self.context_size]
        #suffix = suffix.ljust(self.context_size)
        return prefix, suffix, end_of_sentence

In [14]:
ds = SentenceSplitDataset(path_to_sentences="./one_meelyun_sentences.bz2", context_size=CONTEXT_SIZE)

In [15]:
print(ds[0])
print(ds[5001])
print(ds[5036])
print(ds[5037])
print(ds[5038])
for i in range(0, 2000):
    a, b, c = ds[i]
    if c:
        print(f"{i}: {ds[i]}")

('                        ', 'County and municipal cou', False)
('            The Exchange', ' Building opened in 1854', False)
('pened in 1854, part of t', 'he building was later us', False)
('ened in 1854, part of th', 'e building was later use', False)
('ned in 1854, part of the', ' building was later used', False)
69: ('lected every four years.', '  The by-census indicate', True)
215: (' by more than 1 million.', " 'On the Marble Cliffs' ", True)
378: ("ion in Hitler's Germany.", '\tHomer brief description', True)
423: ("ion in the 'Iliad'Homer.", '\n\n\n\nPublic hearings were', True)
545: ('co, and Washington, D.C.', '\tOn July 10, both forces', True)
596: ('ced each other in Kyoto.', "    Monmouth's status as", True)
733: (' November 2002 election.', '\n\n\n\nOpiates are hypothes', True)
796: ('ate aggression and rage.', ' The town celebrated its', True)
840: (' its centennial in 2004.', '  In 1681 Anthony Ashley', True)
964: (' or recourse to a trial.', " Crater Lake's features 

In [198]:
import bisect
import bz2
import os
import random
from typing import List, Optional, Union

from torch.utils.data import Dataset

class BalancedEOSDataset:
    def __init__(
            self, 
            path_or_sentences: Union[List[str], os.PathLike], 
            context_size: int, 
            sentence_breaks: List[str] = ["", " ", " ", " ", "  ", "\n", "\r\n", "\t", "\n\n\n\n", "    ",],
            bad_splits: List[str] = ["\n", "\n", "\n", "\n\n", "\n\n\n", "\r\n", "\n\t", "\n>"],
            p_bad_split: float = 0.01,
            p_change_punctuation: float = 0.1,
            p_drop_punctuation: float = 0.001,
            p_all_lower: float = 0.1,
            p_all_caps: float = 0.1,
    ):
        self.context_size = context_size
        self.sentences = list()
        self.sentence_breaks = sentence_breaks
        self.bad_splits = bad_splits
        self.p_bad_split = p_bad_split
        self.p_change_punctuation = p_change_punctuation
        self.p_drop_punctuation = p_drop_punctuation
        self.p_all_lower = p_all_lower
        self.p_all_caps = p_all_caps
        if isinstance(path_or_sentences, list):
            self.sentences = path_or_sentences
        else:
            with bz2.open(path_or_sentences, 'rt') as fin:
                for line in fin:
                    line = line.strip()
                    self.sentences.append(line)
    def __len__(self):
        return (len(self.sentences)-2)*2  # Double since evens will be 'not end of sentence'.
    def __getitem__(self, idx: int):
        end_of_sentence = (idx%2 != 0)
        if not end_of_sentence:
            sentence = self.sentences[idx//2]
            split_point = random.randint(1, len(sentence)-1)
            prefix = sentence[:split_point]
            suffix = sentence[split_point:]
            prefix = random.choice(self.sentences) + " " + prefix
            suffix = suffix + " " + random.choice(self.sentences)
            if random.random() < self.p_bad_split:  # Small chance of a bad split:
                prefix += random.choice(self.bad_splits)
            prefix = prefix.rjust(self.context_size)
            suffix = suffix.ljust(self.context_size)
        else: # End of sentence.
            prefix = self.sentences[idx//2]
            if random.random() > 0.5: # 50/50 shot of just padding vs adding a previous sentence
                prefix = random.choice(self.sentences) + " " + prefix
            if prefix[-1] == "." and random.random() < self.p_change_punctuation:
                prefix = prefix[:-1] + random.choice([".", "!", "?"])  # Small chance to omit, too.
            if prefix[-1] == "." and random.random() < self.p_drop_punctuation:
                prefix = prefix[:-1]
            prefix = prefix.rjust(self.context_size+1)
            suffix = self.sentences[(idx//2)+1]
            if random.random() > 0.5:
                suffix = suffix + " " + random.choice(self.sentences)
            suffix = (random.choice(self.sentence_breaks) + suffix.ljust(self.context_size))
        # Cap size at th same number of _bytes_.
        prefix = prefix[-self.context_size:]
        suffix = suffix[:self.context_size]
        if random.random() < self.p_all_lower:
            prefix = prefix.lower()
            suffix = suffix.lower()
        elif random.random() < self.p_all_caps:
            prefix = prefix.upper()
            suffix = suffix.upper()
        return prefix, suffix, end_of_sentence

In [199]:
ds = BalancedEOSDataset("./one_meelyun_sentences.bz2", context_size=CONTEXT_SIZE)
#ds = BalancedEOSDataset(ds.sentences, context_size=CONTEXT_SIZE)

In [201]:
print(ds[0])
print(ds[5001])
print(ds[5036])
print(ds[5037])
print(ds[5038])

('redit for the torigking expedition. county and municipal council', 's are popularly elected every four years. the first two years of', False)
('sure of President George W. Bush and Vice President Dick Cheney.', ' Sabine Baring-Gould says of this saint "he was much edified wit', True)
(' was routine administration and quite limited. By 2006, there we', 're 36 airports and one heliport. The final stage consisted of a ', False)
('               BY 2006, THERE WERE 36 AIRPORTS AND ONE HELIPORT.', '    KENNEDY LATER SAID THAT HIS FOUR DAY-VISIT TO IRELAND WAS ON', True)
('that his four day-visit to Ireland was one of his most enjoyable', '. Nevertheless, because non-Mormons in the east would not have c', False)


In [210]:
validation_sentences = [
    "Hello, world!",
    "Dr. M. Emdee, M.D. said that her upbringing in Washington D.C. was formative.",
    "He said, \"Hey, can you hear me?\"",
    "Obviously, this is self-contained.",
    "I think it's safe to say that this is self contained: a container.",
    "A statement followed by a question and separated by a colon: an effective journalistic technique?",
    "Every room is a panic room if you just give me a minute.",
    "Has anyone really been as far as want even to look more like?",
    "The... the thing?",
    "1. The first number.",
    '"Bruh, can you believe it?"',
    '"I can\'t believe it bruh."',
]
# NOTE: Using the SentenceSplitDataset instead of the BalancedEOSDataset.
validate_ds = SentenceSplitDataset(path_or_sentences=validation_sentences, context_size=CONTEXT_SIZE, sentence_breaks=[" ", "\n"])

# Models:

In [None]:
from typing import Iterator, List, Union
import torch.nn as nn
import torch.nn.functional as F
from unidecode import unidecode


class RNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(256, hidden_size)
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    @staticmethod
    def strings_to_tensor(sentences: list) -> torch.Tensor:
        longest = max([len(sentence.encode("utf-8")) for sentence in sentences])
        out = torch.zeros(len(sentences), longest, 256)
        for batch_idx, s in enumerate(sentences):
            s = s.encode("utf-8")
            for byte_idx in range(len(s)):
                out[batch_idx, byte_idx, int(s[byte_idx])] = 1.0
        return out

    def forward(self, x, hidden = None):
        hidden = F.tanh(self.i2h(x) + self.h2h(hidden))
        output = self.h2o(hidden)
        output = self.softmax(output)
        return output, hidden

    def _init_hidden(self, height: int = 1):
        return torch.zeros(height, self.hidden_size)

    def split_paragraph_iter(self, p: str, min_threshold: Optional[float] = None) -> Iterator[str]:
        self.eval()
        h = self._init_hidden()
        i = torch.zeros(1, 256)
        last_sentence = ""
        for character in p:
            last_sentence += character
            # Convert character to a byte.
            b = character.encode("utf-8")
            for b_value in b:
                i[0, int(b_value)] = 1.0
                out, h = self.forward(i, h)
                i[0, int(b_value)] = 0.0
            out = out.cpu().numpy()
            if out[0,1] >= out[0,0] or (min_threshold is not None and out[0,1] > min_threshold):
                yield last_sentence
                last_sentence = ""
        yield last_sentence


def run_inference(m, prefix: List[str], suffix: Optional[List[str]]):
    prefix = RNN.strings_to_tensor(prefix).to(DEVICE)
    if suffix is not None:
        suffix = RNN.strings_to_tensor(suffix).to(DEVICE)
    hidden = m._init_hidden().to(DEVICE)
    for i in range(0, prefix.shape[1]):
        out, hidden = m(prefix[:,i,:], hidden)
    if suffix is not None:
        for i in range(0, suffix.shape[1]):
            out, hidden = m(suffix[:,i,:], hidden)
    return out


n_hidden = HIDDEN_SIZE
n_categories = 2
model = RNN(n_hidden, n_categories)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.NLLLoss()

In [82]:
from typing import Iterator, List, Union
import torch.nn as nn
import torch.nn.functional as F
from unidecode import unidecode

class TFSentenceSplit(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, embedding_size: int, num_outputs: int):
        super().__init__()
        self.position_embedding = nn.Embedding(num_embeddings=256, embedding_dim=embedding_size)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_size, nhead=num_heads, batch_first=True)
        self.inference_head = nn.Linear(embedding_size, num_outputs)

    def forward(self, x):
        # Assumes x is batch first like a normal, sensible person.
        # out = self.encode_layer(torch.rand(batch_size, seq_length, embedding_size))
        # Use torch.LongTensor to encode.
        x = self.position_embedding(x)
        x = self.encoder_layer(x) # Out: [batch_size, seq_len, embedding_size]
        x = self.inference_head(x)
        x = F.softmax(x, dim=-1)
        return x[:,-1,:].squeeze(1)

    @staticmethod
    def strings_to_tensor(sentences: list) -> torch.Tensor:
        longest = max([len(sentence.encode("utf-8")) for sentence in sentences])
        out = torch.zeros((len(sentences), longest), dtype=torch.int64)  # torch.LongTensor
        for batch_idx, s in enumerate(sentences):
            s = s.rjust(longest).encode("utf-8")[-longest:]  # Pad the left with spaces so it's aligned, then convert to bytes and truncate.
            for byte_idx in range(len(s)):
                out[batch_idx, byte_idx] = int(s[byte_idx])
        return out

def run_inference(m, prefix: List[str], suffix: List[str]):
    prefix = TFSentenceSplit.strings_to_tensor(prefix).to(DEVICE)
    #suffix = TFSentenceSplit.strings_to_tensor(suffix).to(DEVICE)
    return m.forward(prefix)

model = TFSentenceSplit(**EXTRA_MODEL_PARAMS)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

In [211]:
from typing import Iterator, List, Union
import torch.nn as nn
import torch.nn.functional as F
from unidecode import unidecode

class BidirectionalTransformerSentenceSplit(nn.Module):
    def __init__(self, hidden_size: int, num_heads: int, embedding_size: int, num_outputs: int):
        super().__init__()
        self.position_embedding = nn.Embedding(num_embeddings=256, embedding_dim=embedding_size)
        self.prefix_encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_size, nhead=num_heads, batch_first=True)
        self.suffix_encoder_layer = nn.TransformerEncoderLayer(d_model=embedding_size, nhead=num_heads, batch_first=True)
        self.inference_head = nn.Linear(embedding_size, num_outputs)

    def forward(self, prefix, suffix):
        # Assumes x is batch first like a normal, sensible person.
        # out = self.encode_layer(torch.rand(batch_size, seq_length, embedding_size))
        # Use torch.LongTensor to encode.
        prefix = self.position_embedding(prefix)
        suffix = self.position_embedding(suffix)
        x = self.prefix_encoder_layer(prefix) + self.suffix_encoder_layer(suffix) # Out: [batch_size, seq_len, embedding_size]
        x = self.inference_head(x)
        x = F.softmax(x, dim=-1)
        return x[:,-1,:].squeeze(1)

    @staticmethod
    def strings_to_tensor(sentences: list) -> torch.Tensor:
        longest = max([len(sentence.encode("utf-8")) for sentence in sentences])
        out = torch.zeros((len(sentences), longest), dtype=torch.int64)  # torch.LongTensor
        for batch_idx, s in enumerate(sentences):
            s = s.rjust(longest).encode("utf-8")[-longest:]  # Pad the left with spaces so it's aligned, then convert to bytes and truncate.
            for byte_idx in range(len(s)):
                out[batch_idx, byte_idx] = int(s[byte_idx])
        return out

def run_inference(m, prefix: List[str], suffix: List[str]):
    prefix = TFSentenceSplit.strings_to_tensor(prefix).to(DEVICE)
    suffix = TFSentenceSplit.strings_to_tensor(suffix).to(DEVICE)
    width = min(prefix.shape[1], suffix.shape[1])
    return m.forward(prefix[:,-width:], suffix[:,:width])

model = BidirectionalTransformerSentenceSplit(**EXTRA_MODEL_PARAMS)
model = model.to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

# Evaluation and Training Prep:

In [212]:
from torch.utils.data import DataLoader
total_examples = len(ds)  # Hack -- dunno the total example count.
train_size = int(total_examples*0.9)
validation_size = int(total_examples - train_size)
#test_size = total_examples - (train_size + validation_size)
#train_ds, validate_ds, test_ds = torch.utils.data.random_split(ds, [train_size, validation_size, test_size])
#train_ds, validate_ds = torch.utils.data.random_split(ds, [train_size, validation_size])

In [213]:
#train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
train_dataloader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)
#validate_dataloader = DataLoader(validate_ds, batch_size=BATCH_SIZE)
validate_dataloader = DataLoader(validate_ds, batch_size=BATCH_SIZE)  # Note this is defined in the data at the top.
#test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE)

In [214]:
def compute_tpr_fpr_tnr_fnr(predictions_logits, ground_truth_labels):
    # Assume predictions are a tensor of shape [batch, 2], same with ground truth.
    # Assume predictions are NORMALIZED along axis=1 ([_,0], [_,1]).
    # Assume [:,1] means 'yes, this is a break'.
    with torch.no_grad():
        pred = predictions_logits.cpu().numpy()
        gt = ground_truth_labels.cpu().numpy()
        
        tpr = 0
        fpr = 0
        tnr = 0
        fnr = 0
        for idx in range(0, pred.shape[0]):
            if gt[idx] < 0.5 or gt[idx] == False: # GT: Negative
                if pred[idx,0] > pred[idx,1]: # Pred: Negative
                    tnr += 1
                else: # Pred: Positive
                    fpr += 1
            else: # GT: Positive
                if pred[idx,0] > pred[idx,1]: # Pred: Negative:
                    fnr += 1
                else: # Pred: Positive
                    tpr += 1
        return tpr, fpr, tnr, fnr
                

In [215]:
assert compute_tpr_fpr_tnr_fnr(torch.tensor([[0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0]]), torch.tensor([0, 1, 0, 1])) == (1, 1, 1, 1)
print(compute_tpr_fpr_tnr_fnr(torch.tensor([[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.6, 0.4]]), torch.tensor([0, 0, 0, 0, 0])))
print(compute_tpr_fpr_tnr_fnr(torch.tensor([[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.6, 0.4]]), torch.tensor([1, 1, 1, 1, 1])))
print(run_inference(model, ["Only a."], ["butthead"]))
print(compute_tpr_fpr_tnr_fnr(run_inference(model, ["Only a test", "Mostly a test", "Ignore me"], ["aaaa", "bbbb", "CCCC"]), torch.tensor([0, 1, 0])))

(0, 4, 1, 0)
(4, 0, 0, 1)
tensor([[0.6693, 0.3307]], device='cuda:0', grad_fn=<SqueezeBackward1>)
(0, 0, 2, 1)


In [216]:
print(TFSentenceSplit.strings_to_tensor(["Only a test.", "Mostly a", "Ignore"]))
print(run_inference(model, ["Only a test."], ["Test"]))
print(compute_tpr_fpr_tnr_fnr(run_inference(model, ["Only a test.", "Mostly a", "Ignore"], ["aaaa", "bbbb", "CCCC"]), torch.tensor([1, 0, 0])))

tensor([[ 79, 110, 108, 121,  32,  97,  32, 116, 101, 115, 116,  46],
        [ 32,  32,  32,  32,  77, 111, 115, 116, 108, 121,  32,  97],
        [ 32,  32,  32,  32,  32,  32,  73, 103, 110, 111, 114, 101]])
tensor([[0.6615, 0.3385]], device='cuda:0', grad_fn=<SqueezeBackward1>)
(0, 0, 2, 1)


# Training:

In [218]:
#from tqdm import tqdm
from tqdm.notebook import trange, tqdm

lowest_loss = 1e10
best_counts = 0

for epoch in trange(NUM_EPOCHS):
    model.train()
    examples_seen = 0
    positives_seen = 0
    total_train_loss = 0.0
    running_train_loss = 0.0
    validation_loss = 0.0
    tpr, tnr, fpr, fnr = 0, 0, 0, 0
    for batch_idx, (pre, suf, label) in tqdm(enumerate(train_dataloader)):
        label = (torch.Tensor(label) * 1).to(DEVICE)
        out = run_inference(model, pre, suf)
        loss = loss_fn(out, label)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        examples_seen += BATCH_SIZE
        positives_seen += torch.sum(label).item()
        per_example_loss = loss.item()/BATCH_SIZE
        total_train_loss += per_example_loss
        running_train_loss = running_train_loss * 0.9 + per_example_loss * 0.1
        batch_tp, batch_fp, batch_tn, batch_fn = compute_tpr_fpr_tnr_fnr(out, label)
        tpr += batch_tp
        tnr += batch_tn
        fpr += batch_fp
        fnr += batch_fn
        if batch_idx % 100 == 0:
            rate = tpr+tnr+fpr+fnr
            precision = tpr/float(tpr+fpr)
            recall = tpr/float(tpr+fnr)
            f1 = (2*precision*recall)/(precision + recall)
            wandb.log({
            #noop = dict(foo={
                "batch_loss": per_example_loss, 
                "false_positive_rate": fpr/rate, 
                "true_positive_rate": tpr/rate,
                "false_negative_rate": fnr/rate,
                "true_negative_rate": tnr/rate,
                "precision": precision,
                "recall": recall,
                "f1": f1,
                "accuracy": (tpr+tnr)/rate
            }, commit=(batch_idx%1000)==0)
            tpr, tnr, fpr, fnr = 0, 0, 0, 0
        if (batch_idx+1) % 1000 == 0:
            print(f"{epoch}: {batch_idx}: {running_train_loss}")
    print(f"END OF EPOCH {epoch}: {total_train_loss} train loss")
    model.eval()
    for batch_idx, (pre, suf, label) in enumerate(validate_dataloader):
        label = (torch.Tensor(label) * 1).to(DEVICE)
        out = run_inference(model, pre, suf)
        loss = loss_fn(out, label)
        validation_loss += loss.item()/BATCH_SIZE
    wandb.log({"epoch": epoch, "validation_loss": validation_loss})
    print(f"END OF EPOCH {epoch}: {validation_loss} validation loss")
    torch.save(model, f"checkpoint_epoch_{epoch}.pt")
    if validation_loss < lowest_loss:
        lowest_loss = validation_loss
        torch.save(model, f"best_{best_counts}.pt")
        best_counts += 1
wandb.finish()

  0%|          | 0/4 [00:00<?, ?it/s]

0it [00:00, ?it/s]

0: 999: 0.0013630061632433597
0: 1999: 0.0013599603097932465
0: 2999: 0.0013660601872827424
0: 3999: 0.0013610136676452153
0: 4999: 0.0013682807486761607
0: 5999: 0.0013526185856018698
0: 6999: 0.0013522559090553704
END OF EPOCH 0: 10.617626822553575 train loss
END OF EPOCH 0: 0.004185141297057271 validation loss


0it [00:00, ?it/s]

1: 999: 0.0013691968594523043
1: 1999: 0.001355107447403433
1: 2999: 0.0013617549777748957
1: 3999: 0.0013590342843573405
1: 4999: 0.0013720859770224557
1: 5999: 0.0013543043850086007
1: 6999: 0.001370117235383661
END OF EPOCH 1: 10.607014010311104 train loss
END OF EPOCH 1: 0.004185141180641949 validation loss


0it [00:00, ?it/s]

2: 999: 0.0013566519491371263
2: 1999: 0.0013708272551828863
2: 2999: 0.0013403854504217352
2: 3999: 0.0013586299205739414
2: 4999: 0.0013731517354719314
2: 5999: 0.0013493571773274234
2: 6999: 0.001373983475450804
END OF EPOCH 2: 10.60801824531518 train loss
END OF EPOCH 2: 0.004185141529887915 validation loss


0it [00:00, ?it/s]

3: 999: 0.00135787644987126
3: 1999: 0.0013710970381047237
3: 2999: 0.0013482916651648977
3: 3999: 0.0013588632386499326
3: 4999: 0.0013534841039215346
3: 5999: 0.0013462657834074948
3: 6999: 0.0013655029102604248
END OF EPOCH 3: 10.608599066152237 train loss
END OF EPOCH 3: 0.004185141529887915 validation loss


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
accuracy,▁███████████████████████████████████
batch_loss,█▂▁▂▁▁▁▁▁▁▁▂▁▂▁▁▂▁▁▁▁▁▁▁▁▁▂▂▂▁▁▁▂▁▁▂
epoch,▁▃▆█
f1,▁███████████████████████████████████
false_negative_rate,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
false_positive_rate,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
precision,▁███████████████████████████████████
recall,▁███████████████████████████████████
true_negative_rate,▁████████▇█████████████████▅████████
true_positive_rate,▁▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇█▇▇▇▇▇▇▇▇

0,1
accuracy,0.96367
batch_loss,0.00141
epoch,3.0
f1,0.96235
false_negative_rate,0.03461
false_positive_rate,0.00172
precision,0.99631
recall,0.93062
true_negative_rate,0.49945
true_positive_rate,0.46422


Lifted from https://damdid2022.frccsc.ru/files/article/DAMDID_2022_paper_2646.pdf
Zavyalova, Martynyuk, and Samarev

"Testing will be performed on 5840 sentences from “The GUM Corpus” [16]."

|Rank|Tool Name                |tp  |fp | tn   |fn  |accuracy|error|precision|recall|f1   |
|---|---                      |--- |---|---   |--- |---     |---  |---      |---   |---  |
|1  |Sentencize.jl            |6330|254|107813|1078|0.99    |0.01 |0.96     |0.85  |0.905|
|2  |NLTK                     |6269|283|107787|1139|0.99    |0.01 |0.96     |0.85  |0.898|
|3  |OpenNLP                  |6255|276|107791|1153|0.99    |0.01 |0.96     |0.84  |0.897|
|4  |CoreNLP                  |6278|362|107786|1130|0.99    |0.01 |0.95     |0.85  |0.894|
|5  |WordTokenizers.jl        |6140|264|107809|1268|0.99    |0.01 |0.96     |0.83  |0.889|
|6  |Spacy (Dependency parser)|6631|934|107268|777 |0.99    |0.01 |0.88     |0.90  |0.886|
|7  |Spacy (Rule-based)       |6183|994|107531|1225|0.98    |0.02 |0.86     |0.83  |0.848|
|8  |SimpleSplitter           |5760|772|107847|1648|0.98    |0.02 |0.88     |0.78  |0.826|
|9  |Julia split()            |5760|878|107780|1648|0.98    |0.02 |0.87     |0.78  |0.820|

In [219]:
import numpy
print("Raw:")
print(ds[0])
print("DL raw:")
for pre, suf, label in train_dataloader:
    print(pre[0])
    print(label[0])
    break
print("Processed:")
model.eval()
with torch.no_grad():
    for pre, suf, label in train_dataloader:
        label = (torch.Tensor(label) * 1).to(DEVICE)
        out = run_inference(model, pre, suf)
        tp, fp, tn, fn = compute_tpr_fpr_tnr_fnr(out, label)
        loss = loss_fn(out, label).cpu().numpy()
        out = out.cpu().numpy()
        confidence = numpy.abs(out[:, 0] - out[:, 1])
        print(f"TP: {tp}  TN: {tn}  FP: {fp}  FN: {fn}")
        errors = 0
        for idx in range(out.shape[0]):
            model_guess_eos = out[idx,1]>out[idx,0]
            gt_eos = label[idx]>0.5
            if model_guess_eos != gt_eos:
                print("!!!")
                errors += 1
                print(f"Sent: {pre[idx]}")
                print(f"Model guess: EOS: {model_guess_eos}")
                print(f"Truth: EOS: {gt_eos}")
                print()
        print(f"Total errors: {errors}")
        break

Raw:
('fighting class between that of a squire and a page). County and ', 'municipal councils are popularly elected every four years. While', False)
DL raw:
D UNTIL 1433, BUT REFUSED TO BUY HIS RELEASE BY ABANDONING HIS C
tensor(False)
Processed:
TP: 128  TN: 118  FP: 0  FN: 10
!!!
Sent: s, and some scholars interpret this as a form of contraception."
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: of chicory by farm animals results in reduction of worm burdens,
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: founded on the ruins of besa where he died (dio cassius 'lix.11;
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: ph journal, sackville/edmundston, nb, canada, newsroom@nbpub.com
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: form a network of nodes &mdash; hence the term "neural network."
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: to become a better christian by learning from his mistakes.pugh,
Model guess: EOS: False
Truth: EOS: True

!!!
Sent: tin Taran

In [230]:
# Save model:
# Prefix only:
placeholder_x = torch.zeros([BATCH_SIZE, CONTEXT_SIZE], dtype=torch.int64)
model.eval().to('cpu')
out = model(placeholder_x)
torch.onnx.export(
    model, 
    placeholder_x,
    f"sentence_tokenizer_v11_{CONTEXT_SIZE}_256.onnx",
    export_params=True,        # store the trained parameter weights inside the model file
    opset_version=14,          # the ONNX version to export the model to
    do_constant_folding=True,  # whether to execute constant folding for optimization
    input_names = ['input',],   # the model's input names
    output_names = ['output'], # the model's output names
    dynamic_axes={'input' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}}
)

""" # Prefix + Suffix:
placeholder_a = torch.zeros([BATCH_SIZE, CONTEXT_SIZE], dtype=torch.int64)
placeholder_b = torch.zeros([BATCH_SIZE, CONTEXT_SIZE], dtype=torch.int64)
model.eval().to('cpu')
out = model(placeholder_a, placeholder_b)
print(out.shape)
torch.onnx.export(
    model, 
    (placeholder_a, placeholder_b),
    f"sentence_tokenizer_v11_{CONTEXT_SIZE}_prefix_{CONTEXT_SIZE}_suffix_256.onnx",
    export_params=True,        # store the trained parameter weights inside the model file
    opset_version=14,          # the ONNX version to export the model to
    do_constant_folding=True,  # whether to execute constant folding for optimization
    input_names = ['prefix', 'suffix'],   # the model's input names
    output_names = ['output'], # the model's output names
    dynamic_axes={'prefix' : {0 : 'batch_size'}, 'suffix' : {0 : 'batch_size'}, 'output' : {0 : 'batch_size'}}
)
"""

torch.Size([256, 2])




In [228]:
placeholder_x = torch.zeros([1, CONTEXT_SIZE], dtype=torch.int64)
model.eval().to('cpu')
exporter = torch.onnx.dynamo_export(model, placeholder_x)
exporter.save(f"sentence_tokenizer_v6_{CONTEXT_SIZE}x256_dynamo.onnx")



OnnxExporterError: Failed to export the model to ONNX. Generating SARIF report at 'report_dynamo_export.sarif'. SARIF is a standard format for the output of static analysis tools. SARIF logs can be loaded in VS Code SARIF viewer extension, or SARIF web viewer (https://microsoft.github.io/sarif-web-component/). Please report a bug on PyTorch Github: https://github.com/pytorch/pytorch/issues