In [3]:
import argparse
import itertools
import re
import os
import json
import pickle
from typing import List

import numpy as np
import spacy
from spacy import displacy
from spacy.language import Language
from spacy.pipeline import Pipe
from pathlib import Path
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn import metrics
from sklearn import preprocessing



In [4]:
### Globals strings ###
ONTONOTES_LABELS = [
    'CARDINAL', 'DATE', 'EVENT', 'FAC', 'GPE', 'LANGUAGE', 'LAW', 'LOC', 'MONEY', 'NORP', 'ORDINAL',
    'ORG', 'PERCENT', 'PERSON', 'PRODUCT', 'QUANTITY', 'TIME', 'WORK_OF_ART']

ontonotes_json = "data/ontonotes5_en_16percent.json"

example_text = "On March 8, 2021, a group of hackers including Kottmann and calling themselves " \
       "'APT - 69420 Arson Cats' gained 'super admin' rights in the network of Verkada, a " \
       "cloud-based security camera company, using credentials they found on the public " \
       "internet. They had access to the network for 36 hours. The group collected about 5 " \
       "gigabytes of data, including live security camera footage and recordings from more" \
       " than 150,000 cameras in places like a Tesla factory, a jail in Alabama, a Halifax " \
       "Health hospital, and residential homes. The group also accessed a list of Verkada " \
       "customers and the company's private financial information, and gained superuser " \
       "access to the corporate networks of Cloudflare and Okta through their Verkada cameras."

In [5]:
# def setup_argparse():
#     p = argparse.ArgumentParser()
#     p.add_argument('--part', choices=['1','2','3','4'], required=True)
#     p.add_argument('--ents', choices=ONTONOTES_LABELS, nargs='+')
#     p.add_argument('--viz_output', default='entity_viz_example.html',
#                    help='Name of output file for the visualisation')
#     p.add_argument('--corpus', help='name of corpus file to load in')
#     p.add_argument('--tokenization', choices=['standard', 'subword'], default='standard',
#                    help='for part 2, whether to print out the tokenized document in standard'
#                         'tokenization (whitespace), or showing subwords (BPE)')
#     p.add_argument('--classifier_path', help='name for path to save classifier')
#     p.add_argument('--baseline', action='store_true', help='use a simple baseline classifier')
#     return p.parse_args()

In [6]:
def init_argparse():
    p = argparse.ArgumentParser()
    p.add_argument('--part', choices=['1','2','3','4'], required=True)
    p.add_argument('--ents', choices=ONTONOTES_LABELS, nargs='+')
    p.add_argument('--viz_output', default='entity_viz_example.html',
                   help='Name of output file for the visualisation')
    p.add_argument('--corpus', help='name of corpus file to load in')
    p.add_argument('--tokenization', choices=['standard', 'subword'], default='standard',
                   help='for part 2, whether to print out the tokenized document in standard'
                        'tokenization (whitespace), or showing subwords (BPE)')
    p.add_argument('--classifier_path', help='name for path to save classifier')
    p.add_argument('--baseline', action='store_true', help='use a simple baseline classifier')
    return p

In [188]:
#####

def part_1(args, nlp):
    doc = nlp(example_text)
    ent_list, output_file = args.ents, args.viz_output
    options = {"ents": ent_list} if ent_list else {"ents": ONTONOTES_LABELS}

    html = displacy.render(doc, style="ent", options=options)

    # output_path = Path(output_file)
    # output_path.open("w").write(html)


def part_2(args, nlp):
    # These are special characters used by the tokenizer, ignore them
    special_chars = re.compile("Ġ|<pad>|<s>|</s>|â|Ģ|ī")
    doc = nlp(example_text)

    print("List of Entities:")
    print(doc.ents)

    if args.tokenization == 'standard':
        print("\nStandard Tokenization:")
        print(" ".join([tok.text for tok in doc]))

    elif args.tokenization == 'subword':
        print("\nSubword Tokenization:")
        subword_string = " ".join([tok for tok in itertools.chain(*doc._.trf_data.wordpieces.strings)])
        cleaned_subword_string = special_chars.sub("", subword_string).strip()

        print(cleaned_subword_string)

In [8]:
### This is for Part 3 ###
class ContextualVectors(Pipe):
    def __init__(self, nlp):
        self._nlp = nlp
        self.combination_function = "average"

    def __call__(self, doc):
        if type(doc) == str:
            doc = self._nlp(doc)
        self.lengths = doc._.trf_data.align.lengths
        self.tensors = doc._.trf_data.tensors
        self.input_texts = doc._.trf_data.tokens['input_texts'][0]
        doc.user_token_hooks["vector"] = self.vector
        return doc

    ### HERE is where vectors are set
    def vector(self, token):

        token_start_idx = 1 + sum([self.lengths[ii] for ii in range(token.i)])
        token_end_idx = token_start_idx + self.lengths[token.i]
        trf_vector = self.tensors[0][0][token_start_idx:token_end_idx]
        
        if len(trf_vector) == 0: # this happens due to token alignment issues
            # print('len(trf_vector) = 0!')
            # print(token_start_idx, token_end_idx)
            # print(len(self.tensors[0][0]))
            # print('token.i:', token.i, token.text)
            # print('token_idx:', token_start_idx, token_end_idx)
            # print('input_texts', self.input_texts[token_start_idx:token_end_idx])
            return []
            
        return self.combine_vectors(trf_vector)

    def combine_vectors(self, trf_vector):
        return np.average(trf_vector, axis=0)

In [9]:
@Language.factory("trf_vector_hook", assigns=["doc.user_token_hooks"])
def create_contextual_hook(nlp, name):
    return ContextualVectors(nlp)


def part_3(args, nlp):

    nlp.add_pipe("trf_vector_hook", last=True)
    max_tok = 145  # max tokens per chunk based on the spacy striding behaviour. I can change this if I want
    def chunks(tokens, n):
        for i in range(0, len(tokens), n):
            yield tokens[i:i+n]


    with open(ontonotes_json) as fin:
        f = json.load(fin)
    # process all the data
    corpus = dict.fromkeys(f.keys())
    for key in f.keys():
        #print("loading {}".format(key))
        embeddings, labels = [], []
        corpus_split = f[key]
        for entry in tqdm(corpus_split, desc=f"Processing {key}"):
            if not entry.get("entities"):
                continue
            this_string = entry["text"]
            # BERT max is 512 wordpiece tokens at once, and there is one sample that exceeeds it
            if len(this_string.split()) > max_tok:
                text_chunks = chunks(this_string, max_tok)
            else:
                text_chunks = [this_string]
            for c in text_chunks:
                this_doc = nlp("".join(c))
                # for silver labels:
                for ent in this_doc.ents:
                    
                    try:
                        if not ent.vector.any(): 
                            continue
                    except:
                        # print(f"Error on entity '{ent}' in document: {this_doc}")
                        # print('ent_idx:', ent.start, ent.end)
                        continue
                    # validation check for nans
                    if np.isnan(ent.vector).any() or np.isinf(ent.vector.any()):
                        print(f"Skipping entry, found nan or inf in vector for entity '{ent}' "
                              f"in document: {this_doc}")
                        continue
                    embeddings.append(ent.vector)
                    labels.append(ent.label_)
        # save processed split of corpus, with matrix of number_samples x features, list of labels
        corpus[key] = [np.vstack(embeddings), labels]

    # print number of entities found in each section for information
    for key in corpus.keys():
        print("{}: {} entities".format(key, len(corpus[key][0])))

    save_file = "data/corpus_average.pkl"
    with open(save_file, "wb") as fout:
        pickle.dump(corpus, fout)

    print(f"Saved full processed corpus to {save_file}")

### Errors
# Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


def print_classifier_stats(predictions: List[str], labels: List[str], classes: List[str]):
    # TODO check if this works with NER confusion matrix and if it does make a higher and use twice
    accuracy = np.mean(predictions == labels)
    # matrix_labels = (ONTONOTES_LABELS
    #     [label.name for label in Label] + [] if not conf_thresh else [label.name for label in
    #                                                                   Label] + ["below thresh"]
    # )
    print("Classifier Accuracy: {}".format(accuracy))
    print("-" * 89)
    print("Classification Report:")
    print(metrics.classification_report(labels, predictions, target_names=classes, zero_division=0))
    # TODO currently get a broadcast error, fix ValueError: shape mismatch: objects cannot be broadcast to a single shape
    # print("Confusion Matrix:")
    # print(metrics.confusion_matrix(test_labels_, predictions_, labels=[label_encoder.classes_]))


def part_4(args, nlp):
    # this involves reading in ontonotes data, getting embeddings for the entities,
    # then training a classifier with the paired embeddings and labels.
    classifier = LogisticRegression(
        multi_class="multinomial",
        #class_weight="balanced",
        max_iter=500
    )

    # This loads a dict of TESTING, TRAINING, VALIDATION keys and values as a nested list of
    # 0 as embeddings and 1 as labels (co-indexed, equal length)
    with open(args.corpus, "rb") as fin:
        corpus = pickle.load(fin)

    # process data
    label_encoder = preprocessing.LabelEncoder()  # labels need to be ints not strings
    all_labels = list(itertools.chain(*[corpus[split][1] for split in corpus.keys()]))
    label_encoder.fit(all_labels)

    train_data, train_labels_ = corpus["TRAINING"]  # the _ is the spacy convention for the string representation (rather than int/float)
    test_data, test_labels_ = corpus["TESTING"]

    train_labels = label_encoder.transform(train_labels_)  # transform strings to ints

    if args.baseline:
        for strat in ["most_frequent", "uniform", "stratified"]:
            dummy_classifier = DummyClassifier(strategy=strat)
            dummy_classifier.fit(train_data, train_labels)
            dummy_predictions = dummy_classifier.predict(test_data)
            dummy_predictions_ = label_encoder.inverse_transform(dummy_predictions)

            print(f"Stats for Baseline Classifier: {strat} on Test Set")
            print_classifier_stats(dummy_predictions_, test_labels_, label_encoder.classes_)

    else:
        print("Training classifier with params:")
        print(classifier.get_params())
        
        if 'cupy' in str(type(train_data)):
            train_data = train_data.get()
            test_data = test_data.get()

        classifier.fit(train_data, train_labels)

        print("Saving classifier to {}".format(args.classifier_path))
        with open(args.classifier_path, "wb") as fout:
            pickle.dump(classifier, fout)

        predictions = classifier.predict(test_data)
        predictions_ = label_encoder.inverse_transform(predictions)  # inverse transform to strings for printing

        print("Stats for Logistic Regression Classifier on Test Set")
        print_classifier_stats(predictions_, test_labels_, label_encoder.classes_)

In [10]:
def main(args, nlp):
    dict2func = {
        "1": part_1,
        "2": part_2,
        "3": part_3,
        "4": part_4,
    }

    dict2func[args.part](args, nlp)

In [11]:
# if __name__ == "__main__":
#     args = setup_argparse()

#     gpu = spacy.prefer_gpu()
#     print('GPU:', gpu)

#     # validation checks
#     # that model is downloaded
#     spacy_model_name = 'en_core_web_trf'
#     if not spacy.util.is_package(spacy_model_name):
#         spacy.cli.download(spacy_model_name)
#     # that relevant directories exist
#     for d in ["models", "data"]:
#         if not os.path.exists(d):
#             os.makedirs(d)

#     # load spacy model
#     nlp = spacy.load('en_core_web_trf')

#     main(args, nlp)

In [17]:
def lab8(arg):
    p = init_argparse()
    args = p.parse_args(arg)

    gpu = spacy.prefer_gpu()
    print('GPU:', gpu)

    # validation checks
    # that model is downloaded
    spacy_model_name = 'en_core_web_trf'
    if not spacy.util.is_package(spacy_model_name):
        spacy.cli.download(spacy_model_name)
    # that relevant directories exist
    for d in ["models", "data"]:
        if not os.path.exists(d):
            os.makedirs(d)

    # load spacy model
    nlp = spacy.load('en_core_web_trf')

    main(args, nlp)

## Run lab

In [189]:
lab8(['--part', '1'])

GPU: True




In [15]:
lab8(['--part', '2'])

List of Entities:
(March 8, 2021, Kottmann, Verkada, 36 hours, about 5 gigabytes, more than 150,000, Tesla, Alabama, Halifax Health, Verkada, Cloudflare, Okta, Verkada)

Standard Tokenization:
On March 8 , 2021 , a group of hackers including Kottmann and calling themselves ' APT   -   69420 Arson Cats ' gained ' super admin ' rights in the network of Verkada , a cloud - based security camera company , using credentials they found on the public internet . They had access to the network for 36 hours . The group collected about 5 gigabytes of data , including live security camera footage and recordings from more than 150,000 cameras in places like a Tesla factory , a jail in Alabama , a Halifax Health hospital , and residential homes . The group also accessed a list of Verkada customers and the company 's private financial information , and gained superuser access to the corporate networks of Cloudflare and Okta through their Verkada cameras .


In [16]:
# lab8(['--part', '3'])

Processing TESTING: 100%|██████████| 1734/1734 [00:24<00:00, 70.71it/s]
Processing TRAINING: 100%|██████████| 17125/17125 [04:17<00:00, 66.62it/s] 
Processing VALIDATION: 100%|██████████| 2257/2257 [00:36<00:00, 62.06it/s]


TESTING: 1864 entities
TRAINING: 19410 entities
VALIDATION: 3070 entities
Saved full processed corpus to data/corpus_average.pkl


In [63]:
lab8(['--part', '3'])

GPU: True


Processing TESTING: 100%|██████████| 1734/1734 [00:14<00:00, 123.40it/s]
Processing TRAINING: 100%|██████████| 17125/17125 [02:16<00:00, 125.44it/s]
Processing VALIDATION: 100%|██████████| 2257/2257 [00:20<00:00, 109.31it/s]


TESTING: 1864 entities
TRAINING: 19410 entities
VALIDATION: 3070 entities
Saved full processed corpus to data/corpus_average.pkl


In [187]:
lab8(['--part', '4', '--classifier_path', 'models/model_average.pkl', '--corpus', 'data/corpus_average.pkl'])

GPU: True
Training classifier with params:
{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 500, 'multi_class': 'multinomial', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Saving classifier to models/model_average.pkl
Stats for Logistic Regression Classifier on Test Set
Classifier Accuracy: 0.9393776824034334
-----------------------------------------------------------------------------------------
Classification Report:
              precision    recall  f1-score   support

    CARDINAL       0.93      0.95      0.94       151
        DATE       0.95      0.95      0.95       301
       EVENT       0.91      0.91      0.91        11
         FAC       0.77      0.89      0.83        19
         GPE       0.95      0.93      0.94       393
    LANGUAGE       0.50      0.67      0.57         3
         LAW       0.86      0.67      0.7

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Test

### Part 1 & 2

In [77]:
nlp = spacy.load('en_core_web_trf')

In [78]:
doc = nlp(example_text)

In [79]:
type(doc)

spacy.tokens.doc.Doc

In [86]:
doc.ents

(March 8, 2021,
 Kottmann,
 Verkada,
 36 hours,
 about 5 gigabytes,
 more than 150,000,
 Tesla,
 Alabama,
 Halifax Health,
 Verkada,
 Cloudflare,
 Okta,
 Verkada)

### Part 3

In [101]:
nlp.pipeline

[('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x263a9068e20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x263a90860a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x263cf74d970>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x263a6753cf0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x263aeec2c80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x263aeebc240>)]

In [102]:
nlp.add_pipe("trf_vector_hook", last=True)

<__main__.ContextualVectors at 0x263b02a00c0>

In [147]:
nlp.pipeline

[('transformer',
  <spacy_transformers.pipeline_component.Transformer at 0x263a9068e20>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x263a90860a0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x263cf74d970>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x263a6753cf0>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x263aeec2c80>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x263aeebc240>),
 ('trf_vector_hook', <__main__.ContextualVectors at 0x263b02a00c0>)]

In [148]:
with open(ontonotes_json) as fin:
    f = json.load(fin)

In [198]:
corpus = dict.fromkeys(f.keys())

In [150]:
f.keys()

dict_keys(['TESTING', 'TRAINING', 'VALIDATION'])

In [156]:
entry = f['TESTING'][0]

In [157]:
entry

{'entities': {'GPE': [[17, 27]], 'ORG': [[5, 13]], 'WORK_OF_ART': [[36, 66]]},
 'language': 'english',
 'morphology': {'$': [[66, 67]],
  'DT': [[28, 32], [41, 44]],
  'EX': [[9, 13]],
  'IJ': [[0, 4], [14, 16], [51, 55]],
  'NFP': [[5, 8], [17, 27], [36, 40], [45, 50], [56, 59], [60, 66]],
  'VBZ': [[33, 35]]},
 'syntax': {'NP': [[5, 27], [36, 50], [56, 66]],
  'NP-SBJ': [[28, 32]],
  'PP': [[0, 27], [51, 66]],
  'PP-LOC': [[14, 27]],
  'VP': [[33, 66]],
  'X-PRD': [[36, 66]]},
 'text': 'From NBC news in Washington this is Meet The Press with Tim Russet.'}

In [158]:
this_string = entry["text"]
this_doc = nlp("".join(this_string))

In [159]:
this_doc

From NBC news in Washington this is Meet The Press with Tim Russet.

In [160]:
this_doc.ents

(NBC, Washington, Meet The Press, Tim Russet)

In [161]:
ent = this_doc.ents[0]

In [162]:
ent.vector.shape

(768,)

In [163]:
ent.label_

'ORG'

In [199]:
read_file = "data/corpus_average.pkl"
with open(read_file, "rb") as fin:
    corpus = pickle.load(fin)

In [200]:
corpus['TESTING'][0].shape # embedding of TESTING

(1864, 768)

In [166]:
len(corpus['TESTING'][1]) # labels of TESTING

1864

### ContextualVectors(Pipe)

In [167]:
this_doc

From NBC news in Washington this is Meet The Press with Tim Russet.

In [168]:
lengths = this_doc._.trf_data.align.lengths

In [169]:
lengths # sub-words lengths of entities (doc)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1], dtype=int32)

In [170]:
tensors = this_doc._.trf_data.tensors

In [171]:
tensors[0].shape # vectors of sub-words (ent)

(1, 17, 768)

In [172]:
tensors[1].shape

(1, 768)

In [173]:
input_texts = this_doc._.trf_data.tokens['input_texts'][0]

In [174]:
input_texts # sub-words tokens (doc) ?

['<s>',
 'From',
 'ĠNBC',
 'Ġnews',
 'Ġin',
 'ĠWashington',
 'Ġthis',
 'Ġis',
 'ĠMeet',
 'ĠThe',
 'ĠPress',
 'Ġwith',
 'ĠTim',
 'ĠRuss',
 'et',
 '.',
 '</s>']

In [194]:
trf_vector = tensors[0][0][13:15]

In [195]:
trf_vector.shape

(2, 768)

In [197]:
np.average(trf_vector, axis=0).shape # average vector of subwords vectors

(768,)

### Part 4

In [201]:
train_data, train_labels_ = corpus["TRAINING"]

In [202]:
str(type(train_data)) # indicate GPU training

"<class 'cupy._core.core.ndarray'>"

## Cuda

In [21]:
import torch
torch.cuda.is_available()

True

In [23]:
import torch
print(torch.__version__)

print(torch.version.cuda)
print(torch.backends.cudnn.version())

1.10.0
11.3
8200


In [74]:
spacy.prefer_gpu()

True

In [16]:
spacy.require_gpu()

True