# SpaCy test

In [16]:
!nvidia-smi

'nvidia-smi' is not recognized as an internal or external command,
operable program or batch file.


In [17]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [18]:
# !pip install https://huggingface.co/latincy/la_core_web_lg/resolve/main/la_core_web_lg-any-py3-none-any.whl
import spacy
nlp = spacy.load('la_core_web_lg', exclude=['morphologizer', 'trainable_lemmatizer', 'parser', 'tagger', 'lemma_fixer'])
print(nlp.pipeline)
# doc = nlp('Haec narrantur a poetis de Perseo')

[('normer', <function normer at 0x00000185316DCA40>), ('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec object at 0x0000018566306510>), ('ner', <spacy.pipeline.ner.EntityRecognizer object at 0x0000018566300F20>)]


In [19]:
# for token in doc:
#     print(token.lemma_)

In [20]:
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        spaces = [True] * len(words)
        # Avoid zero-length tokens
        for i, word in enumerate(words):
            if word == "":
                words[i] = " "
                spaces[i] = False
        # Remove the final trailing space
        if words[-1] == " ":
            words = words[0:-1]
            spaces = spaces[0:-1]
        else:
            spaces[-1] = False
            
        return Doc(self.vocab, words=words, spaces=spaces)

In [21]:
import pandas as pd

test = pd.read_csv('data/Latin_NER_test.csv', index_col=0)
len(test)


31788

In [22]:
def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

unqlist = f7(test['sent_id'].to_list())

dct = {unq: i for i, unq in enumerate(unqlist)}

In [23]:
test['to_group'] = test.sent_id.apply(lambda x: dct[x])
test.head()

Unnamed: 0,word,tag,sentence,orig_text,sent_id,to_group
0,timere,O,11,CW,CW_11,0
1,Caesarem,B-PERS,11,CW,CW_11,0
2,ereptis,O,11,CW,CW_11,0
3,ab,O,11,CW,CW_11,0
4,eo,O,11,CW,CW_11,0


In [24]:
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)


In [25]:
from spacy.training import offsets_to_biluo_tags

grouped = test.groupby('to_group')

sents = [grouped.get_group(group)['word'].values.tolist() for group in grouped.groups]

labels = [grouped.get_group(group)['tag'].values.tolist() for group in grouped.groups]
print(labels)

predictions=[[(ent.start_char, ent.end_char, ent.label_) for ent in nlp(' '.join(sent)).ents] for sent in sents]
# print(predictions)
predictions=[offsets_to_biluo_tags(nlp(' '.join(sent)), predictions[i]) for i,sent in enumerate(sents)]
print(predictions)
# # [(i, text) for i,text in enumerate(test['word'])]

[['O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O'], ['B-PERS', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-PERS', 'I-PERS', 'B-PERS', 'I-PERS', 'O', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERS', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'B-PERS', 'O', 'O', 'O'], ['B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O'], ['O', 'O', 'B-PERS

In [26]:
print(sents)

[['timere', 'Caesarem', 'ereptis', 'ab', 'eo', 'duabus', 'legionibus', ',', 'ne', 'ad', 'eius', 'periculum', 'reservare', 'et', 'retinere', 'eas', 'ad', 'urbem', 'Pompeius', 'videretur', ';'], ['Lentulus', 'sententiam', 'Calidi', 'pronuntiaturum', 'se', 'omnino', 'negavit', ',', 'Marcellus', 'perterritus', 'conviciis', 'a', 'sua', 'sententia', 'discessit', '.'], ['intercedit', 'M.', 'Antonius', 'Q.', 'Cassius', 'tribuni', 'plebis', '.'], ['dicuntur', 'sententiae', 'graves', ';'], ['laudat', 'promptos', 'atque', 'in', 'posterum', 'confirmat', ',', 'segniores', 'castigat', 'atque', 'incitat', '.'], ['Catonem', 'veteres', 'inimicitiae', 'Caesaris', 'incitant', 'et', 'dolor', 'repulsae', '.'], ['simul', 'infamia', 'duarum', 'legionum', 'permotus', ',', 'quas', 'ab', 'itinere', 'Asiae', 'Syriaeque', 'ad', 'suam', 'potentiam', 'dominatumque', 'converterat', ',', 'rem', 'ad', 'arma', 'deduci', 'studebat', '.'], ['is', 'eo', 'tempore', 'erat', 'Ravennae', 'exspectabatque', 'suis', 'lenissimis'

In [27]:
print(labels)

[['O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O'], ['B-PERS', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'B-PERS', 'I-PERS', 'B-PERS', 'I-PERS', 'O', 'O', 'O'], ['O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['B-PERS', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'B-PERS', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'B-PERS', 'O', 'O', 'O'], ['B-PERS', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'B-PERS', 'O', 'O'], ['O', 'O', 'B-PERS

In [28]:
predictions=[y.replace("U-", "B-").replace("L-", "I-") for x in predictions for y in x]
print(predictions)

['O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'I-PERSON', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'B-PERSON', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PERSON', 'O', 'O', 'O', '

In [29]:
len(predictions)

31788

In [30]:
predictions = [pred.replace('PERSON', 'PERS') for pred in predictions]
predictions = [pred.replace('NORP', 'GRP') for pred in predictions]
true = [tag for word in labels for tag in word]

from sklearn.metrics import classification_report

report=classification_report(true, predictions)

print(report)

              precision    recall  f1-score   support

       B-GRP       0.23      0.01      0.02       354
       B-LOC       0.42      0.62      0.50       305
      B-PERS       0.55      0.71      0.62       849
       I-GRP       0.00      0.00      0.00         3
       I-LOC       0.00      0.00      0.00         8
      I-PERS       0.74      0.29      0.42        99
           O       0.98      0.98      0.98     30170

    accuracy                           0.96     31788
   macro avg       0.42      0.37      0.36     31788
weighted avg       0.96      0.96      0.96     31788



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
report=classification_report(true, predictions)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [32]:
print(report)

              precision    recall  f1-score   support

       B-GRP       0.23      0.01      0.02       354
       B-LOC       0.42      0.62      0.50       305
      B-PERS       0.55      0.71      0.62       849
       I-GRP       0.00      0.00      0.00         3
       I-LOC       0.00      0.00      0.00         8
      I-PERS       0.74      0.29      0.42        99
           O       0.98      0.98      0.98     30170

    accuracy                           0.96     31788
   macro avg       0.42      0.37      0.36     31788
weighted avg       0.96      0.96      0.96     31788



In [33]:
test['predictions'] = predictions

In [34]:
test['true_spacy'] = true

In [35]:
test 

Unnamed: 0,word,tag,sentence,orig_text,sent_id,to_group,predictions,true_spacy
0,timere,O,11,CW,CW_11,0,O,O
1,Caesarem,B-PERS,11,CW,CW_11,0,B-PERS,B-PERS
2,ereptis,O,11,CW,CW_11,0,O,O
3,ab,O,11,CW,CW_11,0,O,O
4,eo,O,11,CW,CW_11,0,O,O
...,...,...,...,...,...,...,...,...
31783,.,O,2433,Ovid,Ovid_2433,3408,O,O
31784,Quo,O,2434,Ovid,Ovid_2434,3409,O,O
31785,feror,O,2434,Ovid,Ovid_2434,3409,O,O
31786,insanus,O,2434,Ovid,Ovid_2434,3409,O,O


In [41]:
# test.to_csv('spacy_errors_test.csv')
orig_test = pd.read_csv('spacy_errors_test.csv', index_col=0)

In [52]:
compare_df = orig_test.compare(test, keep_equal=True, keep_shape=True)

In [53]:
compare_df.columns

MultiIndex([(       'word',  'self'),
            (       'word', 'other'),
            (        'tag',  'self'),
            (        'tag', 'other'),
            (   'sentence',  'self'),
            (   'sentence', 'other'),
            (  'orig_text',  'self'),
            (  'orig_text', 'other'),
            (    'sent_id',  'self'),
            (    'sent_id', 'other'),
            (   'to_group',  'self'),
            (   'to_group', 'other'),
            ('predictions',  'self'),
            ('predictions', 'other'),
            ( 'true_spacy',  'self'),
            ( 'true_spacy', 'other')],
           )

In [39]:
# len(orig_test)

31788

In [40]:
orig_test.columns

Index(['Unnamed: 0', 'word', 'tag', 'sentence', 'orig_text', 'sent_id',
       'to_group', 'predictions', 'true_spacy'],
      dtype='object')