# CRF-TEST 

In [21]:
# !nvidia-smi
# !pip install Cython
# !pip install scikit-learn==0.22.1  -q # to solve "AttributeError: 'CRF' object has no attribute 'keep_tempfiles'" when using crfsuite
# !pip install sklearn_crfsuite -q
# !pip install eli5 -q

In [22]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [23]:
import pandas as pd
train = pd.read_csv('data/Latin_NER_train.csv', index_col=0)
eva = pd.read_csv('data/Latin_NER_eval.csv', index_col=0)
test = pd.read_csv('data/Latin_NER_test.csv', index_col=0)
train.head()

Unnamed: 0,word,tag,sentence,orig_text,sent_id
0,ut,O,1,CW,CW_1
1,vero,O,1,CW,CW_1
2,ex,O,1,CW,CW_1
3,litteris,O,1,CW,CW_1
4,ad,O,1,CW,CW_1


In [24]:
def f7(seq):
    seen = set()
    seen_add = seen.add
    return [x for x in seq if not (x in seen or seen_add(x))]

def number_group(df, group_col):
    unqlist = f7(df[group_col].to_list())
    dct = {unq: i for i, unq in enumerate(unqlist)}
    new_col = [dct[x] for x in df[group_col]]     
    return new_col
    

eva['to_group'] = number_group(eva, 'sent_id')
train['to_group'] = number_group(train, 'sent_id')
test['to_group'] = number_group(test, 'sent_id')

test.head()

Unnamed: 0,word,tag,sentence,orig_text,sent_id,to_group
0,timere,O,11,CW,CW_11,0
1,Caesarem,B-PERS,11,CW,CW_11,0
2,ereptis,O,11,CW,CW_11,0
3,ab,O,11,CW,CW_11,0
4,eo,O,11,CW,CW_11,0


In [25]:
print(train.info())
print(test.info())
print(eva.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88165 entries, 0 to 88164
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   word       88165 non-null  object
 1   tag        88165 non-null  object
 2   sentence   88165 non-null  int64 
 3   orig_text  88165 non-null  object
 4   sent_id    88165 non-null  object
 5   to_group   88165 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 4.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788 entries, 0 to 31787
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   word       31788 non-null  object
 1   tag        31788 non-null  object
 2   sentence   31788 non-null  int64 
 3   orig_text  31788 non-null  object
 4   sent_id    31788 non-null  object
 5   to_group   31788 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 1.7+ MB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 

In [26]:
#import library
# !pip install sklearn_crfsuite
from sklearn.model_selection import train_test_split
import sklearn_crfsuite
from sklearn_crfsuite import metrics

In [27]:
#largerly copied from https://www.kaggle.com/code/shoumikgoswami/ner-using-random-forest-and-crf/notebook

class SentenceGetter(object):
    def __init__(self, data, token_col="word", tag_col="tag", sentence_col="sentence"):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s[token_col].values.tolist(),  
                                                           s[tag_col].values.tolist())]
        self.grouped = self.data.groupby(sentence_col).apply(agg_func)
        self.sentences = [s for s in self.grouped]
        
    def get_next(self):
        try: 
            s = self.grouped['Sentence: {}'.format(self.n_sent)]
            self.n_sent += 1
            return s 
        except:
            return None

In [28]:
#copied from https://sklearn-crfsuite.readthedocs.io/, adjusted for dataset without PoS-tags

def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0, 
        'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, label in sent]
def sent2tokens(sent):
    return [token for token, label in sent]

In [29]:
# from sklearn.model_selection import train_test_split

# train, test = train_test_split(df, test_size=0.25, random_state=2)

# test, eva = train_test_split(test, test_size=0.5, random_state=2)

In [30]:
# for s in train_sentences:
#     try:
#         sent2features(s)
#     except AttributeError:
#         print(s)


In [31]:
getter = SentenceGetter(train, sentence_col='to_group')
train_sentences = getter.sentences

X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]

In [33]:
train_sentences[0]

[('ut', 'O'),
 ('vero', 'O'),
 ('ex', 'O'),
 ('litteris', 'O'),
 ('ad', 'O'),
 ('senatum', 'O'),
 ('referretur', 'O'),
 (',', 'O'),
 ('impetrari', 'O'),
 ('non', 'O'),
 ('potuit', 'O'),
 ('.', 'O')]

In [12]:
getter = SentenceGetter(eva, sentence_col='to_group')
eval_sentences = getter.sentences

X_eval = [sent2features(s) for s in eval_sentences]
y_eval = [sent2labels(s) for s in eval_sentences]

In [13]:
getter = SentenceGetter(test, sentence_col='to_group')
test_sentences = getter.sentences

X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

### ignore test

In [14]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
#default values


crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [15]:
labels = list(crf.classes_)
labels.remove('O')
labels

['B-PERS', 'I-PERS', 'B-LOC', 'B-GRP', 'I-LOC', 'I-GRP']

In [16]:
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.6680795511679277

In [17]:
#test with default features

from sklearn_crfsuite.utils import flatten

# Check this notebook:

# sorted_labels = sorted(
#     labels,
#     key=lambda name: (name[1:], name[0])
# )
# print(metrics.flat_classification_report(
#     y_test, y_pred, 
#     labels=sorted_labels, digits=3
# ))

### hyperparameter optimisation

In [18]:
from sklearn.metrics import make_scorer
from seqeval.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV, PredefinedSplit
import scipy
import numpy as np

In [19]:
#create an array/list which sets all the X_train samples to -1 and all the evaluation samples to 0
split_index = [-1]*len(X_train) + [0]*len(X_eval)
X = X_train + X_eval
y = y_train + y_eval

print(len(X))
print(len(y))
print(len(split_index))

pds = PredefinedSplit(test_fold = split_index)

6787
6787
6787


In [20]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', #decided to continue with this because it should be good for situations with little computing pawer and not a lot of data
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='macro', labels=labels)

# search
rs = RandomizedSearchCV(crf, 
                        params_space,
                        cv=pds,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=21)


# try:
rs.fit(X, y)
# except AttributeError:
#     pass

Fitting 1 folds for each of 50 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   13.2s remaining:    0.0s


KeyboardInterrupt: 

In [None]:
print('best params:', rs.best_params_)
print('best CV score:', rs.best_score_)
print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

In [None]:
crf = rs.best_estimator_
y_pred = crf.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted(labels), digits=3
))

In [None]:
#entity level (strict) evaluation

print(classification_report(
    y_test, y_pred, digits=3
))

In [None]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-15:])

In [None]:
# !pip install eli5 -q
import eli5
eli5.show_weights(crf, top=10)

In [None]:
import joblib

# model_directory = '/trained_models/'
filename = 'CRF_with_words_itself_as_feature_Herodotos.sav'

joblib.dump(crf, 'trained_CRF/' + filename)

## Train another CRF without the words themselves as features, 

should generalize better according to Palladino et al. (Greek NER with minimal annotation)

In [None]:
def word2features(sent, i):
    word = sent[i][0]
    
    features = {
        'bias': 1.0, 
        # 'word.lower()': word.lower(), 
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True

    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]
def sent2labels(sent):
    return [label for token, label in sent]
def sent2tokens(sent):
    return [token for token, label in sent]

In [None]:
getter = SentenceGetter(train, sentence_col='to_group')
train_sentences = getter.sentences

X_train = [sent2features(s) for s in train_sentences]
y_train = [sent2labels(s) for s in train_sentences]

In [None]:
print(X_train[1][0])

In [29]:
getter = SentenceGetter(eva, sentence_col='to_group')
eval_sentences = getter.sentences

X_eval = [sent2features(s) for s in eval_sentences]
y_eval = [sent2labels(s) for s in eval_sentences]

In [30]:
getter = SentenceGetter(test, sentence_col='to_group')
test_sentences = getter.sentences

X_test = [sent2features(s) for s in test_sentences]
y_test = [sent2labels(s) for s in test_sentences]

In [31]:
#create an array/list which sets all the X_train samples to -1 and all the evaluation samples to 0
split_index = [-1]*len(X_train) + [0]*len(X_eval)
X = X_train + X_eval
y = y_train + y_eval

print(len(X))
print(len(y))
print(len(split_index))

pds = PredefinedSplit(test_fold = split_index)

6787
6787
6787


In [32]:
crf_no_words = sklearn_crfsuite.CRF(
    algorithm='lbfgs', #decided to continue with this because laptop and not a lot of data
    max_iterations=100,
    all_possible_transitions=True
)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs2 = RandomizedSearchCV(crf_no_words, params_space,
                        cv=pds,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer,
                        random_state=21)
rs2.fit(X, y)

Fitting 1 folds for each of 50 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 128 concurrent workers.


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.1s finished


RandomizedSearchCV(cv=PredefinedSplit(test_fold=array([-1, -1, ...,  0,  0])),
                   error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=None, c2=None,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 ep...
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7fdc2c532710>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7fdc2c5325c0>},
                   pre_dispatch='2*n_jobs', random_state=21, ref

In [33]:
print('best params:', rs2.best_params_)
print('best CV score:', rs2.best_score_)

best params: {'c1': 0.18258054250394934, 'c2': 0.0857633727161342}
best CV score: 0.8045138964866787


In [34]:
crf2 = rs2.best_estimator_
y_pred = crf2.predict(X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted(labels), digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-GRP      0.592     0.616     0.604       354
       B-LOC      0.670     0.511     0.580       305
      B-PERS      0.722     0.660     0.689       849
       I-GRP      0.000     0.000     0.000         3
       I-LOC      0.000     0.000     0.000         8
      I-PERS      0.637     0.939     0.759        99

   micro avg      0.673     0.635     0.654      1618
   macro avg      0.437     0.454     0.439      1618
weighted avg      0.673     0.635     0.650      1618



In [35]:
#entity level (strict) evaluation

print(classification_report(
    y_test, y_pred, digits=3
))

              precision    recall  f1-score   support

         GRP      0.587     0.610     0.598       354
         LOC      0.644     0.492     0.558       305
        PERS      0.688     0.629     0.657       849

   micro avg      0.654     0.597     0.624      1508
   macro avg      0.640     0.577     0.604      1508
weighted avg      0.655     0.597     0.623      1508



In [36]:
from collections import Counter

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(crf.transition_features_).most_common(10))

print("\nTop unlikely transitions:")
print_transitions(Counter(crf.transition_features_).most_common()[-15:])

Top likely transitions:
B-LOC  -> I-LOC   5.535743
B-PERS -> I-PERS  3.986924
O      -> O       2.782399
B-GRP  -> I-GRP   2.707117
I-GRP  -> I-GRP   2.365238
I-PERS -> I-PERS  1.642912
B-PERS -> B-GRP   1.505224
I-LOC  -> B-LOC   1.163241
B-LOC  -> B-LOC   0.924128
O      -> B-LOC   0.521981

Top unlikely transitions:
I-PERS -> B-PERS  -0.757775
I-LOC  -> B-PERS  -0.915598
B-GRP  -> O       -1.088971
I-GRP  -> O       -1.514433
B-LOC  -> B-GRP   -1.601298
B-LOC  -> B-PERS  -1.606000
B-GRP  -> B-PERS  -1.671393
I-LOC  -> B-GRP   -1.738646
B-GRP  -> I-PERS  -2.060752
B-GRP  -> I-LOC   -2.083738
O      -> I-GRP   -2.314579
I-PERS -> B-GRP   -2.897293
B-LOC  -> I-PERS  -3.153592
O      -> I-LOC   -4.536597
O      -> I-PERS  -4.654732


In [37]:
import eli5
eli5.show_weights(crf2, top=10)

#we see less more generic things in the top features bv. istitle by B-PERS
#still issues with bv "caesar" as 

From \ To,O,B-GRP,I-GRP,B-LOC,I-LOC,B-PERS,I-PERS
O,2.384,-0.301,-1.919,0.526,-2.189,0.154,-3.464
B-GRP,-1.126,-0.459,2.618,-0.319,0.0,-1.144,-1.276
I-GRP,-1.615,0.0,2.171,0.0,0.0,0.0,0.0
B-LOC,-0.335,-1.187,-0.097,1.238,5.583,-1.175,-1.894
I-LOC,-0.62,-0.838,0.0,1.524,0.0,-0.694,0.0
B-PERS,-0.161,0.884,-0.067,-0.037,0.0,-0.479,3.859
I-PERS,-0.905,-2.761,0.0,-0.544,0.0,-0.617,2.047

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6
+6.112,BOS,,,,,
+4.979,EOS,,,,,
+4.617,bias,,,,,
+4.272,"-1:word.lower():""",,,,,
+3.894,+1:word.lower():cleopatrae,,,,,
+3.842,word[-3:]:ula,,,,,
… 783 more positive …,… 783 more positive …,,,,,
… 572 more negative …,… 572 more negative …,,,,,
-4.186,word[-2:]:an,,,,,
-4.334,word[-3:]:ten,,,,,

Weight?,Feature
+6.112,BOS
+4.979,EOS
+4.617,bias
+4.272,"-1:word.lower():"""
+3.894,+1:word.lower():cleopatrae
+3.842,word[-3:]:ula
… 783 more positive …,… 783 more positive …
… 572 more negative …,… 572 more negative …
-4.186,word[-2:]:an
-4.334,word[-3:]:ten

Weight?,Feature
+4.367,-1:word.lower():arvernisque
+4.050,+1:word.lower():balanitae
+3.992,word.istitle()
+3.980,-1:word.lower():populo
+3.966,word[-2:]:os
+3.932,word[-3:]:tii
+3.923,-1:word.lower():timaeo
+3.794,word[-3:]:uus
+3.777,word[-3:]:dui
+3.709,word[-3:]:deo

Weight?,Feature
+2.711,-1:word.lower():volcis
+2.391,+1:word.lower():pactiles
+2.375,+1:word.lower():spectata
+2.239,-1:word.lower():populus
+2.190,-1:word.lower():-
+1.996,word[-3:]:cis
+1.994,-1:word.lower():volcarum
+1.994,+1:word.lower():depopulandos
+1.850,+1:word.lower():tibus
+1.850,-1:word.lower():tuder

Weight?,Feature
+4.770,+1:word.lower():inpari
+4.507,+1:word.lower():familiarissimum
+4.377,+1:word.lower():conpositum
+3.893,+1:word.lower():dicti
+3.762,-1:word.lower():trans
+3.694,word[-3:]:cte
+3.633,-1:word.lower():appellatur
+3.579,-1:word.lower():mercurius
+3.578,-1:word.lower():aestuosus
+3.505,-1:word.lower():flumen

Weight?,Feature
+3.671,-1:word.lower():citerioris
+3.511,-1:word.lower():aqua
+3.300,-1:word.lower():citeriore
+3.039,-1:word.lower():aegyptio
+2.965,-1:word.lower():ulteriorem
+2.871,-1:word.lower():castellum
+2.346,-1:word.lower():septem
+2.310,+1:word.lower():peractis
+2.192,-1:word.lower():rubri
+2.161,-1:word.lower():ulterioris

Weight?,Feature
+5.970,word[-3:]:sar
+5.911,-1:word.lower():plinius
+4.631,-1:word.lower():athe
+4.321,+1:word.lower():deflexus
+4.106,word.istitle()
+4.019,+1:word.lower():decurio
+3.930,-1:word.lower():m
+3.767,-1:word.lower():milites
+3.742,-1:word.lower():atheniensibus
+3.567,word[-3:]:'

Weight?,Feature
+3.588,-1:word.lower():.
+3.055,-1:word.isupper()
+2.817,-1:word.lower():cornelio
+2.693,+1:word.lower():c.
+2.542,-1:word.lower():autem
+2.487,+1:word.lower():svo
+2.487,word[-2:]:RI
+2.487,word[-3:]:ARI
+2.317,-1:word.lower():vespasiano
+2.228,word[-3:]:sti


In [38]:
sum(y_pred, [])

['O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'O',
 'B-PERS',
 'O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'I-PERS',
 'B-PERS',
 'I-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-LOC',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'B-GRP',
 'O',
 'O',
 'O',
 'B-PERS',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 'O',
 

In [39]:
test['CRF_predictions'] = sum(y_pred, [])
test['CRF_check'] = sum(y_test, [])
assert test['CRF_check'].values.tolist() == test['tag'].values.tolist()

In [40]:
test.drop(columns=['CRF_check', 'to_group'], inplace=True)

In [41]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31788 entries, 0 to 31787
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   word             31788 non-null  object
 1   tag              31788 non-null  object
 2   sentence         31788 non-null  int64 
 3   orig_text        31788 non-null  object
 4   sent_id          31788 non-null  object
 5   CRF_predictions  31788 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.7+ MB


In [42]:
test.to_csv('CRF_results_final.csv')

In [4]:
import pandas as pd
df = pd.read_csv('CRF_results_final.csv', index_col=0)

In [5]:
df.columns

Index(['word', 'tag', 'sentence', 'orig_text', 'sent_id', 'CRF_predictions'], dtype='object')

In [8]:
from sklearn.metrics import classification_report

print(classification_report(
    df['tag'], df['CRF_predictions'], digits=3
))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

       B-GRP      0.592     0.616     0.604       354
       B-LOC      0.670     0.511     0.580       305
      B-PERS      0.722     0.660     0.689       849
       I-GRP      0.000     0.000     0.000         3
       I-LOC      0.000     0.000     0.000         8
      I-PERS      0.637     0.939     0.759        99
           O      0.992     0.995     0.994     30170

    accuracy                          0.977     31788
   macro avg      0.516     0.532     0.518     31788
weighted avg      0.976     0.977     0.976     31788



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# filename = 'trained_CRF/CRF_without_words_itself_as_feature_Herodotos.sav'

# joblib.dump(crf2, filename)

['data/CRF_without_words_itself_as_feature_Herodotos.sav']