In [14]:
import numpy as np
from matstract.nlp.ner_features import FeatureGenerator
from matstract.models.AnnotationBuilder import AnnotationBuilder

# Guide for using matstract.nlp.ner_features.FeatureGenerator

### Load the annotated data

In [5]:
builder = AnnotationBuilder()
annotations = builder.get_annotations(user='leighmi6')
annotations = [annotated.to_iob()[0] for annotated in annotations]
annotations = [[[((word, pos), tag) for word, pos, tag in sent] for sent in doc] for doc in annotations] #this line makes my code compatible with Vahe's

### Fit the features

In [9]:
feature_generator = FeatureGenerator()
features, outcomes = feature_generator.fit_transform(annotations)

The features are returned as a scipy sparse array. Any categorical features are binary encoded and any numerical features have been normalized. The outcomes are returned as a list.

FeatureGenerator can also return train and test sets for assessing ML models (see below)

### Example usage

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

(X_train, y_train), (X_test, y_test) = feature_generator.train_test_set
best_accuracy = 0
for C_param in np.logspace(-3, 3, 7):
    clf = LogisticRegression(C = C_param)
    clf.fit(X_train, y_train)
    current_accuracy = accuracy_score(clf.predict(X_test), y_test)
    print('accuracy: {}'.format(current_accuracy))
    if current_accuracy > best_accuracy:
        best_C = C_param
clf = LogisticRegression(C = best_C)
clf.fit(X_train, y_train)
print('###########')
print('f1 scores:')
for label in clf.classes_:
    print(label, f1_score(clf.predict(X_test), y_test, average = None, labels = label)[0])

accuracy: 0.6545660805445264
accuracy: 0.6647759500850823
accuracy: 0.7430516165626773
accuracy: 0.7816222348269994
accuracy: 0.7884288145207033
accuracy: 0.784458309699376
accuracy: 0.7855927396483268
###########
f1 scores:
B-APL 0.0
B-CHM 0.1978021978021978
B-CMT 0.38461538461538464
B-CON 0.21276595744680854
B-CUT 0.8571428571428571
B-CVL 0.3
B-DSC 0.36781609195402304
B-MAT 0.712121212121212
B-PMT 0.0
B-PRO 0.23655913978494625
B-PUT 0.9166666666666666
B-PVL 0.5625000000000001
B-QUA 0.16216216216216217
B-REF 0.25
B-SMT 0.0
B-SPL 0.0
I-APL 0.761904761904762
I-CHM 0.15384615384615385
I-CMT 0.8818897637795277
I-CON 0.8333333333333334
I-CUT 1.0
I-CVL 0.7272727272727273
I-DSC 0.8717948717948718
I-MAT 0.2222222222222222
I-PRO 0.7032967032967034
I-PUT 0.4
I-PVL 1.0
I-QUA 0.6956521739130435
I-REF 0.0
I-SMT 0.631578947368421
I-SPL 0.0
O 0.9040424663127807
