In [214]:
from itertools import chain
import nltk
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer
import sklearn
import pycrfsuite
import pandas as pd
import numpy as np

# Import csv from preprocessing - generated with python script

In [215]:
# Load the CSV produced by the preprocessing script into a DataFrame.
# Later cells read its `paragraph`, `sentence`, `pos` and `IOB` columns.
test = pd.read_csv('../identification.csv')

In [237]:
# Set the `essay` column to the same file name on every row.
# This modifies `test` in place, so all later cells see the extra column.
test['essay'] = 'essay01.txt'

In [238]:
# Write the frame (now including the `essay` column) to a CSV in the current
# working directory, without the index column.
# NOTE(review): the output name is spelled 'indentification2.csv' while the
# input is 'identification.csv' -- confirm the spelling is intended.
test.to_csv('indentification2.csv', index=False)

In [243]:
# Inspect the entries of the `pos` column that are equal to "MD".
test.loc[test["pos"] == "MD", "pos"]

6      MD
85     MD
103    MD
154    MD
203    MD
301    MD
363    MD
388    MD
Name: pos, dtype: object

## Preprocess data from csv/DataFrame

In [217]:
# Build one feature sequence (a list of per-row dicts) and one label sequence
# per sentence; a sentence is identified by its (paragraph, sentence) pair.
sent_x = []
sent_y = []
# groupby visits each (paragraph, sentence) pair exactly once, in sorted key
# order, so the order of the sequences -- and therefore the positional
# train/test split further down -- is reproducible, unlike iterating a set.
for _, sentence_rows in test.groupby(["paragraph", "sentence"]):
    # Every column except the gold IOB tag is passed through as a feature.
    # "records" is the documented orient name; the abbreviation "record" is
    # deprecated and rejected by newer pandas releases.
    sent_x.append(sentence_rows.drop(columns="IOB").to_dict("records"))
    sent_y.append(sentence_rows.IOB.values)

  sent_x.append(temp_test.loc[:, temp_test.columns != 'IOB'].to_dict("record"))


## Split data - informal for now and only on essay001

In [218]:
# Informal positional split: the first N_TRAIN_SENTENCES sentences are used
# for training and all remaining sentences are held out for evaluation.
N_TRAIN_SENTENCES = 10
X_train, y_train = sent_x[:N_TRAIN_SENTENCES], sent_y[:N_TRAIN_SENTENCES]
X_test, y_test = sent_x[N_TRAIN_SENTENCES:], sent_y[N_TRAIN_SENTENCES:]

## Initialize CRF trainer

In [219]:
# Create the CRF trainer with verbose output disabled; training sequences and
# parameters are attached to it in the following cells.
trainer = pycrfsuite.Trainer(verbose=False)


### add training data

In [220]:
# Register every training sentence with the trainer: the per-token feature
# dicts together with the matching sequence of gold tags.
for features, labels in zip(X_train, y_train):
    trainer.append(features, labels)

In [221]:
# Training hyper-parameters, collected first and then handed to the trainer
# in a single call.
crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

## Train and save to file

In [222]:
# Train on the sequences appended above and write the model to this file.
# NOTE(review): the name 'conll2002-esp.crfsuite' does not describe the essay
# data used here -- it looks carried over from an example; confirm before
# renaming, since the loading cell opens the same path.
trainer.train('conll2002-esp.crfsuite')

## Load model from file

In [223]:
# Load the trained model back from disk into a fresh tagger.
tagger = pycrfsuite.Tagger()
# Must name the same file the training cell wrote. open() is the last
# expression of the cell, so its return value is echoed as the cell output.
tagger.open('conll2002-esp.crfsuite')

<contextlib.closing at 0x7fe3b8e3d610>

## Test

In [235]:
# Spot-check a single held-out sentence: predicted tags next to the gold tags.
sample_idx = 3
predicted_tags = tagger.tag(X_test[sample_idx])
gold_tags = y_test[sample_idx]
print("Predicted:", ' '.join(predicted_tags))
print("Correct:  ", ' '.join(gold_tags))

Predicted: Arg-B Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I Arg-I O
Correct:   O O O O O O O O O O O O O O


In [225]:
# Tag every held-out sentence; y_pred[k] is the predicted tag sequence for
# X_test[k]. (The loop index of the earlier enumerate() was never used.)
y_pred = [tagger.tag(xseq) for xseq in X_test]


In [226]:
# Binarizer used below to turn tag strings into indicator columns so the
# flattened gold and predicted tags can be passed to classification_report.
# NOTE(review): with only two distinct tags LabelBinarizer emits a single
# column, which would not line up with the per-class indices used by the
# report cell -- confirm at least three tags occur in y_test.
lb = LabelBinarizer()

In [227]:
# Flatten the per-sentence gold tags into one token-level list, then fit the
# binarizer on it and encode it in the same step.
gold_tags_flat = [tag for sentence_tags in y_test for tag in sentence_tags]
y_true_combined = lb.fit_transform(gold_tags_flat)

In [228]:
# Flatten the per-sentence predictions the same way and encode them with the
# binarizer that was fitted on the gold tags.
predicted_tags_flat = [tag for sentence_tags in y_pred for tag in sentence_tags]
y_pred_combined = lb.transform(predicted_tags_flat)


In [229]:
# Map each tag to its column position in the binarized label matrices.
class_indices = {label: column for column, label in enumerate(lb.classes_)}

# Order tags by the part after the first '-' and then by the part before it;
# a tag without '-' is compared by its full text only.
tagset = sorted(set(lb.classes_), key=lambda tag: tag.split('-', 1)[::-1])

# Per-tag precision/recall/F1, with rows listed in the order of `tagset`.
report = classification_report(
    y_true_combined,
    y_pred_combined,
    labels=[class_indices[tag] for tag in tagset],
    target_names=tagset,
)

In [230]:
# `report` is a preformatted multi-line table; print it so the line breaks
# and column alignment are rendered instead of an escaped string repr.
print(report)

              precision    recall  f1-score   support

       Arg-B       0.40      0.50      0.44         4
       Arg-I       0.79      0.88      0.83       102
           O       0.70      0.54      0.61        57

   micro avg       0.75      0.75      0.75       163
   macro avg       0.63      0.64      0.63       163
weighted avg       0.75      0.75      0.75       163
 samples avg       0.75      0.75      0.75       163

