In [2]:
import pandas as pd
import numpy as np
import nltk
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics as crf_metrics
from sklearn.model_selection import train_test_split
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import gensim
import json 
from tqdm.notebook import tqdm
import pickle
import glob

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martijnschouten/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/martijnschouten/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
!pip install sklearn-crfsuite -U
!pip install -U 'scikit-learn<0.24'
!pip install gensim



In [14]:
train_df = pd.concat([pd.read_csv(file, sep='\t') for file in glob.glob('../data/train/*-tokens.tsv')])
train_df.sort_values('document_ID', inplace=True)
train_df.to_csv('../data/train-full.tsv', index=False, sep='\t')

# Prepare the data

In [16]:
train_df = pd.read_csv('../data/train-full.tsv', sep='\t')
validation_df = pd.read_csv('../data/validation-full.tsv', sep='\t')

In [17]:
train_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in train_df.iterrows()]
validation_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in validation_df.iterrows()]

### Ratio transformations

In [18]:
# Make new columns
train_df['total_occurences'] = 0
train_df['class_occurences'] = 0
train_df['attribute_occurences'] = 0
validation_df['total_occurences'] = 0
validation_df['class_occurences'] = 0
validation_df['attribute_occurences'] = 0

In [19]:
with open('../data/genmymodel/genmymodel_uml_extracted_metadata_final.json') as json_file:
    gmm_data = json.load(json_file)

# Store all classes and attributes independent of eachother
all_classes = []
all_attrs = []

# Loop over all metadata and append to proper list
for file, metadata in gmm_data.items():
    if 'classes' in metadata.keys():
        all_classes.append(metadata['classes'])

    if 'attributes' in metadata.keys():
        all_attrs.append(metadata['attributes'])

flatten = lambda t: [item for sublist in t for item in sublist]

all_classes = flatten(all_classes)
all_attrs = flatten(all_attrs)

In [21]:
noungroup = []
noungroup_indices = []

for index, row in tqdm(validation_df.iterrows()):
    if isinstance(row['fine_POS_tag'], str) and row['fine_POS_tag'][:2] == 'NN':
        noungroup.append(row['word'])
        noungroup_indices.append(index)
    else:
        if len(noungroup) == 0:
            continue
        else:
            full_ng = ' '.join(noungroup).lower()
            attr_no = all_attrs.count(full_ng)
            class_no = all_classes.count(full_ng)
            
            for noun_index in noungroup_indices:
                validation_df.loc[noun_index, ['class_occurences', 'attribute_occurences', 'total_occurences']] = [class_no, attr_no, attr_no + class_no]
                
            noungroup = []
            noungroup_indices = []

0it [00:00, ?it/s]

### Prepare IOB format

In [24]:
columns = ['doc-sent', 'word', 'lemma', 'POS_tag', 'fine_POS_tag', 'dependency_relation', 'event', 'supersense_category', 'entity', 'entity_type', 'entity_category', 'total_occurences', 'class_occurences', 'attribute_occurences', 'IOB_tag']
train_df = train_df[columns]
validation_df = validation_df[columns]

In [25]:
agg_func = lambda s: list(map(lambda w: tuple(w), s.loc[:, s.columns != 'doc-sent'].values.tolist()))

In [26]:
train_grouped_df = train_df.groupby('doc-sent').apply(agg_func)
validation_grouped_df = validation_df.groupby('doc-sent').apply(agg_func)

train_sentences = [s for s in train_grouped_df]
validation_sentences = [s for s in validation_grouped_df]

In [27]:
train_grouped_df

doc-sent
0-0      [(This, this, PRON, DT, nsubj, O, nan, nan, na...
0-1      [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
0-10     [(Section, section, NOUN, NN, nsubj, O, noun.c...
0-100    [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
0-101    [(text, text, NOUN, NN, compound, O, noun.comm...
                               ...                        
9-95     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-96     [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
9-97     [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
9-98     [(system, system, NOUN, NN, nsubj, O, noun.art...
9-99     [(., ., PUNCT, ., punct, O, nan, nan, nan, nan...
Length: 3130, dtype: object

In [28]:
train_df.dropna(subset=['doc-sent'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(subset=['doc-sent'], inplace=True)


In [29]:
# fastText model for embedding generation
vocab = train_df['word'].values.tolist() + validation_df['word'].values.tolist()
model = gensim.models.FastText(vocab, min_count=1)

In [30]:
pickle.dump(model, open('fasttext-model.pkl', 'wb'))

In [31]:
def word2features(sent, i, embedding, ratio):
    word = sent[i][1]
    postag = sent[i][3]
    fine_postag = sent[i][4]
    
    features = {
        label: data
        for label, data in zip(columns[1:-1], sent[i][:-1])
    }
    
    features.update({
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag[:2]': postag[:2],
        'postag[:2]': postag[:2],
        'finepostag[:2]': fine_postag[:2],
        'finepostag[:2]': fine_postag[:2],
    })
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:finepostag': finepostag1,
            '-1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][1]
        postag1 = sent[i+1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:finepostag': finepostag1,
            '+1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['EOS'] = True
    
    if not ratio:
        for ratio_feature in ['total_occurences', 'class_occurences', 'attribute_occurences']:
            del features[ratio_feature]
        
    if embedding:
        word_embedding = model.wv.get_vector(word)
        
        features.update({
            f'emb_pos_{i}': word_embedding[i]
            for i in range(len(word_embedding))
        })

    return features


def sent2features(sent, embedding = False, ratio = False):
    return [word2features(sent, i, embedding, ratio) for i in range(len(sent))]

def sent2labels(sent):
    return list(map(lambda s: s[-1], sent))

In [32]:
sent2features(validation_sentences[0][:7])

[{'word': 'The',
  'lemma': 'the',
  'POS_tag': 'DET',
  'fine_POS_tag': 'DT',
  'dependency_relation': 'det',
  'event': 'O',
  'supersense_category': nan,
  'entity': 1.0,
  'entity_type': 'NOM',
  'entity_category': 'FAC',
  'word.lower()': 'the',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'word[-3:]': 'the',
  'word[-2:]': 'he',
  'postag[:2]': 'DT',
  'finepostag[:2]': 'de',
  'BOS': True,
  '+1:word.lower()': 'clinic',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:finepostag': 'conj',
  '+1:finepostag[:2]': 'co'},
 {'word': 'clinic',
  'lemma': 'clinic',
  'POS_tag': 'NOUN',
  'fine_POS_tag': 'NN',
  'dependency_relation': 'nsubj',
  'event': 'O',
  'supersense_category': 'noun.group',
  'entity': 1.0,
  'entity_type': 'NOM',
  'entity_category': 'FAC',
  'word.lower()': 'clinic',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'word[-3:]': 

# Default model

In [33]:
X_train = np.array([sent2features(s) for s in train_sentences])
X_test = np.array([sent2features(s) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

  X_train = np.array([sent2features(s) for s in train_sentences])
  X_test = np.array([sent2features(s) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [34]:
labels = list(train_df['IOB_tag'].unique())
labels.remove('O')

In [35]:
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

In [36]:
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

In [37]:
# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   46.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.9min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe2222dd5e0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe2579f2550>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [38]:
y_pred = rs.predict(X_test)

In [39]:
labels

['B-class', 'I-class', 'B-attr', 'I-attr']

In [40]:
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.633     0.377     0.472       215
     I-class      0.571     0.235     0.333        85
      B-attr      0.824     0.300     0.440       140
      I-attr      0.744     0.235     0.358       136

   micro avg      0.681     0.304     0.420       576
   macro avg      0.693     0.287     0.401       576
weighted avg      0.696     0.304     0.417       576





In [41]:
[[[word['word'], pred] for word, pred in zip(sent, predictions)] for sent, predictions in zip(X_test, y_pred)]

[[['The', 'O'],
  ['clinic', 'B-class'],
  ['basically', 'O'],
  ['schedules', 'O'],
  ['patients', 'B-class'],
  [',', 'O'],
  ['provides', 'O'],
  ['services', 'O'],
  ['for', 'O'],
  ['them', 'O'],
  [',', 'O'],
  ['and', 'O'],
  ['bills', 'O'],
  ['them', 'O'],
  ['for', 'O'],
  ['those', 'O'],
  ['services', 'O'],
  ['.', 'O']],
 [['New', 'O'],
  ['patients', 'B-class'],
  ['fill', 'O'],
  ['out', 'O'],
  ['a', 'O'],
  ['form', 'O'],
  ['listing', 'O'],
  ['their', 'O'],
  ['name', 'B-attr'],
  [',', 'O'],
  ['address', 'B-attr'],
  [',', 'O'],
  ['telephone', 'B-attr'],
  ['numbers', 'I-attr'],
  [',', 'O'],
  ['allergies', 'O'],
  [',', 'O'],
  ['and', 'O'],
  ['state', 'O'],
  ['of', 'O'],
  ['mind', 'O'],
  ['prior', 'O'],
  ['to', 'O'],
  ['scheduling', 'O'],
  ['their', 'O'],
  ['first', 'O'],
  ['appointment', 'O'],
  ['.', 'O']],
 [['Billing', 'O'],
  ['is', 'O'],
  ['always', 'O'],
  ['done', 'O'],
  ['by', 'O'],
  ['the', 'O'],
  ['month', 'O'],
  [',', 'O'],
  ['and', '

# Default model + fastText

In [42]:
X_train = np.array([sent2features(s, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, embedding = True) for s in train_sentences])
  X_test = np.array([sent2features(s, embedding = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [43]:
rs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 11.5min




[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 50.5min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe2579416a0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe25071d880>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [44]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.588     0.358     0.445       215
     I-class      0.472     0.200     0.281        85
      B-attr      0.741     0.307     0.434       140
      I-attr      0.705     0.228     0.344       136

   micro avg      0.625     0.292     0.398       576
   macro avg      0.626     0.273     0.376       576
weighted avg      0.636     0.292     0.394       576



# Default model + class/attribute ratio

In [45]:
X_train = np.array([sent2features(s, ratio = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, ratio = True) for s in train_sentences])
  X_test = np.array([sent2features(s, ratio = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [46]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   50.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  3.1min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe241c7f520>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe242159fd0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [47]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.580     0.353     0.439       215
     I-class      0.472     0.200     0.281        85
      B-attr      0.764     0.300     0.431       140
      I-attr      0.721     0.228     0.346       136

   micro avg      0.626     0.288     0.395       576
   macro avg      0.634     0.270     0.374       576
weighted avg      0.642     0.288     0.392       576



# All features together

In [48]:
X_train = np.array([sent2features(s, ratio = True, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, ratio = True, embedding = True) for s in train_sentences])
  X_test = np.array([sent2features(s, ratio = True, embedding = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [49]:
rs.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 12.4min


[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 51.7min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe2420d3d90>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fe1d222db20>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [50]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.585     0.367     0.451       215
     I-class      0.472     0.200     0.281        85
      B-attr      0.816     0.286     0.423       140
      I-attr      0.744     0.235     0.358       136

   micro avg      0.639     0.292     0.400       576
   macro avg      0.654     0.272     0.378       576
weighted avg      0.662     0.292     0.397       576





In [51]:
pickle.dump(rs, open('model.pkl', 'wb'))