In [108]:
import pandas as pd
import numpy as np
import nltk
from sklearn_crfsuite import CRF
from sklearn_crfsuite import metrics as crf_metrics
from sklearn.model_selection import train_test_split
import scipy
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer
import gensim
import json 
from tqdm.notebook import tqdm
import pickle

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/martijnschouten/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/martijnschouten/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
!pip install sklearn-crfsuite -U
!pip install -U 'scikit-learn<0.24'
!pip install gensim



# Prepare the data

In [3]:
train_df = pd.read_csv('../data/train-full.tsv', sep='\t')
validation_df = pd.read_csv('../data/validation-full.tsv', sep='\t')

In [None]:
train_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in train_df.iterrows()]
validation_df['doc-sent'] = [str(row.document_ID) + '-' + str(row.sentence_ID) for index, row in validation_df.iterrows()]

### Ratio transformations

In [75]:
# Make new columns
train_df['total_occurences'] = 0
train_df['class_occurences'] = 0
train_df['attribute_occurences'] = 0
validation_df['total_occurences'] = 0
validation_df['class_occurences'] = 0
validation_df['attribute_occurences'] = 0

In [72]:
with open('../data/genmymodel/genmymodel_uml_extracted_metadata_final.json') as json_file:
    gmm_data = json.load(json_file)

# Store all classes and attributes independent of eachother
all_classes = []
all_attrs = []

# Loop over all metadata and append to proper list
for file, metadata in gmm_data.items():
    if 'classes' in metadata.keys():
        all_classes.append(metadata['classes'])

    if 'attributes' in metadata.keys():
        all_attrs.append(metadata['attributes'])

flatten = lambda t: [item for sublist in t for item in sublist]

all_classes = flatten(all_classes)
all_attrs = flatten(all_attrs)

In [76]:
noungroup = []
noungroup_indices = []

for index, row in tqdm(train_df.iterrows()):
    if isinstance(row['fine_POS_tag'], str) and row['fine_POS_tag'][:2] == 'NN':
        noungroup.append(row['word'])
        noungroup_docsents.append(index)
    else:
        if len(noungroup) == 0:
            continue
        else:
            full_ng = ' '.join(noungroup).lower()
            attr_no = all_attrs.count(full_ng)
            class_no = all_classes.count(full_ng)
            
            for noun_index in noungroup_docsents:
                train_df.loc[noun_index, ['class_occurences', 'attribute_occurences', 'total_occurences']] = [class_no, attr_no, attr_no + class_no]
                
            noungroup = []
            noungroup_docsents = []

0it [00:00, ?it/s]

In [77]:
noungroup = []
noungroup_indices = []

for index, row in tqdm(validation_df.iterrows()):
    if isinstance(row['fine_POS_tag'], str) and row['fine_POS_tag'][:2] == 'NN':
        noungroup.append(row['word'])
        noungroup_indices.append(index)
    else:
        if len(noungroup) == 0:
            continue
        else:
            full_ng = ' '.join(noungroup).lower()
            attr_no = all_attrs.count(full_ng)
            class_no = all_classes.count(full_ng)
            
            for noun_index in noungroup_indices:
                validation_df.loc[noun_index, ['class_occurences', 'attribute_occurences', 'total_occurences']] = [class_no, attr_no, attr_no + class_no]
                
            noungroup = []
            noungroup_indices = []

0it [00:00, ?it/s]

### Prepare IOB format

In [78]:
columns = ['doc-sent', 'word', 'lemma', 'POS_tag', 'fine_POS_tag', 'dependency_relation', 'event', 'supersense_category', 'entity', 'entity_type', 'entity_category', 'total_occurences', 'class_occurences', 'attribute_occurences', 'IOB_tag']
train_df = train_df[columns]
validation_df = validation_df[columns]

In [79]:
agg_func = lambda s: list(map(lambda w: tuple(w), s.loc[:, s.columns != 'doc-sent'].values.tolist()))

In [80]:
train_grouped_df = train_df.groupby('doc-sent').apply(agg_func)
validation_grouped_df = validation_df.groupby('doc-sent').apply(agg_func)

train_sentences = [s for s in train_grouped_df]
validation_sentences = [s for s in validation_grouped_df]

In [81]:
train_grouped_df

doc-sent
0-0      [(This, this, PRON, DT, nsubj, O, nan, nan, na...
0-1      [(The, the, DET, DT, det, O, nan, nan, nan, na...
0-10     [(Section, section, NOUN, NN, nsubj, O, noun.c...
0-100    [(The, the, DET, DT, det, O, nan, nan, nan, na...
0-101    [(Metadata, Metadata, PROPN, NNP, nmod, O, nou...
                               ...                        
9-95     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-96     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-97     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-98     [(The, the, DET, DT, det, O, nan, nan, nan, na...
9-99     [(The, the, DET, DT, det, O, nan, nan, nan, na...
Length: 2639, dtype: object

In [89]:
train_df.dropna(subset=['doc-sent'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.dropna(subset=['doc-sent'], inplace=True)


In [90]:
# fastText model for embedding generation
vocab = train_df['word'].values.tolist() + validation_df['word'].values.tolist()
model = gensim.models.FastText(vocab, min_count=1)

In [None]:
pickle.dump(model, open('fasttext-model.pkl', 'wb'))

In [94]:
def word2features(sent, i, embedding, ratio):
    word = sent[i][1]
    postag = sent[i][3]
    fine_postag = sent[i][4]
    
    features = {
        label: data
        for label, data in zip(columns[1:-1], sent[i][:-1])
    }
    
    features.update({
        'word.lower()': word.lower(),
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'postag[:2]': postag[:2],
        'postag[:2]': postag[:2],
        'finepostag[:2]': fine_postag[:2],
        'finepostag[:2]': fine_postag[:2],
    })
    if i > 0:
        word1 = sent[i-1][1]
        postag1 = sent[i-1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
            '-1:finepostag': finepostag1,
            '-1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][1]
        postag1 = sent[i+1][3]
        finepostag1 = sent[i-1][4]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
            '+1:finepostag': finepostag1,
            '+1:finepostag[:2]': finepostag1[:2],
        })
    else:
        features['EOS'] = True
    
    if not ratio:
        for ratio_feature in ['total_occurences', 'class_occurences', 'attribute_occurences']:
            del features[ratio_feature]
        
    if embedding:
        word_embedding = model.wv.get_vector(word)
        
        features.update({
            f'emb_pos_{i}': word_embedding[i]
            for i in range(len(word_embedding))
        })

    return features


def sent2features(sent, embedding = False, ratio = False):
    return [word2features(sent, i, embedding, ratio) for i in range(len(sent))]

def sent2labels(sent):
    return list(map(lambda s: s[-1], sent))

In [97]:
sent2features(validation_sentences[0][:7])

[{'word': 'The',
  'lemma': 'the',
  'POS_tag': 'DET',
  'fine_POS_tag': 'DT',
  'dependency_relation': 'det',
  'event': 'O',
  'supersense_category': nan,
  'entity': 1.0,
  'entity_type': 'NOM',
  'entity_category': 'FAC',
  'word.lower()': 'the',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'word[-3:]': 'the',
  'word[-2:]': 'he',
  'postag[:2]': 'DT',
  'finepostag[:2]': 'de',
  'BOS': True,
  '+1:word.lower()': 'clinic',
  '+1:word.istitle()': False,
  '+1:word.isupper()': False,
  '+1:postag': 'NN',
  '+1:postag[:2]': 'NN',
  '+1:finepostag': 'conj',
  '+1:finepostag[:2]': 'co'},
 {'word': 'clinic',
  'lemma': 'clinic',
  'POS_tag': 'NOUN',
  'fine_POS_tag': 'NN',
  'dependency_relation': 'nsubj',
  'event': 'O',
  'supersense_category': 'noun.group',
  'entity': 1.0,
  'entity_type': 'NOM',
  'entity_category': 'FAC',
  'word.lower()': 'clinic',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'word[-3:]': 

# Default model

In [12]:
X_train = np.array([sent2features(s) for s in train_sentences])
X_test = np.array([sent2features(s) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

  X_train = np.array([sent2features(s) for s in train_sentences])
  X_test = np.array([sent2features(s) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [99]:
labels = list(train_df['IOB_tag'].unique())
labels.remove('O')

In [15]:
crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

In [16]:
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

In [17]:
# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   32.5s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.1min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7faecb1cd880>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7faecb1cdc40>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [19]:
y_pred = rs.predict(X_test)

In [22]:
labels

['B-class', 'I-class', 'B-attr', 'I-attr']

In [23]:
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.628     0.274     0.382       215
     I-class      0.591     0.153     0.243        85
      B-attr      0.677     0.300     0.416       140
      I-attr      0.730     0.338     0.462       136

   micro avg      0.664     0.278     0.392       576
   macro avg      0.657     0.266     0.376       576
weighted avg      0.659     0.278     0.389       576





In [24]:
[[[word['word'], pred] for word, pred in zip(sent, predictions)] for sent, predictions in zip(X_test, y_pred)]

[[['The', 'O'],
  ['clinic', 'B-class'],
  ['basically', 'O'],
  ['schedules', 'O'],
  ['patients', 'B-class'],
  [',', 'O'],
  ['provides', 'O'],
  ['services', 'O'],
  ['for', 'O'],
  ['them', 'O'],
  [',', 'O'],
  ['and', 'O'],
  ['bills', 'O'],
  ['them', 'O'],
  ['for', 'O'],
  ['those', 'O'],
  ['services', 'O'],
  ['.', 'O']],
 [['New', 'O'],
  ['patients', 'B-class'],
  ['fill', 'O'],
  ['out', 'O'],
  ['a', 'O'],
  ['form', 'O'],
  ['listing', 'O'],
  ['their', 'O'],
  ['name', 'B-attr'],
  [',', 'O'],
  ['address', 'B-attr'],
  [',', 'O'],
  ['telephone', 'B-attr'],
  ['numbers', 'I-attr'],
  [',', 'O'],
  ['allergies', 'O'],
  [',', 'O'],
  ['and', 'O'],
  ['state', 'O'],
  ['of', 'O'],
  ['mind', 'O'],
  ['prior', 'O'],
  ['to', 'O'],
  ['scheduling', 'O'],
  ['their', 'O'],
  ['first', 'O'],
  ['appointment', 'O'],
  ['.', 'O']],
 [['Billing', 'O'],
  ['is', 'O'],
  ['always', 'O'],
  ['done', 'O'],
  ['by', 'O'],
  ['the', 'O'],
  ['month', 'O'],
  [',', 'O'],
  ['and', '

# Default model + fastText

In [42]:
X_train = np.array([sent2features(s, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, embedding = True) for s in train_sentences])
  X_test = np.array([sent2features(s, embedding = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [44]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 42.9min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7faecb299f40>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7faeb81fc100>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [45]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.632     0.279     0.387       215
     I-class      0.560     0.165     0.255        85
      B-attr      0.667     0.300     0.414       140
      I-attr      0.754     0.316     0.446       136

   micro avg      0.662     0.276     0.390       576
   macro avg      0.653     0.265     0.375       576
weighted avg      0.659     0.276     0.388       576





# Default model + class/attribute ratio

In [100]:
X_train = np.array([sent2features(s, ratio = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, ratio = True) for s in train_sentences])
  X_test = np.array([sent2features(s, ratio = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [101]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  2.9min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc5e111ebe0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc692e8e670>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [102]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.623     0.307     0.411       215
     I-class      0.255     0.153     0.191        85
      B-attr      0.684     0.186     0.292       140
      I-attr      0.821     0.169     0.280       136

   micro avg      0.574     0.222     0.320       576
   macro avg      0.596     0.204     0.294       576
weighted avg      0.630     0.222     0.319       576





# All features together

In [103]:
X_train = np.array([sent2features(s, ratio = True, embedding = True) for s in train_sentences])
X_test = np.array([sent2features(s, ratio = True, embedding = True) for s in validation_sentences])
y_train = np.array([sent2labels(s) for s in train_sentences])
y_test = np.array([sent2labels(s) for s in validation_sentences])

crf = CRF(
    algorithm='lbfgs',
    max_iterations=100,
    all_possible_transitions=True
)

params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(crf_metrics.flat_f1_score, average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(crf, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)

  X_train = np.array([sent2features(s, ratio = True, embedding = True) for s in train_sentences])
  X_test = np.array([sent2features(s, ratio = True, embedding = True) for s in validation_sentences])
  y_train = np.array([sent2labels(s) for s in train_sentences])
  y_test = np.array([sent2labels(s) for s in validation_sentences])


In [106]:
rs.fit(X_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed: 10.1min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed: 42.9min finished


RandomizedSearchCV(cv=3,
                   estimator=CRF(algorithm='lbfgs',
                                 all_possible_transitions=True,
                                 keep_tempfiles=None, max_iterations=100),
                   n_iter=50, n_jobs=-1,
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc676d3ee80>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7fc5f8b634c0>},
                   scoring=make_scorer(flat_f1_score, average=weighted, labels=['B-class', 'I-class', 'B-attr', 'I-attr']),
                   verbose=1)

In [107]:
y_pred = rs.predict(X_test)
print(crf_metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

     B-class      0.651     0.321     0.430       215
     I-class      0.425     0.200     0.272        85
      B-attr      0.795     0.221     0.346       140
      I-attr      0.844     0.199     0.321       136

   micro avg      0.664     0.250     0.363       576
   macro avg      0.679     0.235     0.342       576
weighted avg      0.698     0.250     0.361       576



In [None]:
ratio
              precision    recall  f1-score   support

     B-class      0.623     0.307     0.411       215
     I-class      0.255     0.153     0.191        85
      B-attr      0.684     0.186     0.292       140
      I-attr      0.821     0.169     0.280       136

   micro avg      0.574     0.222     0.320       576
   macro avg      0.596     0.204     0.294       576
weighted avg      0.630     0.222     0.319       576

In [None]:
fasttext
              precision    recall  f1-score   support

     B-class      0.632     0.279     0.387       215
     I-class      0.560     0.165     0.255        85
      B-attr      0.667     0.300     0.414       140
      I-attr      0.754     0.316     0.446       136

   micro avg      0.662     0.276     0.390       576
   macro avg      0.653     0.265     0.375       576
weighted avg      0.659     0.276     0.388       576

In [None]:
default
              precision    recall  f1-score   support

     B-class      0.628     0.274     0.382       215
     I-class      0.591     0.153     0.243        85
      B-attr      0.677     0.300     0.416       140
      I-attr      0.730     0.338     0.462       136

   micro avg      0.664     0.278     0.392       576
   macro avg      0.657     0.266     0.376       576
weighted avg      0.659     0.278     0.389       576



In [110]:
pickle.dump(rs, open('model.pkl', 'wb'))