In [36]:
import pandas as pd
import glob
import errno
import nltk
from itertools import chain
from itertools import groupby
from operator import itemgetter
import re

import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
import numpy as np
import random
random.seed(42)
import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Data Creation

In [37]:
path = 'Training_Data/*.csv'

files = glob.glob(path)
train=[]
names=[]
for name in files:
    try:
        df=pd.read_csv(name,index_col=0)
        df['w']=df['w'].replace({'\n':'#n'})
        
        df['pos']=df['w'].apply(lambda x:nltk.pos_tag([str(x)])[0][1])
        tuple1 = [tuple(x) for x in df.values]
        train.append(tuple1)
        
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise


In [38]:
len(train)

195

In [39]:
path = 'Retraining_Data/*.csv'

files = glob.glob(path)

for name in files:
    try:
        df=pd.read_csv(name,index_col=0)
        df['w']=df['w'].replace({'\n':'#n'})
        
        df['pos']=df['w'].apply(lambda x:nltk.pos_tag([str(x)])[0][1])
        tuple1 = [tuple(x) for x in df.values]
        train.append(tuple1)
        
    except IOError as exc:
        if exc.errno != errno.EISDIR:
            raise


In [40]:
len(train)

197

In [41]:
import json

with open('crf_module_vocab.json') as fp:
    crf_module_vocab=json.load( fp)

In [42]:
crf_module_vocab.keys()

dict_keys(['sub_names', 'suffix', 'chars', 'starts', 'range_cat'])

In [43]:
range_category=crf_module_vocab['range_cat']

In [13]:
def is_name(word):
    if any([ word.upper().strip().endswith(i) for i in crf_module_vocab['suffix']]):
           return True
    elif any([ word.upper().strip().startswith(i) for i in crf_module_vocab['starts']]):
           return True
    elif any([i in word.upper().strip() for i in crf_module_vocab['sub_names']]):
           return True
    
    else:
           return False
    

In [14]:
def is_unit(word):
#     if word in units_list:
#         return True
    if word.upper().strip().endswith('/L'):
        return True
    elif re.match(r'(^(10)\s?\^\s?[1-9]\s?(/[Uu]?[lL])?$)|(^(10)?\s?~?\s?\d/[Uu]?[Ll]$)',word):
        return True  
    else:
        return False

In [15]:
range_list=['-','>','<','-—','—-','=','–','Up to 15', 'Up']
def is_range(word):
    if word in range_list:
        return True
    elif re.match(r'^\d*\.?\d*\s?[-|—|>|<|=|-—|–|-]\s?\d+\.?\d*$',word):
        return True
    else:
        return False

In [16]:

def is_range_cat(word):
    if word in range_category:
        return True
    
    else:
        return False

In [17]:
def word2features(sent, i):
    word = str(sent[i][0])
    #print(type(word))
    postag = sent[i][2]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
         'postag[:2]': postag[:2],
         'word[+3:]': word[+3:],
             'word[+2:]': word[+2:],
      #   'is_unit()': is_unit(word),
         'is_range_cat()': is_range_cat(word),
        'is_name()': is_name(word),
        #'length':len(word)
        
    }
    if i > 0:
        word1 = str(sent[i-1][0])
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:word.isdigit()': word1.isdigit(),
           '-1:is_name()': is_name(word1),
           # '-1:is_range_cat()': is_range_cat(word1),
          #  '-1:length':len(word1)
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = str(sent[i+1][0])
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
             '+1:word.isdigit()': word1.isdigit(),
               '+1:is_name()': is_name(word1),
          #  '+1:is_range_cat()': is_range_cat(word1),
          #  '+1:length':len(word1)
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,label, postag  in sent]

# def sent2tokens(sent):
#     return [token for token, label, postag in sent]

In [18]:
X = [sent2features(s) for s in train]
y = [sent2labels(s) for s in train]


In [19]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='ap',
   all_possible_states=True,
   all_possible_transitions=True,
    max_iterations=1000,
    
   
)

CPU times: user 22 µs, sys: 2 µs, total: 24 µs
Wall time: 34.1 µs


In [20]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [21]:
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=10)



In [23]:
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

                     precision    recall  f1-score   support

            NEWLINE       1.00      1.00      1.00      3223
                  O       0.93      0.95      0.94     18155
          TEST_NAME       0.93      0.89      0.91      6963
         TEST_RANGE       0.96      0.94      0.95      8416
TEST_RANGE_CATEGORY       0.87      0.89      0.88       484
          TEST_UNIT       0.93      0.91      0.92      3255
         TEST_VALUE       0.90      0.91      0.90      4270

           accuracy                           0.94     44766
          macro avg       0.93      0.93      0.93     44766
       weighted avg       0.94      0.94      0.94     44766



In [24]:

crf.fit(X, y)

# crf1.fit(X, y)

# crf2.fit(X, y)

# crf3.fit(X, y)


CRF(algorithm='ap', all_possible_states=True, all_possible_transitions=True,
    averaging=None, c=None, c1=None, c2=None, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=1000,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [33]:
labels = list(crf.classes_)
labels.remove('O')
y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred,
                      average='weighted', labels=labels)

0.9956021800071991

In [33]:
from datetime import datetime
now=datetime.now()

In [34]:
now=now.strftime("%Y-%m-%d::%H:%M:%S")

In [115]:
import pickle
filename = now+'_crf_model.sav'
pickle.dump(crf, open(filename, 'wb'))

In [143]:
import pickle
filename = '2020-05-04_crf_model.sav'
crf=pickle.load(open(filename, 'rb'))