### Import Functions

In [1]:
from ipynb.fs.full.utilities import *

### Data Prep

In [2]:
# Read the pre-split features (X) and labels (y) from JSON: the Train_60 files
# are used for fitting below, the Dev_20 files for evaluation.
X_train_60, y_train_60 = map(
    load_json,
    ('../../data/Train_60_X.json', '../../data/Train_60_y.json'),
)
X_dev, y_dev = map(
    load_json,
    ('../../data/Dev_20_X.json', '../../data/Dev_20_y.json'),
)

In [3]:
# Run the project's verify_prepped_data helper (imported from utilities) on the
# training split. NOTE(review): presumably checks that features and labels
# line up -- confirm against the helper's definition.
verify_prepped_data(X_train_60, y_train_60)

In [4]:
# Run the project's verify_prepped_data helper (imported from utilities) on the
# dev split. NOTE(review): presumably checks that features and labels
# line up -- confirm against the helper's definition.
verify_prepped_data(X_dev, y_dev)

### CRF Model - Use default epsilon and delta

In [5]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
    linesearch='MoreThuente',
    num_memories=8,
    period=12 
)
crf.fit(X_train_60, y_train_60)

CPU times: user 20min 58s, sys: 967 ms, total: 20min 59s
Wall time: 20min 57s


In [6]:
# Evaluate the fitted CRF on the dev split.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)

# Weighted F1 and per-label report over every label the model knows,
# including the empty-string label (presumably "no punctuation" -- it is by far
# the largest class in the recorded report).
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Same metrics restricted to the non-empty labels. Filtering into a new list
# (instead of labels.remove('')) does not raise ValueError when '' is absent
# from crf.classes_, and leaves `labels` in the same final state as before.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Re-insert the predicted labels into the dev input and show a short sample.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

0.9455972531293116




              precision    recall  f1-score   support

                  0.965     0.988     0.976    227825
           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000    



              precision    recall  f1-score   support

           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000     0.022     0.043        45
           ]      1.000    

### CRF Model - Use another algorithm for linesearch

In [7]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
    linesearch='Backtracking',
    num_memories=8,
    period=12 
)
crf.fit(X_train_60, y_train_60)

CPU times: user 19min 46s, sys: 592 ms, total: 19min 46s
Wall time: 19min 45s


In [8]:
# Evaluate the fitted CRF on the dev split.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)

# Weighted F1 and per-label report over every label the model knows,
# including the empty-string label (presumably "no punctuation" -- it is by far
# the largest class in the recorded report).
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Same metrics restricted to the non-empty labels. Filtering into a new list
# (instead of labels.remove('')) does not raise ValueError when '' is absent
# from crf.classes_, and leaves `labels` in the same final state as before.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Re-insert the predicted labels into the dev input and show a short sample.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

0.9455972531293116




              precision    recall  f1-score   support

                  0.965     0.988     0.976    227825
           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000    



              precision    recall  f1-score   support

           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000     0.022     0.043        45
           ]      1.000    

In [9]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=200,
    all_possible_transitions=True,
    linesearch='StrongBacktracking',
    num_memories=8,
    period=12 
)
crf.fit(X_train_60, y_train_60)

CPU times: user 20min 28s, sys: 696 ms, total: 20min 29s
Wall time: 20min 27s


In [10]:
# Evaluate the fitted CRF on the dev split.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)

# Weighted F1 and per-label report over every label the model knows,
# including the empty-string label (presumably "no punctuation" -- it is by far
# the largest class in the recorded report).
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Same metrics restricted to the non-empty labels. Filtering into a new list
# (instead of labels.remove('')) does not raise ValueError when '' is absent
# from crf.classes_, and leaves `labels` in the same final state as before.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Re-insert the predicted labels into the dev input and show a short sample.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

0.9455972531293116




              precision    recall  f1-score   support

                  0.965     0.988     0.976    227825
           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000    



              precision    recall  f1-score   support

           :      0.905     0.784     0.840       935
           !      0.552     0.256     0.350       371
           ,      0.802     0.681     0.737     18651
           ?      0.867     0.765     0.813       451
           ;      0.262     0.076     0.118       628
           .      0.944     0.986     0.965     11435
           -      0.525     0.135     0.215       939
           "      0.667     0.236     0.349      1598
          ).      0.958     0.934     0.946       271
           (      0.970     0.904     0.935      1690
           )      0.897     0.868     0.882      1003
          :"      0.821     0.338     0.479        68
          ".      0.609     0.130     0.214       300
          ",      0.734     0.592     0.655       848
          ),      0.833     0.699     0.760       385
         ...      0.500     0.043     0.079        70
           [      1.000     0.022     0.043        45
           ]      1.000    

### CRF Model - Use more iterations

In [11]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=300,
    all_possible_transitions=True,
    linesearch='MoreThuente',
    num_memories=8,
    period=12 
)
crf.fit(X_train_60, y_train_60)

CPU times: user 29min 39s, sys: 885 ms, total: 29min 40s
Wall time: 29min 38s


In [12]:
# Evaluate the fitted CRF on the dev split.
labels = list(crf.classes_)
y_pred = crf.predict(X_dev)

# Weighted F1 and per-label report over every label the model knows,
# including the empty-string label (presumably "no punctuation" -- it is by far
# the largest class in the recorded report).
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Same metrics restricted to the non-empty labels. Filtering into a new list
# (instead of labels.remove('')) does not raise ValueError when '' is absent
# from crf.classes_, and leaves `labels` in the same final state as before.
labels = [label for label in labels if label != '']
print(metrics.flat_f1_score(y_dev, y_pred, average='weighted', labels=labels))
print(metrics.flat_classification_report(y_dev, y_pred, labels=labels, digits=3))

# Re-insert the predicted labels into the dev input and show a short sample.
punctuated_sentences = punctuate(X_dev, y_pred)
print(punctuated_sentences[:100])

0.9455280733690878




              precision    recall  f1-score   support

                  0.965     0.988     0.976    227825
           :      0.903     0.780     0.837       935
           !      0.562     0.256     0.352       371
           ,      0.801     0.681     0.736     18651
           ?      0.859     0.769     0.812       451
           ;      0.271     0.083     0.127       628
           .      0.944     0.987     0.965     11435
           -      0.514     0.133     0.212       939
           "      0.663     0.235     0.347      1598
          ).      0.958     0.934     0.946       271
           (      0.968     0.906     0.936      1690
           )      0.895     0.864     0.879      1003
          :"      0.767     0.338     0.469        68
          ".      0.635     0.133     0.220       300
          ",      0.734     0.593     0.656       848
          ),      0.811     0.704     0.754       385
         ...      0.400     0.029     0.053        70
           [      1.000    



              precision    recall  f1-score   support

           :      0.903     0.780     0.837       935
           !      0.562     0.256     0.352       371
           ,      0.801     0.681     0.736     18651
           ?      0.859     0.769     0.812       451
           ;      0.271     0.083     0.127       628
           .      0.944     0.987     0.965     11435
           -      0.514     0.133     0.212       939
           "      0.663     0.235     0.347      1598
          ).      0.958     0.934     0.946       271
           (      0.968     0.906     0.936      1690
           )      0.895     0.864     0.879      1003
          :"      0.767     0.338     0.469        68
          ".      0.635     0.133     0.220       300
          ",      0.734     0.593     0.656       848
          ),      0.811     0.704     0.754       385
         ...      0.400     0.029     0.053        70
           [      1.000     0.022     0.043        45
           ]      1.000    