In [6]:
import nltk
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
import pprint, time

import sklearn
import sklearn_crfsuite
import scipy.stats
import math, string, re
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics
from itertools import chain
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn_crfsuite import CRF
from nltk.tokenize import word_tokenize
from nltk.tag.util import untag

import ast
from ast import literal_eval

import jieba 
from hanziconv import HanziConv

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import Tokenizer
from operator import add

In [7]:
# read in the seniors previous training data 
df = pd.read_csv("seniors_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6318 entries, 0 to 6317
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tagged  6318 non-null   object
dtypes: object(1)
memory usage: 49.5+ KB


In [8]:
df.head()

Unnamed: 0,tagged
0,"[('In', 'ADP'), ('a', 'DET'), ('webinar', 'NOU..."
1,"[('I', 'PRON'), ('think', 'VERB'), ('that', 'D..."
2,"[('Goooood', 'X'), (',', 'PUNCT'), ('very', 'A..."
3,"[('!', 'PUNCT'), ('nice', 'ADJ'), ('to', 'ADP'..."
4,"[('item', 'NOUN'), ('with', 'ADP'), ('nice', '..."


In [64]:
def convert_string2_list(text):
    return ast.literal_eval(str(text))

In [65]:
def features(sentence, index):
    # """ sentence: [w1, w2, ...], index: the index of the word """
    
    feature_set =  {
        'word': sentence[index],
        'is_first': index == 0,
        'is_last': index == len(sentence) - 1,
        'is_capitalized': sentence[index][0].upper() == sentence[index][0],
        'is_all_caps': sentence[index].upper() == sentence[index],
        'is_all_lower': sentence[index].lower() == sentence[index],
        'prefix-1': sentence[index][0],
        'prefix-2': sentence[index][:2],
        'prefix-3': sentence[index][:3],
        'prefix-4': sentence[index][:4],
        'prefix-5': sentence[index][:5],
        'suffix-1': sentence[index][-1],
        'suffix-2': sentence[index][-2:],
        'suffix-3': sentence[index][-3:],
        'suffix-4': sentence[index][-4:],
        'suffix-5': sentence[index][-5:],
        'prev_word': '' if index == 0 else sentence[index - 1],
        'next_word': '' if index == len(sentence) - 1 else sentence[index + 1],
        'prev2_word': '' if index == 0 else sentence[index - 2],
        'next2_word': '' if index == len(sentence) - 2 or index == len(sentence) - 1  else sentence[index + 2],
        'prev3_word': '' if index == 0 else sentence[index - 3],
        'next3_word': '' if index == len(sentence) - 2 or index == len(sentence) - 1  or index == len(sentence) - 3  else sentence[index + 3],
        'has_hyphen': '-' in sentence[index],
        'is_numeric': sentence[index].isdigit(),
        'capitals_inside': sentence[index][1:].lower() != sentence[index][1:],
        'natural_number': (re.findall(r'^[0-9]+', sentence[index])),
        'initcaps' : (re.findall(r'^[A-Z]\w+', sentence[index])),
        'initcapsalpha': (re.findall(r'^[A-Z][a-z]\w+', sentence[index])),
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', sentence[index].lower()),
        'word.ispunctuation': (sentence[index] in string.punctuation)
    }
    
    if index <= 0:
        feature_set['BOS'] = True
    
    if index > len(sentence)-1:
        feature_set['EOS'] = True
        
    return feature_set

In [66]:
def transform_to_dataset(tagged_sentences):
    X, y = [], []
 
    for tagged in tagged_sentences:
        X.append([features(untag(tagged), index) for index in range(len(tagged))])
        y.append([tag for _, tag in tagged])
 
    return X, y

In [67]:
def pos_tag(sentence, model):
    sentence = sentence_splitter(sentence)
    sentence_features = [features(sentence, index) for index in range(len(sentence))]
    return list(zip(sentence, model.predict([sentence_features])[0]))

In [68]:
def sentence_splitter(sentence):
    result = []
    sents = word_tokenize(sentence)
    for s in sents:
        if re.findall(r'[\u4e00-\u9fff]+', s):
            s = HanziConv.toSimplified(s)
            result = result + list(jieba.cut(s, cut_all=False))
        else:
            result.append(s)
    return result

In [74]:
# Split the dataset for training and testing
data = df.tagged.apply(convert_string2_list)
data = data.to_list()

In [75]:
cutoff = int(.80 * len(data))
training_sentences = data[:cutoff]
test_sentences = data[cutoff:]

In [76]:
X_train, y_train = transform_to_dataset(training_sentences)
X_test, y_test = transform_to_dataset(test_sentences)

In [77]:
print(len(X_train))     
print(len(X_test))         

5054
1264


In [79]:
CRF_model_lbfgs = sklearn_crfsuite.CRF(
    algorithm = 'lbfgs',
    max_iterations = 100,
    all_possible_transitions=True,
    c1 = 0.25,
    c2 = 0.35
)


In [80]:
CRF_model_lbfgs.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0.25, c2=0.35, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [90]:
len(ypred)

5054

In [102]:
y_pred = CRF_model_lbfgs.predict(X_test)
print('F1 score on the test set = {}\n'.format(metrics.flat_f1_score(y_test, y_pred, labels=labels, average='weighted')))
print('Accuracy on the test set = {}\n'.format(metrics.flat_accuracy_score(y_test, y_pred)))
print('Precision on the test set = {}\n'.format(metrics.flat_precision_score(y_test, y_pred, labels=labels, average='weighted')))
print('Recall on the test set = {}\n'.format(metrics.flat_recall_score(y_test, y_pred,labels=labels, average='weighted')))


sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print('Test set classification report: \n\n{}'.format(metrics.flat_classification_report(
y_test, y_pred, labels=sorted_labels, digits=3
)))

F1 score on the test set = 0.969186531240904

Accuracy on the test set = 0.9691030739127691

Precision on the test set = 0.969311564152366

Recall on the test set = 0.9691030739127691

Test set classification report: 

              precision    recall  f1-score   support

           X      0.998     1.000     0.999      2127
        PART      0.997     1.000     0.998       991
       CCONJ      0.999     0.994     0.996       683
       SCONJ      0.000     0.000     0.000         0
         ADJ      0.868     0.898     0.883      1988
         ADP      0.980     0.981     0.981      3153
         ADV      0.881     0.871     0.876       793
        VERB      0.968     0.962     0.965      4206
         DET      0.992     0.995     0.993      2837
        INTJ      0.000     0.000     0.000         0
        NOUN      0.954     0.950     0.952      6313
        PRON      0.996     1.000     0.998       479
       PROPN      0.969     0.962     0.965      2918
         NUM      0.999 

  _warn_prf(average, modifier, msg_start, len(result))


In [81]:
labels = list(CRF_model_lbfgs.classes_)
# labels.remove('O')
print(labels)
params_space = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
rs = RandomizedSearchCV(CRF_model_lbfgs, params_space,
                        cv=3,
                        verbose=1,
                        n_jobs=-1,
                        n_iter=50,
                        scoring=f1_scorer)
rs.fit(X_train, y_train)

['ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT', 'PROPN', 'PART', 'VERB', 'ADV', 'CCONJ', 'AUX', 'PRON', 'X', 'INTJ', 'SCONJ', 'NUM', 'SYM']
Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  6.5min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=True, averaging=None,
                                 c=None, c1=0.25, c2=0.35,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_...
                   param_distributions={'c1': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f783b3bf1c0>,
                                        'c2': <scipy.stats._distn_infrastructure.rv_continuous_frozen object at 0x7f783708fd00>},
                   pre_dispatch='2*n_jobs', random_st

In [83]:
from sklearn.model_selection import GridSearchCV
params_space = {
    "c1": [0,0.05,0.1, 0.25,0.5,1],
    "c2": [0,0.05,0.1, 0.25,0.5,1]
}

# use the same metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score,
                        average='weighted', labels=labels)

# search
grid_search = GridSearchCV(estimator=CRF_model_lbfgs,
                           param_grid=params_space,
                           cv=3,
                           n_jobs=-1, verbose=1,scoring=f1_scorer)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.7min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                           all_possible_transitions=True, averaging=None,
                           c=None, c1=0.25, c2=0.35,
                           calibration_candidates=None, calibration_eta=None,
                           calibration_max_trials=None, calibration_rate=None,
                           calibration_samples=None, delta=None, epsilon=None,
                           error_sensitive=None, gamma=None,
                           keep_tempfi...
                           trainer_cls=None, variance=None, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'c1': [0, 0.05, 0.1, 0.25, 0.5, 1],
                         'c2': [0, 0.05, 0.1, 0.25, 0.5, 1]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=make_scorer(flat_f1_score, average=weighted, labels=['ADP', 'DET', 'NOUN', 'ADJ', 'PUNCT

In [84]:
sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)

In [85]:
y_pred = grid_search.predict(X_test)
print("CRF_model_lbfgs : ", round(metrics.flat_accuracy_score(y_test, y_pred)*100, 4))
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

CRF_model_lbfgs :  96.9008
              precision    recall  f1-score   support

           X      0.997     1.000     0.998      2127
        PART      0.996     1.000     0.998       991
       CCONJ      0.999     0.996     0.997       683
       SCONJ      0.000     0.000     0.000         0
         ADJ      0.868     0.897     0.883      1988
         ADP      0.982     0.982     0.982      3153
         ADV      0.866     0.878     0.872       793
        VERB      0.967     0.962     0.965      4206
         DET      0.991     0.994     0.992      2837
        INTJ      0.000     0.000     0.000         0
        NOUN      0.955     0.949     0.952      6313
        PRON      0.994     1.000     0.997       479
       PROPN      0.972     0.961     0.966      2918
         NUM      0.999     0.994     0.997      1599
       PUNCT      1.000     1.000     1.000      3273
         AUX      0.000     0.000     0.000         0
         SYM      1.000     1.000     1.000       326


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
from sklearn_crfsuite import metrics

y_pred_lbfgs = CRF_model_lbfgs.predict(X_test)
print("CRF_model_lbfgs : ", round(metrics.flat_accuracy_score(y_test, y_pred_lbfgs)*100, 4))
print("CRF_model_lbfgs    : \n", metrics.flat_classification_report(y_test, y_pred_lbfgs), "\n")

# y_pred_l2sgd = CRF_model_l2sgd.predict(X_test)
# print("CRF_model_l2sgd : ", round(metrics.flat_accuracy_score(y_test, y_pred_l2sgd)*100, 4))
# print("CRF_model_l2sgd    : \n", metrics.flat_classification_report(y_test, y_pred_l2sgd), "\n")

# y_pred_arow = CRF_model_arow.predict(X_test)
# print("CRF_model_arow  : ", round(metrics.flat_accuracy_score(y_test, y_pred_arow)*100, 4))
# print("CRF_model_arow    : \n", metrics.flat_classification_report(y_test, y_pred_arow), "\n")

# y_pred_pa = CRF_model_pa.predict(X_test)
# print("CRF_model_pa    : ", round(metrics.flat_accuracy_score(y_test, y_pred_pa)*100, 4))
# print("CRF_model_pa    : \n", metrics.flat_classification_report(y_test, y_pred_pa), "\n")

# y_pred_ap = CRF_model_ap.predict(X_test)
# print("CRF_model_ap    : ", round(metrics.flat_accuracy_score(y_test, y_pred_ap)*100, 4))
# print("CRF_model_ap    : \n", metrics.flat_classification_report(y_test, y_pred_ap), "\n")

# print(metrics.sequence_accuracy_score(y_test, y_pred_lbfgs)*100)
# https://sklearn-crfsuite.readthedocs.io/en/latest/_modules/sklearn_crfsuite/metrics.html
# CRF_model_lbfgs :  97.664
# CRF_model_l2sgd :  97.3046
# CRF_model_arow  :  99.7604
# CRF_model_pa    :  99.7904
# CRF_model_ap    :  99.8203

CRF_model_lbfgs :  96.9103
CRF_model_lbfgs    : 
               precision    recall  f1-score   support

         ADJ       0.87      0.90      0.88      1988
         ADP       0.98      0.98      0.98      3153
         ADV       0.88      0.87      0.88       793
         AUX       0.00      0.00      0.00         0
       CCONJ       1.00      0.99      1.00       683
         DET       0.99      1.00      0.99      2837
        NOUN       0.95      0.95      0.95      6313
         NUM       1.00      0.99      1.00      1599
        PART       1.00      1.00      1.00       991
        PRON       1.00      1.00      1.00       479
       PROPN       0.97      0.96      0.97      2918
       PUNCT       1.00      1.00      1.00      3273
         SYM       1.00      1.00      1.00       326
        VERB       0.97      0.96      0.97      4206
           X       1.00      1.00      1.00      2127

    accuracy                           0.97     31686
   macro avg       0.91      0