### Import packages, Read data

In [2]:
import re

import spacy
from nltk.corpus import stopwords
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report, flat_f1_score
from tqdm import tqdm

In [3]:
def read_file(f):
    data = open(f,'r').readlines()[1:]
    row_id = [i.split('\t')[0].strip() for i in data]
    data = [i.split('\t')[1].strip().split(' ') for i in data]
    return row_id,data

In [4]:
row_id_text, texts = read_file('./review_data/REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./review_data/REVIEW_LABELSEQ.txt')

### Model 1 (baseline)

In [5]:
def word2features(word, i, sent_len):
    
    features = {
        'word.lower()': word.lower(),  # 
        'word.isdigit()': word.isdigit()
    } 
        
    return features

def text2features(text):
    return [word2features(txt, i, len(text)) for i, txt in enumerate(text)]

In [6]:
Xb = [text2features(text) for text in texts]
yb = tags
X_train, X_validation, y_train, y_validation = train_test_split(Xb, yb, test_size = 0.2)

In [7]:
labels = ['B-AE','B-SSI','I-AE', 'I-SSI', 'O']
crf = CRF()
crf.fit(X_train, y_train) # train step 
y_pred = crf.predict(X_validation) # inference step
report = flat_classification_report(y_validation, y_pred, labels=  labels)
f1_score = flat_f1_score(y_validation, y_pred, average = 'macro', labels = labels)
print(f"Model 1\nF1 (macro): {f1_score:.3f}\nClassification report:\n{report}")

Model 1
F1 (macro): 0.519
Classification report:
              precision    recall  f1-score   support

        B-AE       0.73      0.41      0.52       807
       B-SSI       0.78      0.55      0.65       149
        I-AE       0.59      0.36      0.45      1485
       I-SSI       0.10      0.04      0.06        79
           O       0.89      0.96      0.92     12055

    accuracy                           0.86     14575
   macro avg       0.62      0.46      0.52     14575
weighted avg       0.84      0.86      0.85     14575





### Model 2 (added features)

In [43]:
nlp = spacy.load("en_core_web_sm")

def word2features(word, i, sent_len):
    # get POS
    doc = nlp(word)
    
    features = {
        'word.lower()': word.lower(),  # 
        'word.isdigit()': word.isdigit(),
        'word[0:3]': word[0:3],
        'word[-2:]': word[-2:],
        'word[-3:]': word[-3:],
        'word[-4:]': word[-4:],
        'word.stemmed': re.sub(r'(.{2,}?)([aeiougyn]+$)',r'\1', word.lower()),
        'pos': [token.pos_ for token in doc][0],
        "is.stopword": (word in stopwords.words())
    } 

    if i == 0: 
        features['BOS'] = True
    if i == sent_len-1:
        features['EOS'] = True
        
    return features

def text2features(text):
    return [word2features(txt, i, len(text)) for i, txt in enumerate(text)]

In [26]:
X = [text2features(text) for text in texts]
y = tags

X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.2)

In [60]:
labels = ['B-AE','B-SSI','I-AE', 'I-SSI', 'O']
crf = CRF()
crf.fit(X_train, y_train) # train step 
y_pred = crf.predict(X_validation) # inference step
report = flat_classification_report(y_validation, y_pred, labels=  labels)
f1_score = flat_f1_score(y_validation, y_pred, average = 'macro', labels = labels)
print(f"Model 2\nF1 (macro): {f1_score:.3f}\nClassification report:\n{report}")

Model 2
F1 (macro): 0.580
Classification report:
              precision    recall  f1-score   support

        B-AE       0.70      0.60      0.64       705
       B-SSI       0.71      0.54      0.61       157
        I-AE       0.58      0.55      0.57      1299
       I-SSI       0.32      0.09      0.14       100
           O       0.93      0.95      0.94     11926

    accuracy                           0.89     14187
   macro avg       0.65      0.55      0.58     14187
weighted avg       0.88      0.89      0.88     14187





### Model 3 (Hyperparameter tuning)

In [None]:
# param_grid
param_grid = {
    'algorithm': ['lbfgs'],
    'c1': [0, 0.5, 1],
    'c2': [0, 0.5, 1],
    'max_iterations': [50, 100, 200],
    'delta': [1e-5, 1e-3, 0.01],
    'epsilon': [1e-5, 1e-3, 0.01]
}

gcv = GridSearchCV(estimator = CRF(), param_grid = param_grid, n_jobs = -1, verbose = 3)

gcv.fit(X_train, y_train) 
######### output deleted #########

In [30]:
print(gcv.best_params_)

{'algorithm': 'lbfgs', 'c1': 0.5, 'c2': 0, 'delta': 1e-05, 'epsilon': 1e-05, 'max_iterations': 50}


In [61]:
from sklearn_crfsuite.metrics import flat_f1_score

crf = CRF(algorithm = 'lbfgs', c1 = 0.5, c2 = 0, delta = 1e-5, epsilon = 1e-5, max_iterations = 50)
crf.fit(X_train, y_train) # train step 
y_pred = crf.predict(X_validation) # inference step
report = flat_classification_report(y_validation, y_pred, labels=  labels)
f1_score = flat_f1_score(y_validation, y_pred, average = 'macro', labels = labels)
print(f"Model 3\nF1 (macro): {f1_score:.3f}\nClassification report:\n{report}")

Model 3
F1 (macro): 0.591
Classification report:
              precision    recall  f1-score   support

        B-AE       0.70      0.61      0.65       705
       B-SSI       0.73      0.54      0.62       157
        I-AE       0.62      0.54      0.57      1299
       I-SSI       0.33      0.11      0.17       100
           O       0.93      0.96      0.94     11926

    accuracy                           0.89     14187
   macro avg       0.66      0.55      0.59     14187
weighted avg       0.88      0.89      0.88     14187





In [9]:
import sklearn

sklearn.__file__


'/Users/randy/.conda/envs/numpy_m1_build/lib/python3.10/site-packages/sklearn/__init__.py'