In [1]:
pip install sklearn-crfsuite

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [2]:
import sklearn_crfsuite
from sklearn.model_selection import train_test_split
import os
from sklearn_crfsuite import metrics

In [3]:
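# Feature template (informal sketch of the per-token feature dict)
#features = {
#    'prefix': word[0:3],
#    'suffix': word[-3:],
#    'begin_upper': word[0].isupper(),
#    'lower_case': word.lower(),
#    'prev_word': word_prev,
#    'prev_tag': tag_prev,   # listed in the sketch only; wordToFeatures does not emit it
#    'next_word': word_next,
#    'Begin': <True if the word is the first of its sentence>,
#    'End': <True if the word is the last of its sentence>,
#    'word_form': <shape string: digit -> 'd', uppercase -> 'X', '-' kept, anything else -> 'x'>
#}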

def wordToFeatures(curr_word, prev_word = None, next_word = None):
  """Convert a word into its feature dictionary.

  Args:
    curr_word: the token to describe (string; may be empty).
    prev_word: the preceding token, or None when curr_word opens the sentence.
    next_word: the following token, or None when curr_word closes the sentence.

  Returns:
    A dict with the keys
      'begin_upper' - True when the first character is uppercase
                      (False for an empty token),
      'lower_case'  - the lowercased token,
      'word_form'   - shape string: digit -> 'd', uppercase -> 'X',
                      '-' kept as is, any other character -> 'x',
      'prefix' / 'suffix' - first / last three characters, only present
                      for tokens longer than two characters,
      'prev_word' / 'next_word' - only present when the neighbour is given,
      'Begin' / 'End' - True when the corresponding neighbour is None.
  """
  word_form = ''.join(
      'd' if symbol.isdigit()
      else 'X' if symbol.isupper()
      else '-' if symbol == '-'
      else 'x'
      for symbol in curr_word
  )

  features = {
    # slicing instead of indexing: an empty token yields False
    # rather than raising IndexError
    'begin_upper': curr_word[:1].isupper(),
    'lower_case': curr_word.lower(),
    'word_form': word_form
  }

  # a missing feature is associated with weight 0
  if len(curr_word) > 2:
    features['prefix'] = curr_word[0:3]
    features['suffix'] = curr_word[-3:]

  has_prev = prev_word is not None
  if has_prev:
    features['prev_word'] = prev_word
  features['Begin'] = not has_prev

  has_next = next_word is not None
  if has_next:
    features['next_word'] = next_word
  features['End'] = not has_next

  return features

In [4]:
def read_file(file_name):
  """Read a tagged corpus file stored in the 'processed_files' directory.

  The file is read line by line: a line equal to 'START' opens a sentence,
  a line equal to 'END' closes it, and every other non-blank line is split
  on a tab into a word and its label.

  Args:
    file_name: name of the file inside the 'processed_files' directory.

  Returns:
    A (words, labels) tuple. words is one flat list holding every word
    together with the 'START' / 'END' marker lines; labels holds one list
    of labels per sentence closed by 'END'.

  Raises:
    ValueError: when a token line does not split into exactly two
      tab-separated fields.
  """
  file_path = os.path.join('processed_files', file_name)
  words = []
  labels = []

  # explicit encoding: the corpus contains non-ASCII tokens, and the
  # platform default encoding is not guaranteed to be UTF-8
  with open(file_path, encoding='utf-8') as file:
    for line in file:
      line = line.rstrip('\n')
      if not line.strip():
        # blank line: no token to read, and it would break the unpacking below
        continue

      if line == 'START':
        words.append('START')
        sentence_labels = []
      elif line == 'END':
        words.append('END')
        labels.append(sentence_labels)
      else:
        word, label = line.split('\t')
        words.append(word)
        sentence_labels.append(label)

  return words, labels

In [6]:
# corpus splits; read_file resolves the names inside 'processed_files'
train_file = 'it_old-ud-train.txt'
test_file = 'it_old-ud-test.txt'

train_words, train_labels = read_file(train_file)
test_words, test_labels = read_file(test_file)

In [7]:
def create_embedding(words):
  """Build the per-sentence feature dictionaries for a flat word list.

  Args:
    words: flat list of tokens in which the strings 'START' and 'END'
      mark sentence boundaries.

  Returns:
    A list with one entry per 'END' marker; each entry is the list of
    wordToFeatures(...) results for the words collected since the previous
    'END'. Words after the last 'END' are not returned.
  """
  markers = ('START', 'END')
  embeddings = []
  sent_embeddings = []
  last_index = len(words) - 1

  for i, curr_word in enumerate(words):
    if curr_word == 'END':
      # sentence finished: store it and start collecting the next one
      embeddings.append(sent_embeddings)
      sent_embeddings = []
      continue
    if curr_word == 'START':
      continue

    # neighbours are derived from scratch for every token, so nothing is
    # carried over from an earlier iteration; a neighbouring marker
    # (either one) means there is no context word on that side
    prev_word = words[i - 1] if i > 0 else None
    if prev_word in markers:
      prev_word = None

    next_word = words[i + 1] if i < last_index else None
    if next_word in markers:
      next_word = None

    sent_embeddings.append(wordToFeatures(curr_word, prev_word, next_word))

  return embeddings

In [8]:
# per-sentence feature dictionaries for both splits
train_embeddings = create_embedding(train_words)
test_embeddings = create_embedding(test_words)

In [9]:
# preview the first sentence only; printing every sentence floods the output
test_embeddings[:1]

[[{'begin_upper': True, 'lower_case': 'non', 'word_form': 'Xxx', 'prefix': 'Non', 'suffix': 'Non', 'Begin': True, 'next_word': 'ci', 'End': False}, {'begin_upper': False, 'lower_case': 'ci', 'word_form': 'xx', 'prev_word': 'Non', 'Begin': False, 'next_word': 'fare', 'End': False}, {'begin_upper': False, 'lower_case': 'fare', 'word_form': 'xxxx', 'prefix': 'far', 'suffix': 'are', 'prev_word': 'ci', 'Begin': False, 'next_word': 'ire', 'End': False}, {'begin_upper': False, 'lower_case': 'ire', 'word_form': 'xxx', 'prefix': 'ire', 'suffix': 'ire', 'prev_word': 'fare', 'Begin': False, 'next_word': 'a', 'End': False}, {'begin_upper': False, 'lower_case': 'a', 'word_form': 'x', 'prev_word': 'ire', 'Begin': False, 'next_word': 'Tizio', 'End': False}, {'begin_upper': True, 'lower_case': 'tizio', 'word_form': 'Xxxxx', 'prefix': 'Tiz', 'suffix': 'zio', 'prev_word': 'a', 'Begin': False, 'next_word': 'né', 'End': False}, {'begin_upper': False, 'lower_case': 'né', 'word_form': 'xx', 'prev_word': 'Ti

In [10]:
# instantiating and fitting the model
crf_model = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    # L1 (lasso-style) regularization coefficient:
    # higher values regularize more strongly, which can help
    # generalization but raises the risk of underfitting
    c1 = 0.5,
    # L2 (ridge-style) regularization coefficient:
    # higher values regularize more strongly
    c2 = 0.3,
    max_iterations=100,
    # all_possible_states: also generate state features for
    # attribute/label pairs that never occur in the training data
    # (unseen *transitions* are controlled by the separate
    # all_possible_transitions option, which is not set here)
    all_possible_states=True
)
crf_model.fit(train_embeddings, train_labels)

In [11]:
# reuse the test features computed earlier instead of rebuilding them
predictions = crf_model.predict(test_embeddings)

In [12]:
# the report is a multi-line string: print it so the table is rendered
# instead of showing the raw repr with literal '\n'
print(metrics.flat_classification_report(test_labels, predictions))

'              precision    recall  f1-score   support\n\n         ADJ       0.84      0.79      0.81       494\n         ADP       0.99      0.99      0.99      1243\n         ADV       0.93      0.92      0.92      1039\n         AUX       0.93      0.94      0.94       355\n       CCONJ       0.99      0.99      0.99       516\n         DET       0.96      0.96      0.96      1556\n        INTJ       0.88      0.54      0.67        13\n        NOUN       0.93      0.93      0.93      1654\n         NUM       0.98      0.98      0.98        50\n        PART       0.55      0.86      0.67         7\n        PRON       0.94      0.94      0.94      1359\n       PROPN       0.93      0.97      0.95       175\n       PUNCT       1.00      1.00      1.00      1700\n       SCONJ       0.89      0.91      0.90       321\n        VERB       0.92      0.94      0.93      1629\n           X       0.88      0.24      0.38        29\n\n    accuracy                           0.95     12140\n   ma