In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
import os
# Work from the project's `main` directory: later cells open files via paths
# relative to it (e.g. '../data/...' and '../model/...').
os.chdir('/content/gdrive/My Drive/finch/python/atis/main')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install python-crfsuite



In [3]:
from sklearn.metrics import f1_score, classification_report

import pycrfsuite
import pprint

In [4]:
def _token_features(token, prefix=''):
  """Build the surface-form features of a single token.

  Args:
    token: the token string to describe.
    prefix: string prepended to every feature name; used to mark the token's
      position relative to the focus word (e.g. '-1:' or '+2:').

  Returns:
    A dict with the lowercased token, its last 4/3/2 characters, its first
    4/3/2 characters and its length, each keyed by `prefix` + feature name.
  """
  return {
    prefix + 'word': token.lower(),
    prefix + 'word[-4:]': token[-4:],
    prefix + 'word[-3:]': token[-3:],
    prefix + 'word[-2:]': token[-2:],
    prefix + 'word[:4]': token[:4],
    prefix + 'word[:3]': token[:3],
    prefix + 'word[:2]': token[:2],
    prefix + 'len(word)': len(token),
  }


def word2features(sent, i):
  """Build the feature dict for the token at position `i` of `sent`.

  The result describes the focus token plus up to two neighbours on each side.
  Neighbour features are prefixed with their offset ('-1:', '+1:', '-2:',
  '+2:'). A missing left neighbour sets '<bos>', a missing right neighbour
  sets '<eos>'.

  Args:
    sent: sequence of token strings.
    i: index of the focus token in `sent`.

  Returns:
    A dict mapping feature names to values (strings, numbers or True).
  """
  n = len(sent)
  features = {'bias': 1.0}
  features.update(_token_features(sent[i]))

  # Immediate left neighbour, or the beginning-of-sentence flag.
  if i > 0:
    features.update(_token_features(sent[i - 1], '-1:'))
  else:
    features['<bos>'] = True

  # Immediate right neighbour, or the end-of-sentence flag.
  if i < n - 1:
    features.update(_token_features(sent[i + 1], '+1:'))
  else:
    features['<eos>'] = True

  # Second-order neighbours are simply omitted when they do not exist.
  if i > 1:
    features.update(_token_features(sent[i - 2], '-2:'))
  if i < n - 2:
    features.update(_token_features(sent[i + 2], '+2:'))
  return features

def sent2features(sent):
  """Return one feature dict per token of `sent`.

  Tokens made up only of digits are replaced by the placeholder '<num>'
  before the features are extracted.
  """
  normalized = []
  for token in sent:
    normalized.append('<num>' if token.isdigit() else token)
  return [word2features(normalized, idx) for idx, _ in enumerate(normalized)]

In [5]:
# Sanity check: pretty-print the features extracted for the first training line.
with open('../data/atis.train.w-intent.iob') as train_file:
  for raw_line in train_file:
    # Each line is "<text>\t<slot labels and intent>".
    text, slot_intent = raw_line.rstrip().split('\t')
    # Drop the first and last tokens of the text field
    # (presumably sentence-boundary markers in the data file; confirm).
    words = text.split()[1:-1]
    pprint.pprint(sent2features(words))
    break

[{'+1:len(word)': 4,
  '+1:word': 'want',
  '+1:word[-2:]': 'nt',
  '+1:word[-3:]': 'ant',
  '+1:word[-4:]': 'want',
  '+1:word[:2]': 'wa',
  '+1:word[:3]': 'wan',
  '+1:word[:4]': 'want',
  '+2:len(word)': 2,
  '+2:word': 'to',
  '+2:word[-2:]': 'to',
  '+2:word[-3:]': 'to',
  '+2:word[-4:]': 'to',
  '+2:word[:2]': 'to',
  '+2:word[:3]': 'to',
  '+2:word[:4]': 'to',
  '<bos>': True,
  'bias': 1.0,
  'len(word)': 1,
  'word': 'i',
  'word[-2:]': 'i',
  'word[-3:]': 'i',
  'word[-4:]': 'i',
  'word[:2]': 'i',
  'word[:3]': 'i',
  'word[:4]': 'i'},
 {'+1:len(word)': 2,
  '+1:word': 'to',
  '+1:word[-2:]': 'to',
  '+1:word[-3:]': 'to',
  '+1:word[-4:]': 'to',
  '+1:word[:2]': 'to',
  '+1:word[:3]': 'to',
  '+1:word[:4]': 'to',
  '+2:len(word)': 3,
  '+2:word': 'fly',
  '+2:word[-2:]': 'ly',
  '+2:word[-3:]': 'fly',
  '+2:word[-4:]': 'fly',
  '+2:word[:2]': 'fl',
  '+2:word[:3]': 'fly',
  '+2:word[:4]': 'fly',
  '-1:len(word)': 1,
  '-1:word': 'i',
  '-1:word[-2:]': 'i',
  '-1:word[-3:]': 

In [6]:
trainer = pycrfsuite.Trainer(verbose=True)

# Feed every training sentence (features + slot labels) to the trainer.
with open('../data/atis.train.w-intent.iob') as train_file:
  for raw_line in train_file:
    text, slot_intent = raw_line.rstrip().split('\t')
    words = text.split()[1:-1]
    labels = slot_intent.split()
    # The last label is the intent; the first is dropped together with the
    # first token of the text so slots stay aligned with `words`.
    slots, intent = labels[1:-1], labels[-1]
    assert len(words) == len(slots)
    trainer.append(sent2features(words), slots)

# Fit the model and write it to disk.
trainer.train('../model/atis.crfsuite')

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 0
0....1....2....3....4....5....6....7....8....9....10
Number of features: 65662
Seconds required: 0.497

L-BFGS optimization
c1: 0.000000
c2: 1.000000
num_memories: 6
max_iterations: 2147483647
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

***** Iteration #1 *****
Loss: 185735.605012
Feature norm: 1.000000
Error norm: 206434.985620
Active features: 65662
Line search trials: 1
Line search step: 0.000003
Seconds required for this iteration: 5.816

***** Iteration #2 *****
Loss: 130906.708244
Feature norm: 0.755916
Error norm: 129421.555535
Active features: 65662
Line search trials: 1
Line search step: 1.000000
Seconds required for this iteration: 3.052

***** Iteration #3 *****
Loss: 113786.546366
Feature norm: 0.725972
Error norm: 56715.193668
Active features: 65662
Line search trials: 2
Line search step: 0.346361
Seconds 

In [7]:
# Load the model file written by the training step.
tagger = pycrfsuite.Tagger()
tagger.open('../model/atis.crfsuite')

# Compare gold and predicted slot labels on the first test sentence only.
with open('../data/atis.test.w-intent.iob') as test_file:
  for raw_line in test_file:
    text, slot_intent = raw_line.rstrip().split('\t')
    words = text.split()[1:-1]
    labels = slot_intent.split()
    slots, intent = labels[1:-1], labels[-1]
    assert len(words) == len(slots)
    print('expected:', slots)
    print('predicted:', tagger.tag(sent2features(words)))
    break

expected: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']
predicted: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-fromloc.city_name', 'O', 'B-toloc.city_name', 'I-toloc.city_name', 'O', 'O', 'O', 'O', 'O', 'B-stoploc.city_name', 'I-stoploc.city_name']


In [8]:
# Flat, token-level lists of gold and predicted slot labels over the test set.
slot_true = []
slot_pred = []

with open('../data/atis.test.w-intent.iob') as f:
  for line in f:
    line = line.rstrip()
    text, slot_intent = line.split('\t')
    words = text.split()[1:-1]
    slot_intent = slot_intent.split()
    slots, intent = slot_intent[1:-1], slot_intent[-1]
    assert len(words) == len(slots)
    slot_pred.extend(tagger.tag(sent2features(words)))
    slot_true.extend(slots)

# Zero-weight every token whose gold label is 'O', so the scores below only
# count tokens that carry a slot label in the reference annotation.
mask = [0 if s == 'O' else 1 for s in slot_true]

# `zero_division=0` keeps the value that was previously substituted for
# undefined precision/recall (0.0) but no longer emits a warning for labels
# with no predicted or no (weighted) true samples.
f1_slots = f1_score(y_true = slot_true,
                    y_pred = slot_pred,
                    sample_weight = mask,
                    average = 'micro',
                    zero_division = 0,)

print('\n'+classification_report(y_true = slot_true,
                                 y_pred = slot_pred,
                                 sample_weight = mask,
                                 digits = 3,
                                 zero_division = 0))

print('micro avg: {:.3f}'.format(f1_slots))


                              precision    recall  f1-score   support

             B-aircraft_code      1.000     0.455     0.625      33.0
              B-airline_code      0.917     0.971     0.943      34.0
              B-airline_name      1.000     0.960     0.980     101.0
              B-airport_code      0.800     0.444     0.571       9.0
              B-airport_name      0.750     0.286     0.414      21.0
 B-arrive_date.date_relative      1.000     0.500     0.667       2.0
      B-arrive_date.day_name      1.000     0.364     0.533      11.0
    B-arrive_date.day_number      1.000     0.167     0.286       6.0
    B-arrive_date.month_name      1.000     0.167     0.286       6.0
      B-arrive_time.end_time      0.700     0.875     0.778       8.0
 B-arrive_time.period_of_day      0.000     0.000     0.000       6.0
    B-arrive_time.start_time      0.667     0.250     0.364       8.0
          B-arrive_time.time      0.875     0.824     0.848      34.0
 B-arrive_time.tim

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
