In [60]:
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
import pandas as pd
import numpy as np

df = pd.read_csv('data/direction_phrases_dataset.txt', names=['phrase', 'move'])
nlp = spacy.load("ja_core_news_sm")

df

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte

In [2]:
phrase = '交差点を渡ります。そして、真っ直ぐ行って、右に曲がります。'
# parsed_phrase = nlp(df.phrase[14])
parsed_phrase = nlp(phrase)

tokenized_text = pd.DataFrame()

sentence_spans = list(parsed_phrase.sents)

phrases = pd.DataFrame()
phrase = ''
for i, token in enumerate(parsed_phrase):
#     tokenized_text.loc[i, 'text'] = token.text
    phrase += token.text
    if (token.pos_ == 'PUNCT'):
        phrases.loc[len(phrases), 'phrase'] = phrase
        phrase = ''
    
print(phrases)
print(df.shape)
# tokenized_text

      phrase
0  交差点を渡ります。
1       そして、
2   真っ直ぐ行って、
3   右に曲がります。
(33, 2)


In [51]:
df['tuples'] = df.apply(
    lambda row: (row['phrase'],row['move']), axis=1)
train = df['tuples'].tolist()

def func(category):
    cats = {'forward': False, 'behind': False, 'turn_left': False, 'turn_right': False}
    cats[category] = True
    return cats

def load_data(limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    print(labels)
    print(labels[0])
    cats = [func(labels[y]) for y in range(len(labels))]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    correct = 0
    total = len(texts)
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        scores = list(doc.cats.values())
        max_index = scores.index(max(scores))
        truth_values = list(gold.values())
        true_index = truth_values.index(True)
        if max_index == true_index:
            correct += 1
    accuracy = correct / total

    return {'textcat_a': accuracy}

n_texts=len(df)

n_iter=15

In [52]:
if 'textcat' not in nlp.pipe_names:
#     textcat = nlp.create_pipe('textcat')
    nlp.add_pipe('textcat', last=True)
# otherwise, get it, so we can add labels to it
else:
    textcat = nlp.get_pipe('textcat')

# add label to text classifier
textcat.add_label('forward')
textcat.add_label('behind')
textcat.add_label('turn_right')
textcat.add_label('turn_left')

# load the dataset
print("Loading Japanese phrases data...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
print("Using {} examples ({} training, {} evaluation)"
      .format(n_texts, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts,
                      [{'cats': cats} for cats in train_cats]))

# train_data
print(textcat.labels)

Loading Japanese phrases data...
('turn_right', 'forward', 'turn_left', 'forward', 'forward', 'forward', 'turn_right', 'forward', 'forward', 'forward', 'turn_left', 'turn_left', 'behind', 'forward', 'behind', 'forward', 'forward', 'forward', 'forward', 'turn_right', 'turn_right', 'forward', 'forward', 'turn_left', 'forward', 'turn_left', 'forward', 'behind', 'turn_right', 'turn_right', 'turn_left', 'forward', 'turn_left')
turn_right
Using 33 examples (26 training, 7 evaluation)
('forward', 'behind', 'turn_right', 'turn_left')


In [53]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}'.format('LOSS', 'A'))
    for i in range(n_iter):
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:          
            for text, annotations in batch:
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)

        print('{0:.3f}\t{1:.3f}'  # print a simple table
              .format(losses['textcat'], scores['textcat_a'])
             )



Training the model...
LOSS 	  A  
4.560	0.571
2.549	0.571
1.280	0.571
0.561	0.714
0.144	0.714
0.042	0.714
0.003	0.714
0.001	0.714
0.000	0.714
0.001	0.714
0.000	0.714
0.000	0.714
0.000	0.714
0.000	0.714
0.000	0.714


In [20]:
test_text1 = '右に行きます'
doc1 = nlp(test_text1)
test_text1, doc1.cats

('右に行きます',
 {'forward': 0.02371937595307827,
  'behind': 0.06858570128679276,
  'turn_right': 0.5757348537445068,
  'turn_left': 0.3319600522518158})

In [21]:
test_text2 = '左に行きます'
doc2 = nlp(test_text2)
test_text2, doc2.cats

('左に行きます',
 {'forward': 1.0865726835618261e-05,
  'behind': 7.933408778626472e-05,
  'turn_right': 0.0007369335507974029,
  'turn_left': 0.9991728663444519})