In [None]:
# !pip install -U ginza
# import spacy
# from spacy import displacy
# from thinc.extra.load_nlp import get_spacy
# import pkg_resources, imp
# import pandas as pd
# from sklearn.model_selection import train_test_split
# import copy

# # evaluate
# from sklearn.metrics import precision_recall_fscore_support
# from sklearn.metrics import classification_report

# # minibatch
# import random
# from spacy.util import minibatch, compounding

# train.py

In [None]:
!pip install -U ginza
import spacy
from spacy import displacy
from thinc.extra.load_nlp import get_spacy
import pkg_resources, imp
import pandas as pd
from sklearn.model_selection import train_test_split
import copy

# evaluate
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import classification_report

# minibatch
import random
from spacy.util import minibatch, compounding

def _get_model():
  """Load and return the GiNZA Japanese pipeline ("ja_ginza").

  `pkg_resources` is reloaded before loading; presumably this lets a package
  installed earlier in the same interpreter session be discovered without a
  restart -- TODO confirm.
  """
  imp.reload(pkg_resources)
  return get_spacy("ja_ginza")

def conv_cat(cats, labels):
  """Convert gold labels into one-hot ``cats`` dictionaries.

  Args:
    cats: Iterable holding the gold label of each example (any iterable is
      accepted, it does not need to support ``len()``).
    labels: Iterable of every known label.

  Returns:
    A list with one dict per entry of ``cats``. Each dict maps every label in
    ``labels`` to ``False`` except the example's own label, which maps to
    ``True``. A gold label that is missing from ``labels`` is still added to
    that example's dict with the value ``True``.
  """
  # The values are immutable bools, so a fresh shallow dict per example is
  # enough; no deep copy is required.
  template = dict.fromkeys(labels, False)
  return [{**template, label: True} for label in cats]

def build_bow_text_classifier(
    nr_class, ngram_size=1, exclusive_classes=False, no_output_layer=False, **cfg):
  """Build a bag-of-n-grams linear text-classification model.

  NOTE(review): `Model`, `chain`, `with_cpu`, `extract_ngrams`, `ORTH`,
  `LinearModel`, `cpu_softmax` and `logistic` are not imported in the visible
  part of this file; they must be in scope before this function is called.

  Args:
    nr_class: Number of output classes; also stored on the model as ``nO``.
    ngram_size: Forwarded to ``extract_ngrams``.
    exclusive_classes: When true the output layer is ``cpu_softmax``,
      otherwise ``logistic``.
    no_output_layer: When true no output layer is appended.
    **cfg: Accepted for interface compatibility; not used here.

  Returns:
    The composed model.
  """
  with Model.define_operators({">>": chain}):
    model = with_cpu(
        Model.ops, extract_ngrams(ngram_size, attr=ORTH) >> LinearModel(nr_class)
    )
    # This composition has to stay inside the `with` block: `>>` is bound to
    # `chain` only while the operator definition is active.
    if not no_output_layer:
      model = model >> (cpu_softmax if exclusive_classes else logistic)
  model.nO = nr_class
  return model

def evaluate(tokenizer, textcat, docs, cats, verbose=False):
    """Score the predictions of `textcat` on `docs` against the gold `cats`.

    For both the gold and the predicted category dicts, the label with the
    highest value is taken as the class of the document.

    Args:
        tokenizer: Not used by this function; kept in the signature.
        textcat: Object whose ``pipe(docs)`` yields docs carrying a ``cats``
            dict.
        docs: Documents to classify.
        cats: Gold category dicts, one per document.
        verbose: When equal to ``False`` a metrics dict is returned, otherwise
            the output of ``classification_report``.

    Returns:
        Either ``{"textcat_p": ..., "textcat_r": ..., "textcat_f": ...}`` with
        micro-averaged precision / recall / F1, or the value returned by
        ``classification_report``.
    """
    def top_label(scores):
        # Key of the entry with the largest value.
        return max(scores.items(), key=lambda item: item[1])[0]

    gold = [top_label(cat) for cat in cats]
    predicted = [top_label(doc.cats) for doc in textcat.pipe(docs)]

    # Deliberately an equality test rather than truthiness, so that e.g.
    # ``verbose=None`` still selects the full report.
    if verbose != False:  # noqa: E712
        return classification_report(gold, predicted)

    precision, recall, f_score = precision_recall_fscore_support(
        gold, predicted, average="micro")[:3]
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}

class Main:
  """Trains a spaCy `textcat` pipe on top of the GiNZA pipeline."""

  def __init__(self):
    # NOTE(review): `model_path` is never read in this class; `train` writes
    # the pipeline to "./model/new_model" instead -- confirm which is intended.
    self.model_path = "./model/my_model"
    self.nlp = _get_model()

  def train(self, training_directory):
    """Train a bag-of-words text classifier and save the pipeline to disk.

    Args:
      training_directory: Despite the name, this is handed straight to
        ``pd.read_csv``; the CSV must provide ``text`` and ``label`` columns.

    Side effects:
      Adds a ``textcat`` pipe to ``self.nlp``, prints per-epoch loss and
      dev-set precision / recall / F1, prints a report for the test split and
      writes the pipeline to ``./model/new_model``.
    """
    nlp = self.nlp

    df = pd.read_csv(training_directory)
    # 60% train, then the remaining 40% is halved into test and dev (20% each).
    train_df, test_df = train_test_split(df, test_size=0.4, random_state=0)
    test_df, dev_df = train_test_split(test_df, test_size=0.5, random_state=0)

    # Only labels present in the training split are registered.
    labels = train_df['label'].unique()

    train_cats = conv_cat(train_df['label'], labels)
    dev_cats = conv_cat(dev_df['label'], labels)
    test_cats = conv_cat(test_df['label'], labels)

    train_docs = list(nlp.pipe(train_df['text'].values, disable=['ner']))
    dev_docs = list(nlp.pipe(dev_df['text'].values, disable=['ner']))
    test_docs = list(nlp.pipe(test_df['text'].values, disable=['ner']))

    train_data = list(zip(train_docs, [{"cats": cats} for cats in train_cats]))

    textcat = nlp.create_pipe(
        "textcat", config={"exclusive_classes": True, "architecture": "bow"})
    nlp.add_pipe(textcat, last=True)
    for label in labels:
      textcat.add_label(label)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    n_iter = 20

    with nlp.disable_pipes(*other_pipes):  # only train textcat
      optimizer = textcat.begin_training()
      print("Training the model...")
      print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
      # One generator shared by all epochs, so the batch size keeps growing
      # across epochs instead of restarting at 4.
      batch_sizes = compounding(4.0, 32.0, 1.001)
      for _ in range(n_iter):
        losses = {}
        random.shuffle(train_data)
        for batch in minibatch(train_data, size=batch_sizes):
          texts, annotations = zip(*batch)
          nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
          # Score the dev split with the averaged parameters.
          scores = evaluate(nlp.tokenizer, textcat, dev_docs, dev_cats)
        print(
            "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
        )

    with textcat.model.use_params(optimizer.averages):
      report = evaluate(nlp.tokenizer, textcat, test_docs, test_cats, verbose=True)
      # NOTE(review): the value printed here is the training loss accumulated
      # during the last epoch, not a loss measured on the test split.
      print("test loss = %5.3f\n" % (losses["textcat"]))
      print(report)

    nlp.to_disk("./model/new_model")

Requirement already up-to-date: ginza in /usr/local/lib/python3.6/dist-packages (4.0.5)


# main.py

In [None]:
import spacy
import pandas as pd

class Main:
  """Loads the saved pipeline and predicts a category plus entities per text."""

  def __init__(self):
    self.nlp = spacy.load("./model/new_model")

  def predict(self, texts):
    """Classify each text and collect its distinct named entities.

    Args:
      texts: Iterable of texts, passed to ``self.nlp.pipe``.

    Returns:
      A list with one ``[prediction, ne_text, ne_label]`` entry per text:
      ``prediction`` is the key of ``doc.cats`` with the highest score (or
      ``None`` when ``doc.cats`` is empty); ``ne_text`` and ``ne_label`` are
      parallel lists with the text and label of each distinct
      (text, label) entity pair, in order of first appearance.
    """
    result = []
    for doc in self.nlp.pipe(texts):
      prediction = max(doc.cats, key=doc.cats.get, default=None)
      ne_text = []
      ne_label = []
      # De-duplicate on the (text, label) pair. Checking the two lists
      # independently would drop a new pair whenever its text and its label
      # had each been seen before on different entities.
      seen = set()
      for ent in doc.ents:
        pair = (ent.text, ent.label_)
        if pair in seen:
          continue
        seen.add(pair)
        ne_text.append(ent.text)
        ne_label.append(ent.label_)
      result.append([prediction, ne_text, ne_label])
    return result

if __name__ == '__main__':
  # Script entry point: classify every row of ./test.csv and print the result.
  temp = pd.read_csv("./test.csv")  # the CSV must provide a "text" column
  m = Main()
  # `predict` holds whatever Main.predict returns for the "text" column.
  predict = m.predict(temp["text"].values)
  print(predict)

[['movie-enter', ['——', 'ホワイト・カラー', 'ティム・ディケイ', 'ピーター', 'バーク', 'ニール', 'ふたり', '俳優', 'ティム', 'プロデューサー', 'ボマー', '銃弾', 'アメリカ', '私の家庭では妻がズボンを履く', 'エリザベス', '犬', 'サッチモ', 'FBI', 'ナーバス', '脚本家', '愛妻家', '敏腕捜査官', 'ハニー', '人間', 'ひとつ', '両極端', '——シーズン2', 'シーズン2', '先生', 'シーズン3', '巡業', '2時間', '編集者', 'カメラマン', '1日', 'スタッフ', 'ウィリー', 'ティファニー・ティーセン', 'ティファニー', '１話', '７日間', 'ふたつ', '音声さん', 'ホワイトカラー “知的”犯罪ファイル'], ['Ordinal_Number', 'Book', 'Person', 'Person', 'Person', 'Person', 'N_Person', 'Position_Vocation', 'Person', 'Position_Vocation', 'Person', 'Animal_Part', 'Country', 'Book', 'Person', 'Mammal', 'Person', 'Position_Vocation', 'Person', 'Position_Vocation', 'Flora', 'Position_Vocation', 'Person', 'Mammal', 'Countx_Other', 'Animal_Disease', 'Date', 'Ordinal_Number', 'Position_Vocation', 'Ordinal_Number', 'Person', 'Period_Time', 'Position_Vocation', 'Position_Vocation', 'Period_Day', 'Position_Vocation', 'Person', 'Person', 'Product_Other', 'Ordinal_Number', 'Period_Day', 'N_Product', 'Position_Vocation',

In [None]:
predict[0]

['movie-enter',
 ['——',
  'ホワイト・カラー',
  'ティム・ディケイ',
  'ピーター',
  'バーク',
  'ニール',
  'ふたり',
  '俳優',
  'ティム',
  'プロデューサー',
  'ボマー',
  '銃弾',
  'アメリカ',
  '私の家庭では妻がズボンを履く',
  'エリザベス',
  '犬',
  'サッチモ',
  'FBI',
  'ナーバス',
  '脚本家',
  '愛妻家',
  '敏腕捜査官',
  'ハニー',
  '人間',
  'ひとつ',
  '両極端',
  '——シーズン2',
  'シーズン2',
  '先生',
  'シーズン3',
  '巡業',
  '2時間',
  '編集者',
  'カメラマン',
  '1日',
  'スタッフ',
  'ウィリー',
  'ティファニー・ティーセン',
  'ティファニー',
  '１話',
  '７日間',
  'ふたつ',
  '音声さん',
  'ホワイトカラー “知的”犯罪ファイル'],
 ['Ordinal_Number',
  'Book',
  'Person',
  'Person',
  'Person',
  'Person',
  'N_Person',
  'Position_Vocation',
  'Person',
  'Position_Vocation',
  'Person',
  'Animal_Part',
  'Country',
  'Book',
  'Person',
  'Mammal',
  'Person',
  'Position_Vocation',
  'Person',
  'Position_Vocation',
  'Flora',
  'Position_Vocation',
  'Person',
  'Mammal',
  'Countx_Other',
  'Animal_Disease',
  'Date',
  'Ordinal_Number',
  'Position_Vocation',
  'Ordinal_Number',
  'Person',
  'Period_Time',
  'Position_Vocation',
  'Posi