In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; the following cells read and write
# files underneath this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys

# Root folder on the mounted Drive. It is added to the import path and used as
# the working directory so that relative paths (e.g. requirements.txt) resolve
# against it.
DRIVE_ROOT = '/content/drive/My Drive/'

# Guarded so re-running the cell does not keep appending the same entry.
if DRIVE_ROOT not in sys.path:
    sys.path.append(DRIVE_ROOT)
os.chdir(DRIVE_ROOT)

In [None]:
!pip install -r requirements.txt

Collecting fugashi==1.0.4
[?25l  Downloading https://files.pythonhosted.org/packages/32/0c/d0bf73e1a90aeb3e696c7741a812d4b86adc31a8a9783cc92c535ae29016/fugashi-1.0.4-cp36-cp36m-manylinux1_x86_64.whl (476kB)
[K     |████████████████████████████████| 481kB 9.1MB/s 
[?25hCollecting ipadic==1.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/e7/4e/c459f94d62a0bef89f866857bc51b9105aff236b83928618315b41a26b7b/ipadic-1.0.0.tar.gz (13.4MB)
[K     |████████████████████████████████| 13.4MB 235kB/s 
[?25hCollecting logzero==1.5.0
  Downloading https://files.pythonhosted.org/packages/97/24/27295d318ea8976b12cf9cc51d82e7c7129220f6a3cc9e3443df3be8afdb/logzero-1.5.0-py2.py3-none-any.whl
Collecting mojimoji==0.0.11
[?25l  Downloading https://files.pythonhosted.org/packages/1d/0e/eb8297652315519ccc0ca3da9e06f0457d87e27f1000f696ca537914856f/mojimoji-0.0.11-cp36-cp36m-manylinux1_x86_64.whl (126kB)
[K     |████████████████████████████████| 133kB 56.0MB/s 
[?25hCollecting sentence-tra

In [None]:
from transformers import BertJapaneseTokenizer, BertForSequenceClassification
from torchtext.data import Field, Dataset, Example
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from logzero import logger
from pathlib import Path
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import mojimoji
import collections
import time
import random
import re

In [None]:
class EarlyStopping:
    """
    Early stops the training if validation loss doesn't improve after a given patience.
    The model's ``state_dict`` is checkpointed every time the loss improves.
    based on: https://github.com/Bjarten/early-stopping-pytorch
    """
    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 7
            verbose (bool): If True, logs a message for each validation loss improvement.
                            Default: False
            delta (float): Minimum change in the monitored quantity to qualify as an improvement.
                            Default: 0
            path (str): File the best ``state_dict`` is written to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0          # consecutive checks without improvement
        self.best_score = None    # best (negated) validation loss seen so far
        self.early_stop = False   # set to True once patience is exhausted
        # np.inf instead of the np.Inf alias, which was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss (float): Validation loss of the current epoch.
            model: Model whose ``state_dict`` is saved when the loss improves.
        """
        # Negate so that "larger is better" in the comparisons below.
        score = -val_loss

        if self.best_score is None:
            # First call: always checkpoint.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Did not improve by at least `delta`.
            self.counter += 1
            logger.info(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            logger.info(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [None]:
class DataFrameDataset(Dataset):
    """
    Build a torchtext dataset from a pandas DataFrame.
    https://stackoverflow.com/questions/52602071/dataframe-as-datasource-in-torchtext
    """
    def __init__(self, examples, fields, filter_pred=None):
        """
         Create a dataset from a pandas dataframe of examples and Fields
         Arguments:
             examples pd.DataFrame: DataFrame of examples
             fields {str: Field}: The Fields to use in this tuple. The
                 string is a field name, and the Field is the associated field.
             filter_pred (callable or None): use only examples for which
                 filter_pred(example) is true, or use all examples if None.
                 Default is None
        """
        # One SeriesExample per DataFrame row.
        self.examples = examples.apply(SeriesExample.fromSeries, args=(fields,), axis=1).tolist()
        if filter_pred is not None:
            # filter() is a lazy, single-pass iterator in Python 3; materialise
            # it so the examples support len(), indexing and repeated iteration.
            self.examples = list(filter(filter_pred, self.examples))
        self.fields = dict(fields)
        # Unpack field tuples: a tuple key is replaced by one entry per
        # (name, field) pair it contains.
        for n, f in list(self.fields.items()):
            if isinstance(n, tuple):
                self.fields.update(zip(n, f))
                del self.fields[n]

In [None]:
class SeriesExample(Example):
    """An Example that can be built from a pandas Series (one DataFrame row)."""

    @classmethod
    def fromSeries(cls, data, fields):
        """Convert the Series to a plain dict and delegate to ``fromdict``."""
        row = data.to_dict()
        return cls.fromdict(row, fields)

    @classmethod
    def fromdict(cls, data, fields):
        """Create an example holding one attribute per entry of ``fields``.

        Args:
            data (dict): Maps field names to raw values.
            fields (dict): Maps field names to a Field, or to None to store
                the raw value without preprocessing.

        Raises:
            ValueError: If a name in ``fields`` is missing from ``data``.
        """
        example = cls()

        for name, processor in fields.items():
            if name not in data:
                raise ValueError("Specified key {} was not found in "
                "the input data".format(name))
            value = data[name]
            if processor is not None:
                value = processor.preprocess(value)
            setattr(example, name, value)
        return example

In [None]:
class BertClassifier:
    """Fine-tune a Japanese BERT sequence classifier on pandas DataFrames.

    Wraps a ``BertJapaneseTokenizer`` / ``BertForSequenceClassification`` pair
    and two torchtext Fields behind ``fit`` / ``predict`` / ``predict_proba``.
    Input frames need a ``Text`` column, plus a ``Label`` column for ``fit``.
    """

    # Hub identifier used when no local tokenizer / model directory is given.
    _PRETRAINED_NAME = "cl-tohoku/bert-base-japanese-whole-word-masking"

    def __init__(self, net_dir=None, tokenizer_dir=None, max_length=512, batch_size=32, num_labels=2, num_epochs=100, random_seed=None):
        """
        Args:
            net_dir: Directory of a saved model to load; None loads the
                pretrained checkpoint with a fresh ``num_labels`` head.
            tokenizer_dir: Directory of a saved tokenizer; None loads the
                pretrained tokenizer.
            max_length (int): Fixed token length every text is padded to.
            batch_size (int): Batch size for training and inference.
            num_labels (int): Number of classes (used only when net_dir is None).
            num_epochs (int): Maximum number of training epochs.
            random_seed (int or None): If given, seeds all RNGs before the
                model is created.
        """
        self.max_length = max_length
        self.batch_size = batch_size
        self.num_labels = num_labels
        self.num_epochs = num_epochs

        if random_seed is not None:
            self.seed_everything(random_seed)

        # The tokenizer is always created. Previously it was only set when
        # tokenizer_dir was None, so passing a directory left self.tokenizer
        # undefined. (self.tokenizer.save_pretrained(<dir>) produces a
        # directory usable as tokenizer_dir.)
        tokenizer_source = self._PRETRAINED_NAME if tokenizer_dir is None else tokenizer_dir
        self.tokenizer = BertJapaneseTokenizer.from_pretrained(tokenizer_source)

        if net_dir is None:
            # (self.net.save_pretrained(<dir>) produces a directory usable as net_dir.)
            self.net = BertForSequenceClassification.from_pretrained(self._PRETRAINED_NAME, num_labels=num_labels)
        else:
            self.net = BertForSequenceClassification.from_pretrained(net_dir)

        self.TEXT = torchtext.data.Field(
            sequential=True,
            tokenize=self.tokenizer_with_preprocessing,
            use_vocab=True,
            lower=False,
            include_lengths=True,
            batch_first=True,
            fix_length=max_length,
            init_token='[CLS]',
            eos_token='[SEP]',
            pad_token='[PAD]',
            unk_token='[UNK]'
        )
        self.LABEL = torchtext.data.Field(sequential=False, use_vocab=False)

    def seed_everything(self, seed):
        """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN."""
        random.seed(seed)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        logger.info('Set random seeds')

    def tokenizer_with_preprocessing(self, text):
        """Normalise ``text`` and split it into tokens with the BERT tokenizer."""
        # Convert half-width characters to full-width.
        text = mojimoji.han_to_zen(text)
        # Remove line breaks, full-width spaces and half-width spaces.
        text = re.sub('\r', '', text)
        text = re.sub('\n', '', text)
        text = re.sub('　', '', text)
        text = re.sub(' ', '', text)
        # Replace every digit (half- or full-width) with "0".
        text = re.sub(r'[0-9 ０-９]', '0', text)
        ret = self.tokenizer.tokenize(text)
        return ret

    def _build_vocab(self, ds, min_freq=1):
        """Build the TEXT vocab, then point its token->id map at the tokenizer's vocab."""
        self.TEXT.build_vocab(ds, min_freq=min_freq)
        # Use the tokenizer's ids so numericalised batches match the model's embeddings.
        self.TEXT.vocab.stoi = self.tokenizer.vocab

    @staticmethod
    def _get_device():
        """Return the CUDA device when available, otherwise the CPU."""
        return torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def fit(self, train_df, test_df, early_stopping_rounds=10, fine_tuning_type='fast'):
        """Fine-tune the network.

        Args:
            train_df: DataFrame with ``Text`` and ``Label`` columns to train on.
            test_df: DataFrame with the same columns; evaluated after every
                epoch and used as the early-stopping signal.
            early_stopping_rounds (int): Patience passed to EarlyStopping.
            fine_tuning_type (str): 'fast' trains only the last encoder layer
                and the classifier head; 'full' trains every parameter.

        Returns:
            self

        Raises:
            ValueError: If ``fine_tuning_type`` is neither 'fast' nor 'full'.
        """
        # Validate up front so a bad argument fails before any dataset is built.
        if fine_tuning_type not in ('fast', 'full'):
            logger.error('please input fine_tuning_type "fast" or "full"')
            raise ValueError(
                'fine_tuning_type must be "fast" or "full", got {!r}'.format(fine_tuning_type))

        print("-------------------------Training Phase----------------------------\n")
        print('Creating Datasets from pandas dataframes')
        train_ds = DataFrameDataset(train_df, fields={'Text': self.TEXT, 'Label': self.LABEL})
        test_ds = DataFrameDataset(test_df, fields={'Text': self.TEXT, 'Label': self.LABEL})
        logger.info('Creating datasets from  pandas dataframes has finished ')
        if not hasattr(self.TEXT, 'vocab'):
            self._build_vocab(train_ds, min_freq=1)

        print('Creating dataLoaders')
        train_dl = torchtext.data.Iterator(train_ds, batch_size=self.batch_size, train=True)
        test_dl = torchtext.data.Iterator(test_ds, batch_size=self.batch_size, train=False, sort=False)
        logger.info('Creating dataLoaders has finished ')

        dataloaders_dict = {
            'train': train_dl,
            'test': test_dl
        }

        if fine_tuning_type == 'fast':
            # 1. Freeze everything.
            for param in self.net.parameters():
                param.requires_grad = False
            # 2. Unfreeze the last BertLayer module.
            for param in self.net.bert.encoder.layer[-1].parameters():
                param.requires_grad = True
            # 3. Unfreeze the classifier head.
            for param in self.net.classifier.parameters():
                param.requires_grad = True
        else:  # 'full'
            for param in self.net.parameters():
                param.requires_grad = True

        # Optimise exactly the parameters that were unfrozen above. Previously
        # only the last layer and the classifier were handed to Adam, so 'full'
        # computed gradients for the whole network but never applied them.
        trainable_params = [p for p in self.net.parameters() if p.requires_grad]
        optimizer = optim.Adam(trainable_params, lr=5e-5, betas=(0.9, 0.999))

        # Loss function (passed through to _train_model).
        criterion = nn.CrossEntropyLoss()

        # Run training and evaluation.
        self.net = self._train_model(
            self.net, dataloaders_dict, criterion, optimizer, num_epochs=self.num_epochs,
            patience=early_stopping_rounds)

        return self

    @staticmethod
    def _train_model(net, dataloaders_dict, criterion, optimizer, num_epochs, patience):
        """Run the train/test epoch loop with early stopping.

        Args:
            net: Model called as ``net(input_ids=..., labels=...)``; element 0
                of its output is used as the loss and element 1 as the logits.
            dataloaders_dict (dict): Iterators under the keys 'train' and 'test'.
            criterion: Unused here; the loss comes from the model's own output.
            optimizer: Optimizer stepped once per training batch.
            num_epochs (int): Maximum number of epochs.
            patience (int): Early-stopping patience, counted in 'test' evaluations.

        Returns:
            The network: restored from 'checkpoint.pt' when early stopping
            fired, otherwise in its state after the final epoch.
        """
        device = BertClassifier._get_device()
        logger.info(f"使用デバイス：{device}")
        logger.info('-----start-------')

        net.to(device)

        # cuDNN autotuning speeds up fixed-shape workloads, but only enable it
        # when determinism was not requested (seed_everything turns it on);
        # otherwise this would silently undo the seeding.
        if not torch.backends.cudnn.deterministic:
            torch.backends.cudnn.benchmark = True

        early_stopping = EarlyStopping(patience=patience, verbose=True)

        calc_f1_average = 'macro' if net.num_labels > 2 else 'binary'

        for epoch in range(num_epochs):
            for phase in ['train', 'test']:
                is_train = phase == 'train'
                if is_train:
                    net.train()
                else:
                    net.eval()

                epoch_loss = 0.0    # sum of per-example losses
                epoch_corrects = 0  # number of correct predictions
                predictions = []
                ground_truths = []

                for batch in dataloaders_dict[phase]:
                    # batch.Text is a (token ids, lengths) pair because the
                    # TEXT field uses include_lengths=True.
                    inputs = batch.Text[0].to(device)
                    labels = batch.Label.to(device)

                    optimizer.zero_grad()

                    with torch.set_grad_enabled(is_train):
                        # Index instead of tuple-unpacking so this works for
                        # both tuple and ModelOutput return types.
                        outputs = net(input_ids=inputs, labels=labels)
                        loss, logit = outputs[0], outputs[1]
                        preds = torch.argmax(logit, dim=1)

                        if is_train:
                            loss.backward()
                            optimizer.step()

                    predictions.append(preds.cpu().numpy())
                    ground_truths.append(labels.cpu().numpy())

                    # Weight by the actual batch size: the last batch may be
                    # smaller than the iterator's nominal batch_size.
                    epoch_loss += loss.item() * inputs.size(0)
                    epoch_corrects += (preds == labels).sum().item()

                n_examples = len(dataloaders_dict[phase].dataset)
                epoch_loss = epoch_loss / n_examples
                epoch_acc = epoch_corrects / n_examples
                # Concatenate the per-batch arrays directly; wrapping the list
                # in np.array() first fails on a ragged (smaller) last batch.
                epoch_f1_score = f1_score(
                    np.concatenate(ground_truths), np.concatenate(predictions),
                    average=calc_f1_average)
                logger.info('Epoch {}/{} | {:^5} |  Loss: {:.4f} Acc: {:.4f} F1-Score: {:.4f}'.format(
                    epoch + 1, num_epochs, phase, epoch_loss, epoch_acc, epoch_f1_score))

                # The evaluation phase is named 'test' here; the previous check
                # for 'val' never matched, so early stopping never ran.
                if phase == 'test':
                    early_stopping(epoch_loss, net)
                    if early_stopping.early_stop:
                        logger.info("Early stopping")
                        # load the last checkpoint with the best model
                        net.load_state_dict(torch.load('checkpoint.pt', map_location=device))
                        return net

        torch.cuda.empty_cache()
        return net

    def _make_inference_iterator(self, test_df):
        """Build an unshuffled, unsorted iterator over the ``Text`` column of ``test_df``."""
        test_ds = DataFrameDataset(test_df, fields={'Text': self.TEXT})
        if not hasattr(self.TEXT, 'vocab'):
            self._build_vocab(test_ds, min_freq=1)
        return torchtext.data.Iterator(test_ds, batch_size=self.batch_size, train=False, sort=False)

    def _predict_proba_batches(self, test_dl, device):
        """Return a list with one softmax probability vector per example, in iteration order."""
        self.net.eval()
        self.net.to(device)

        probabilities = []
        with torch.no_grad():
            for batch in tqdm(test_dl):
                inputs = batch.Text[0].to(device)
                logit = self.net(input_ids=inputs)[0]
                # extend() adds one row (one example) at a time.
                probabilities.extend(F.softmax(logit, dim=1).cpu().numpy())
        return probabilities

    def predict(self, test_df):
        """Predict a class index for every row of ``test_df``.

        Returns:
            np.ndarray: The argmax class index per row.
        """
        print("-------------------------Prediction Phase----------------------------\n")
        print('Creating dataset and dataloader from pandas dataframe')
        test_dl = self._make_inference_iterator(test_df)
        logger.info('Dataset and DataLoader from pandas dataframe has finished')
        device = self._get_device()
        logger.info(f"使用デバイス：{device}")
        logger.info('-----Start Prediction -------')

        probabilities = self._predict_proba_batches(test_dl, device)
        y_pred = [np.argmax(each_proba) for each_proba in probabilities]

        logger.info('------Finished Prediction------')
        return np.array(y_pred)

    def predict_proba(self, test_df):
        """Predict class probabilities for every row of ``test_df``.

        Returns:
            np.ndarray: One softmax probability vector per row.
        """
        test_dl = self._make_inference_iterator(test_df)
        device = self._get_device()
        logger.info(f"使用デバイス：{device}")
        logger.info('-----start-------')

        pred_proba = self._predict_proba_batches(test_dl, device)

        logger.info('-----finished-------')
        return np.array(pred_proba)

In [None]:
import glob

# 分類のテスト用コーパスを生成するための関数 
def load_livedoor_news_corpus(corpus_dir="/content/drive/My Drive/corpus/text"):
    """Load the livedoor news corpus as parallel lists of texts and labels.

    For every category, files matching ``<corpus_dir>/<category>/<category>*.txt``
    are read in sorted order. The first two lines of each file (URL and
    timestamp, per the original comments) are discarded; the third line
    (title) and all remaining lines (body) are concatenated into one string.

    Args:
        corpus_dir: Directory holding one sub-directory per category.

    Returns:
        tuple: ``(docs, labels)`` where ``docs`` is a list of strings and
        ``labels`` the matching list of integer category ids.
    """
    # Category name -> integer label.
    category = {
        "dokujo-tsushin": 0,
        "it-life-hack":1,
        "kaden-channel": 2,
        "livedoor-homme": 3,
        "movie-enter": 4,
        "peachy": 5,
        "smax": 6,
        "sports-watch": 7,
        "topic-news":8
    }

    docs = []
    labels = []

    for c_name, c_id in category.items():
        # Escape the directory part so glob metacharacters in corpus_dir are
        # taken literally; only the trailing "*" acts as a wildcard.
        category_dir = glob.escape(os.path.join(corpus_dir, c_name))
        pattern = os.path.join(category_dir, "{c_name}*.txt".format(c_name=c_name))
        # glob order is arbitrary; sort so the corpus order is reproducible.
        files = sorted(glob.glob(pattern))
        # Show the category name and its number of files.
        print("category: ", c_name, ", ",  len(files))

        for path in files:
            with open(path, "r", encoding="utf-8") as f:
                lines = f.read().splitlines()

            # lines[0] and lines[1] (URL, timestamp) are not used.
            subject = lines[2]
            # Collapse the article body into a single line.
            body = "".join(lines[3:])

            docs.append(subject + body)
            labels.append(c_id)

    return docs, labels

In [None]:
# Cache of the assembled corpus so the raw text files are only parsed once.
CORPUS_CSV = "/content/drive/My Drive/corpus/test_corpus_using_livedoor_dataset.csv"

if os.path.exists(CORPUS_CSV):
    # usecols keeps only the two real columns, which also drops the unnamed
    # index column present in cache files written with the index included.
    df = pd.read_csv(CORPUS_CSV, usecols=["Text", "Label"])
else:
    docs, labels = load_livedoor_news_corpus()
    df = pd.DataFrame({"Text": docs, "Label": labels})
    # index=False so a cached read yields the same columns as a fresh build.
    df.to_csv(CORPUS_CSV, index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split. random_state pins the shuffle so a re-run of the
# notebook produces the same train/test partition.
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["Label"], random_state=42)

In [None]:
# The number of classes is derived from the labels present in the data.
# random_seed makes BertClassifier seed the RNGs before the model is created,
# so the run is repeatable.
model = BertClassifier(num_labels=len(np.unique(df["Label"])), num_epochs=100, random_seed=42)
# fit() returns the classifier itself; the trailing ";" keeps its repr out of the cell output.
model.fit(train_df, test_df, early_stopping_rounds=10);

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=257706.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=445021143.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialize

-------------------------Training Phase----------------------------

Creating Datasets from pandas dataframes


[I 201017 05:11:38 <ipython-input-9-f1de98c0c243>:66] Creating datasets from  pandas dataframes has finished 
[I 201017 05:11:39 <ipython-input-9-f1de98c0c243>:73] Creating dataLoaders has finished 
[I 201017 05:11:39 <ipython-input-9-f1de98c0c243>:118] 使用デバイス：cuda
[I 201017 05:11:39 <ipython-input-9-f1de98c0c243>:119] -----start-------


Creating dataLoaders


[I 201017 05:15:56 <ipython-input-9-f1de98c0c243>:200] Epoch 1/100 | train |  Loss: 1.1523 Acc: 0.6559 F1-Score: 0.620737
[I 201017 05:16:49 <ipython-input-9-f1de98c0c243>:200] Epoch 1/100 | test  |  Loss: 0.5000 Acc: 0.8643 F1-Score: 0.854155
[I 201017 05:20:56 <ipython-input-9-f1de98c0c243>:200] Epoch 2/100 | train |  Loss: 0.4092 Acc: 0.8843 F1-Score: 0.876274
[I 201017 05:21:49 <ipython-input-9-f1de98c0c243>:200] Epoch 2/100 | test  |  Loss: 0.2853 Acc: 0.9213 F1-Score: 0.916354
[I 201017 05:25:56 <ipython-input-9-f1de98c0c243>:200] Epoch 3/100 | train |  Loss: 0.2574 Acc: 0.9296 F1-Score: 0.924051
[I 201017 05:26:49 <ipython-input-9-f1de98c0c243>:200] Epoch 3/100 | test  |  Loss: 0.2287 Acc: 0.9355 F1-Score: 0.931431
[I 201017 05:30:56 <ipython-input-9-f1de98c0c243>:200] Epoch 4/100 | train |  Loss: 0.1833 Acc: 0.9481 F1-Score: 0.943707
[I 201017 05:31:49 <ipython-input-9-f1de98c0c243>:200] Epoch 4/100 | test  |  Loss: 0.1750 Acc: 0.9450 F1-Score: 0.941733
[I 201017 05:35:57 <ipyt

<__main__.BertClassifier at 0x7f7649d9bf98>

In [None]:
# Model predictions and the matching gold labels for the held-out split.
y_pred = model.predict(test_df)
y_true = test_df["Label"]

-------------------------Prediction Phase----------------------------

Creating dataaet and dataloader from pandas dataframe


[I 201017 13:32:31 <ipython-input-9-f1de98c0c243>:223] Dataset and DataLoader from pandas dataframe has finished
[I 201017 13:32:31 <ipython-input-9-f1de98c0c243>:226] 使用デバイス：cuda
[I 201017 13:32:31 <ipython-input-9-f1de98c0c243>:227] -----Start Prediction -------
100%|██████████| 47/47 [00:52<00:00,  1.12s/it]
[I 201017 13:33:24 <ipython-input-9-f1de98c0c243>:244] ------Finished Prediction------


In [None]:
# Display names indexed by the integer label id assigned when the corpus was loaded.
label_names = ["dokujo-tsushin", "it-life-hack",  "kaden-channel",
               "livedoor-homme", "movie-enter",  "peachy",
               "smax", "sports-watch","topic-news"]
print("Accuracy: {:.4f}".format(accuracy_score(y_true, y_pred)))
# labels= pins the label ids to the names, so the report is still produced
# (and rows stay aligned) if some class is absent from y_true / y_pred.
print("{}\n".format(classification_report(
    y_true=y_true, y_pred=y_pred,
    labels=list(range(len(label_names))), target_names=label_names)))

Acuuracy: 0.9647
                precision    recall  f1-score   support

dokujo-tsushin       0.95      0.96      0.95       174
  it-life-hack       0.98      0.97      0.97       174
 kaden-channel       0.97      0.98      0.97       173
livedoor-homme       0.92      0.86      0.89       102
   movie-enter       0.98      0.97      0.97       174
        peachy       0.93      0.93      0.93       169
          smax       0.97      0.99      0.98       174
  sports-watch       0.99      0.98      0.99       180
    topic-news       0.97      0.99      0.98       154

      accuracy                           0.96      1474
     macro avg       0.96      0.96      0.96      1474
  weighted avg       0.96      0.96      0.96      1474


