In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
import pandas as pd
import os
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm import tqdm
import json
from collections import OrderedDict
from datetime import datetime
import logging
logging.basicConfig(level=logging.INFO, format='%(message)s')

  return torch._C._cuda_getDeviceCount() > 0


In [3]:
# Load the tokenizer published with the "kykim/bert-kor-base" checkpoint.
tokenizer = BertTokenizer.from_pretrained("kykim/bert-kor-base")

In [4]:
## Read the data
def read_data(file):
    """Load review texts and their labels from an Excel workbook.

    Args:
        file: Path (or file-like object) handed to ``pd.read_excel``; the
            sheet must provide ``label`` and ``review`` columns.

    Returns:
        tuple[list[str], list]: ``(texts, labels)`` in row order. Each text is
        ``str(review)`` with its first and last character removed
        (presumably wrapper characters in the raw export -- TODO confirm).
    """
    frame = pd.read_excel(file, engine='openpyxl')
    # Build (label, trimmed review) pairs so both output lists stay aligned.
    pairs = [(row.label, str(row.review)[1:-1]) for row in frame.itertuples()]
    labels = [label for label, _ in pairs]
    texts = [text for _, text in pairs]
    assert len(texts) == len(labels)
    return texts, labels

In [5]:
# Load every (review, label) pair from the workbook 'Ko.xlsx'.
texts, labels = read_data('Ko.xlsx')

In [6]:
# Hold out 20% of the data for validation; stratifying on the labels keeps the
# class proportions the same in both splits, and the fixed seed makes the split repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=43,
    stratify=labels,
)

In [7]:
# Sanity check: sizes of the label and text lists for each split.
len(train_labels),len(val_labels),len(train_texts),len(val_texts)

(24000, 6000, 24000, 6000)

In [8]:
###  Longest text per split (length in characters), as preparation for tokenization
max_len = max(map(len, train_texts))
print(max_len)

max_len = max(map(len, val_texts))
print(max_len)

298
298


In [9]:
# Map labels <-> integer class ids.
# The distinct labels are sorted before numbering: iterating a bare set has no
# guaranteed order (string hashing is randomized per process), so without the
# sort the ids could differ between the kernel that trained the model and a
# later kernel that reloads the saved checkpoint and decodes its predictions.
label2id = OrderedDict(
    (item, idx) for idx, item in enumerate(sorted(set(train_labels + val_labels)))
)
id2label = OrderedDict((idx, item) for item, idx in label2id.items())

In [10]:
# Tokenize the training and validation texts with identical settings
_tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **_tokenize_kwargs)
val_encodings = tokenizer(val_texts, **_tokenize_kwargs)

In [11]:
# PyTorch Dataset: create the Dataset
class CuDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with their labels.

    Each item is a dict with one tensor per encoding field plus a ``labels``
    tensor holding the class id looked up in the module-level ``label2id``.
    """

    def __init__(self, encodings, labels):
        # encodings: mapping of field name -> per-sample sequences
        # labels: raw labels, one per sample
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        position = int(idx)
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[position])
        # Convert the raw label into its integer class id.
        sample['labels'] = torch.tensor(label2id[self.labels[position]])
        return sample

In [12]:
# Wrap each split's encodings and raw labels in a Dataset.
train_dataset = CuDataset(train_encodings, train_labels)
val_dataset = CuDataset(val_encodings, val_labels)

In [13]:
#  Create the DataLoaders
batch_size = 64
# Training batches are reshuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [14]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Pretrained checkpoint with a classification head sized to the number of labels
model = BertForSequenceClassification.from_pretrained(
    "kykim/bert-kor-base",
    num_labels=len(label2id),
)
model.to(device)
model.train()

Some weights of the model checkpoint at kykim/bert-kor-base were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [15]:
## Compute accuracy, precision, recall, F1 score, confusion_matrix and classification_report
def compute_metrics(labels, preds):
    """Print a classification summary and return the F1 score.

    Args:
        labels: Ground-truth class ids.
        preds: Predicted class ids, aligned with ``labels``.

    Returns:
        The value of ``f1_score(labels, preds)``.
    """
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    # Score everything before printing so a failing metric leaves no partial report.
    scores = {name: scorer(labels, preds) for name, scorer in scorers}
    for name, value in scores.items():
        print(f'{name}: {value}\n')
    print(confusion_matrix(labels, preds))
    print(classification_report(labels, preds))
    return scores['f1']

In [16]:
@torch.no_grad()
def eval_model(model, eval_loader):
    """Evaluate ``model`` on ``eval_loader`` and return the F1 score.

    The model is switched to eval mode for the pass and put back into train
    mode before returning. Metrics are printed by ``compute_metrics``.

    Args:
        model: Classifier whose first output is the per-class scores.
        eval_loader: Iterable of batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The F1 score reported by ``compute_metrics``.
    """
    model.eval()
    gold, predicted = [], []
    for batch in eval_loader:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        gold.extend(batch['labels'].numpy())
        scores = model(ids, attention_mask=mask)[0]  # per-class scores
        predicted.extend(scores.argmax(dim=-1).cpu().numpy())  # predicted class ids
    result = compute_metrics(gold, predicted)
    model.train()
    return result

In [17]:
optim = AdamW(model.parameters(), lr=1e-5)  # optimizer over all model parameters
step = 0      # global step counter across epochs (used only in the log line)
best_f1 = 0   # best validation F1 seen so far
# Keep the epoch budget in its own name: reusing `epoch` as both the budget and
# the loop variable leaves it at 4 after the loop, so re-running the cell would
# silently train for fewer epochs.
num_epochs = 5
for epoch in range(num_epochs):
    # len(train_loader) also counts a final partial batch, which
    # len(train_texts) // batch_size would miss.
    for batch in tqdm(train_loader, total=len(train_loader)):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        # Named `batch_labels` so the notebook-level `labels` list (used when
        # splitting the data) is not overwritten by a tensor.
        batch_labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, labels=batch_labels, attention_mask=attention_mask)
        loss = outputs[0]  # the loss
        logging.info(f'Epoch-{epoch}, Step-{step}, Loss: {loss.item()}')
        step += 1
        loss.backward()
        optim.step()

    print(f'Epoch {epoch}, start evaluating.')
    f1 = eval_model(model, eval_loader)  # evaluate on the validation set
    if f1 > best_f1:
        print(f'best_f1: {f1}')
        # Save model and tokenizer together so 'model_best' can be reloaded on its own.
        model.save_pretrained('model_best')
        tokenizer.save_pretrained('model_best')
        best_f1 = f1

  0%|                                                                          | 0/375 [00:00<?, ?it/s]Epoch-0, Step-0, Loss: 0.7549371719360352
  0%|                                                                          | 0/375 [03:35<?, ?it/s]


KeyboardInterrupt: 

In [18]:
def predict(model, tokenizer, text):
    """Classify ``text`` and return its label.

    Inference runs in eval mode (dropout disabled) and without gradient
    tracking; the model's previous train/eval mode is restored afterwards.

    Args:
        model: Sequence classifier whose first output is the per-class
            scores. Inputs are moved to the module-level ``device``, so the
            model is expected to live there as well.
        tokenizer: Tokenizer matching ``model``.
        text: Input text; tokenized with truncation to ``max_length=128``.

    Returns:
        The ``id2label`` entry for the highest-scoring class of the first
        (or only) input sequence.
    """
    encoding = tokenizer(text,
                         return_tensors="pt",
                         max_length=128,
                         truncation=True,
                         padding=True)
    encoding = {k: v.to(device) for k, v in encoding.items()}
    was_training = model.training
    model.eval()  # make sure dropout is off even if the caller was mid-training
    try:
        with torch.no_grad():  # no autograd graph is needed for inference
            scores = model(**encoding)[0]
    finally:
        model.train(was_training)  # leave the model in the mode we found it
    return id2label[scores.argmax(dim=-1)[0].item()]

In [19]:
# Reload the tokenizer and model from the 'model_best' directory (written by the
# training cell when validation F1 improves) and move the model to `device`.
tokenizer = BertTokenizer.from_pretrained("model_best")
model = BertForSequenceClassification.from_pretrained(
    "model_best", num_labels=len(label2id))
model.to(device) 

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(42000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [23]:
import numpy as np
import pickle
# Despite the name, this is a flat list of predicted labels in processing order.
result_dict = []
for root, dirs, files in os.walk('test', topdown=True):
    # Sort in place so the traversal (and therefore the order of the saved
    # predictions) is the same on every run and every platform.
    dirs.sort()
    for name in sorted(files):
        print(f'process file: {name}')
        with open(os.path.join(root, name), 'r', encoding='utf8') as f:
            # Predict inside the per-file loop. Previously the line loop sat
            # after it, so only the last file of each directory was scored and
            # a directory without files hit undefined or stale `lines`.
            for line in f:
                line = line.strip()
                if not line:
                    continue  # skip blank lines
                result_dict.append(predict(model, tokenizer, line))
np.save('preds.npy', result_dict)

process file: 20210623.csv


In [24]:
# Reload the saved predictions. NOTE(review): allow_pickle=True lets numpy unpickle
# object arrays, which can run arbitrary code -- only load files this notebook wrote.
result_dict = np.load('preds.npy', allow_pickle=True)

In [25]:
# Smoke test: classify a single short Korean input with the current model.
print(predict(model, tokenizer, "바보"))

0
