<a href="https://colab.research.google.com/github/ICM-AI/retail_cloud/blob/main/retail_cloud_product_cls.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Install or upgrade the `paddlehub` and `paddlepaddle-gpu` packages in the
# notebook runtime before anything below imports them.
# NOTE(review): versions are not pinned; the recorded output shows that
# paddlehub 2.2.0 was resolved at the time this notebook was run.
!pip install --upgrade paddlehub
!pip install --upgrade paddlepaddle-gpu

Collecting paddlehub
  Downloading paddlehub-2.2.0-py3-none-any.whl (212 kB)
[?25l[K     |█▌                              | 10 kB 21.2 MB/s eta 0:00:01[K     |███                             | 20 kB 8.5 MB/s eta 0:00:01[K     |████▋                           | 30 kB 7.9 MB/s eta 0:00:01[K     |██████▏                         | 40 kB 7.3 MB/s eta 0:00:01[K     |███████▊                        | 51 kB 4.6 MB/s eta 0:00:01[K     |█████████▎                      | 61 kB 5.5 MB/s eta 0:00:01[K     |██████████▉                     | 71 kB 5.4 MB/s eta 0:00:01[K     |████████████▍                   | 81 kB 5.3 MB/s eta 0:00:01[K     |██████████████                  | 92 kB 5.9 MB/s eta 0:00:01[K     |███████████████▌                | 102 kB 5.3 MB/s eta 0:00:01[K     |█████████████████               | 112 kB 5.3 MB/s eta 0:00:01[K     |██████████████████▌             | 122 kB 5.3 MB/s eta 0:00:01[K     |████████████████████            | 133 kB 5.3 MB/s eta 0:00:01

In [4]:
import os
import random
import re

import numpy as np
import pandas as pd

import time
from datetime import datetime, timedelta

import warnings

warnings.filterwarnings('ignore')

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import MultipleLocator

%matplotlib inline

import paddle
import paddlehub as hub
from paddlehub.datasets.base_nlp_dataset import TextClassificationDataset

from sklearn.model_selection import train_test_split

In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime, since
# `google.colab` is imported here).
from google.colab import drive
drive.mount("/content/drive")

In [8]:
# Read the product spreadsheet from the dataset directory.
mf_standard_data = pd.read_excel('product_cls_data/mf_data.xlsx')

# Distinct values of the first-level category column, keeping the first
# occurrence of each so the list follows order of appearance in the sheet.
target_list = (
    mf_standard_data['product_category_first_name']
    .drop_duplicates(keep='first')
    .tolist()
)

In [10]:
class MyDataset(TextClassificationDataset):
    """Text-classification dataset backed by files in ``product_cls_data``.

    Args:
        tokenizer: Tokenizer forwarded to ``TextClassificationDataset``.
        max_seq_len: Maximum sequence length forwarded to the base class.
        mode: Split name. ``'train'`` reads ``train.txt``; every other value
            (including ``'test'`` and ``'dev'``) reads ``test.txt``.
    """

    # Directory that holds the dataset files
    base_path = 'product_cls_data'
    # Labels of the dataset
    label_list = target_list

    def __init__(self, tokenizer, max_seq_len: int = 128, mode: str = 'train'):
        # Only the training split has its own file; all remaining modes share
        # test.txt.
        data_file = 'train.txt' if mode == 'train' else 'test.txt'
        super().__init__(
            base_path=self.base_path,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            mode=mode,
            data_file=data_file,
            label_list=self.label_list,
            is_file_with_header=True,
        )


# Select the model to use and obtain its matching tokenizer.
# The number of output classes is the length of MyDataset.label_list.
model = hub.Module(name='chinese-bert-wwm',
                   task='seq-cls',
                   num_classes=len(MyDataset.label_list))
tokenizer = model.get_tokenizer()

# Instantiate the training set (MyDataset defaults: max_seq_len=128, mode='train')
train_dataset = MyDataset(tokenizer)

[32m[2022-03-26 23:18:00,668] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-wwm-chinese/bert-wwm-chinese.pdparams[0m
[32m[2022-03-26 23:18:02,472] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-wwm-chinese/bert-wwm-chinese-vocab.txt[0m


In [11]:
# Build the train / test / dev splits with the shared tokenizer and the same
# maximum sequence length. The training split must be bound to `train_dataset`
# because that is the name passed to trainer.train().
train_dataset = MyDataset(tokenizer=tokenizer,
                          max_seq_len=128,
                          mode='train')
test_dataset = MyDataset(tokenizer=tokenizer,
                         max_seq_len=128,
                         mode='test')
dev_dataset = MyDataset(tokenizer=tokenizer,
                        max_seq_len=128,
                        mode='dev')

[32m[2022-03-26 23:18:32,044] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-wwm-chinese/bert-wwm-chinese-vocab.txt[0m
[32m[2022-03-26 23:18:55,753] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-wwm-chinese/bert-wwm-chinese-vocab.txt[0m
[32m[2022-03-26 23:19:01,322] [    INFO][0m - Already cached /root/.paddlenlp/models/bert-wwm-chinese/bert-wwm-chinese-vocab.txt[0m


In [None]:
# Adam optimizer with a learning rate of 5e-5 over all of the model's parameters.
optimizer = paddle.optimizer.Adam(learning_rate=5e-5,
                                  parameters=model.parameters())
# use_gpu=True is hard-coded, so this cell expects a GPU runtime.
# NOTE(review): presumably checkpoints are saved under `checkpoint_dir` — confirm
# against the PaddleHub Trainer documentation.
trainer = hub.Trainer(model,
                      optimizer,
                      use_gpu=True,
                      checkpoint_dir='chinese-bert-wwm_product_cls')

# Train for 3 epochs with batches of 32, using dev_dataset as the evaluation set.
# NOTE(review): MyDataset maps mode='dev' to test.txt, so the evaluation set used
# during training and the test set evaluated below appear to read the same
# file — confirm this is intended.
trainer.train(train_dataset, epochs=3, batch_size=32, eval_dataset=dev_dataset)

# Evaluate the currently trained model on the test set
trainer.evaluate(test_dataset, batch_size=32)

[36m[2022-03-26 23:21:55,921] [   TRAIN][0m - Epoch=1/3, Step=10/2776 loss=2.9935 acc=0.1719 lr=0.000050 step/sec=0.86 | ETA 02:42:07[0m
[36m[2022-03-26 23:22:07,401] [   TRAIN][0m - Epoch=1/3, Step=20/2776 loss=2.7023 acc=0.2687 lr=0.000050 step/sec=0.87 | ETA 02:40:44[0m
[36m[2022-03-26 23:22:18,914] [   TRAIN][0m - Epoch=1/3, Step=30/2776 loss=2.5055 acc=0.3250 lr=0.000050 step/sec=0.87 | ETA 02:40:25[0m
[36m[2022-03-26 23:22:30,431] [   TRAIN][0m - Epoch=1/3, Step=40/2776 loss=2.2168 acc=0.3969 lr=0.000050 step/sec=0.87 | ETA 02:40:16[0m
[36m[2022-03-26 23:22:42,014] [   TRAIN][0m - Epoch=1/3, Step=50/2776 loss=2.2693 acc=0.3563 lr=0.000050 step/sec=0.86 | ETA 02:40:22[0m
[36m[2022-03-26 23:22:53,591] [   TRAIN][0m - Epoch=1/3, Step=60/2776 loss=1.9012 acc=0.5000 lr=0.000050 step/sec=0.86 | ETA 02:40:25[0m
[36m[2022-03-26 23:23:05,210] [   TRAIN][0m - Epoch=1/3, Step=70/2776 loss=1.7348 acc=0.5500 lr=0.000050 step/sec=0.86 | ETA 02:40:33[0m
[36m[2022-03-26 23: