In [1]:
import time, math
import torch
import torch.utils.data as data
import os
import pandas as pd
import json
import numpy

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')
# %cd /content/drive/My\ Drive/Colab Notebooks/OursRepository/public-opinion-monitor

# !pip install transformers

In [2]:
from transformers import BertForSequenceClassification as Model, BertTokenizer as Tokenizer
import logging
logging.basicConfig(level=logging.INFO)

In [3]:
class DotDict(dict):
    """A ``dict`` whose keys can also be read and written as attributes.

    ``d.key`` and ``d['key']`` refer to the same entry, in both directions.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Use the mapping itself as the attribute namespace so attribute
        # access and item access share one storage.
        self.__dict__ = self

In [4]:
# Notebook-wide settings, reachable as attributes (opt.batch_size, ...).
opt = DotDict({
    'num_labels': 6,
    'batch_size': 10,
    'num_workers': 0,
    'cache_dir': '../../PretrainedData/Transformers/bert-base-chinese',
    'data_path': 'data',
})

print('cuda.is_available',torch.cuda.is_available())

# Remember whether tensors and the model should be moved to the GPU.
opt.gpu = torch.cuda.is_available()

cuda.is_available True


In [5]:
# num_labels is the number of classes for the classification task.
model = Model.from_pretrained(
    'bert-base-chinese',
    cache_dir=opt.cache_dir,
    num_labels=opt.num_labels,
)
tokenizer = Tokenizer.from_pretrained('bert-base-chinese', cache_dir=opt.cache_dir)
print(tokenizer)

# Move the model to the GPU only when one was detected in the config cell.
model = model.cuda() if opt.gpu else model

INFO:transformers.configuration_utils:loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at ../../PretrainedData/Transformers/bert-base-chinese\8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.3767c74c8ed285531d04153fe84a0791672aff52f7249b27df341dbce09b8305
INFO:transformers.configuration_utils:Model config BertConfig {
  "_num_labels": 6,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
 

<transformers.tokenization_bert.BertTokenizer object at 0x000001D114FFC648>


In [None]:
# Scratch check: how two frames with the same columns stack row-wise.
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df2 = pd.DataFrame([[5, 6], [7, 8]], columns=list('AB'))
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat gives the same result (original index labels are kept).
df = pd.concat([df, df2])
print(df)

In [18]:
def isNan(a):
    """Return the result of ``a != a``.

    For a float NaN this is True, because NaN compares unequal to itself;
    for ordinary numbers and strings it is False.
    """
    differs_from_itself = a != a
    return differs_from_itself

class RatingData4Bert(data.Dataset):
    """Dataset of tokenised review comments with an integer rating label.

    The first construction for a directory reads ``ratings.csv``, drops rows
    without a comment, blends the four rating columns into one label,
    tokenises each comment and writes the result to ``ratings_clean.csv`` in
    the same directory. Later constructions load that cache instead.

    Each item is ``(token_ids, label, position_ids)``.

    Args:
        path: Directory holding ``ratings.csv`` (not the file itself).
        tokenizer: Object with an ``encode(text=..., max_length=...,
            pad_to_max_length=...)`` method returning a list of token ids.
        max_row: ``-1`` for no limit. Otherwise at most ``max_row`` raw rows
            are scanned when the cache is built, and at most ``max_row``
            samples are loaded from the selected split.
        trainTestRate: Fraction of the cleaned rows (from the start) that
            forms the training split; the remainder is the test split.
        isTrain: Select the training split when True, else the test split.
    """

    # Sequence length requested from the tokenizer and used for position ids.
    MAX_LENGTH = 100
    _CLEAN_COLUMNS = ['userId', 'restId', 'rating', 'comment']

    def __init__(self, path, tokenizer: 'Tokenizer', max_row = -1, trainTestRate = 0.85, isTrain = True):
        self.token_list = []
        self.label_list = []
        self.token_positions = torch.arange(self.MAX_LENGTH)

        print('地址不应该包含 ratings.csv   path:',path) # the path must not include ratings.csv

        ratings_clean_filename = os.path.join(path, 'ratings_clean.csv')
        ratings_filename = os.path.join(path, 'ratings.csv')
        if os.path.isfile(ratings_clean_filename):
            clean_pd = pd.read_csv(ratings_clean_filename)
        else:
            print('没有找到缓存的文件%s, 读取源文件%s'%(ratings_clean_filename, ratings_filename))
            ratings_pd = pd.read_csv(ratings_filename)
            print('开始生成缓存文件%s'%(ratings_clean_filename))
            clean_pd = self._build_clean_frame(ratings_pd, tokenizer, max_row)
            clean_pd.to_csv(ratings_clean_filename)

        # Pick the requested split by position.
        split_at = int(len(clean_pd) * trainTestRate)
        if isTrain:
            temp_pd = clean_pd.iloc[:split_at]
        else:
            temp_pd = clean_pd.iloc[split_at:]

        # Apply the row limit by position, not by index label: the test
        # split's labels start at ``split_at``, so a label-based comparison
        # would truncate (or empty) it.
        if max_row != -1:
            temp_pd = temp_pd.iloc[:max(max_row, 0)]

        for rating, comment in zip(temp_pd['rating'], temp_pd['comment']):
            self.label_list.append(torch.tensor(int(rating), dtype=torch.long))
            # Comments are stored in the cache as JSON lists of token ids.
            self.token_list.append(torch.tensor(json.loads(comment), dtype=torch.long))

    @staticmethod
    def _combine_ratings(overall, env, flavor, service):
        """Blend the overall rating and the three sub-ratings into one integer label."""
        def _missing(value):
            return (isinstance(value, str) and value == '') or pd.isna(value)

        if _missing(overall): overall = 0  # an overall rating of 0 is taken to mean "not rated"
        if _missing(env): env = 3
        if _missing(flavor): flavor = 3
        if _missing(service): service = 3
        return int(round(overall * 0.5 + (env + flavor + service) * 0.1666666))

    @classmethod
    def _build_clean_frame(cls, ratings_pd, tokenizer, max_row):
        """Turn the raw ratings frame into the cached userId/restId/rating/comment frame."""
        if max_row != -1:
            ratings_pd = ratings_pd.iloc[:max(max_row, 0)]

        # Collect plain records and build the frame once; appending to a
        # DataFrame per row is quadratic and DataFrame.append no longer
        # exists in pandas 2.x.
        records = []
        nonRatingCount = 0
        for i, (_, row) in enumerate(ratings_pd.iterrows()):
            comment = row['comment']
            if not isinstance(comment, str) or comment == '':
                nonRatingCount += 1
                continue
            r0 = cls._combine_ratings(row['rating'], row['rating_env'],
                                      row['rating_flavor'], row['rating_service'])
            if i % 10000 == 9999:
                print(i + 1, r0)

            token = tokenizer.encode(text=comment, max_length=cls.MAX_LENGTH, pad_to_max_length = True)
            records.append({
                'userId': row['userId'],
                'restId': row['restId'],
                'rating': r0,
                'comment': json.dumps(token),
            })
        print('空的评论数量： %d'%(nonRatingCount))
        # Explicit columns keep the schema intact even when no row survived.
        return pd.DataFrame(records, columns=cls._CLEAN_COLUMNS)

    def __getitem__(self, index):
        return self.token_list[index], self.label_list[index], self.token_positions

    def __len__(self):
        return len(self.label_list)

In [None]:
# Training split and its loader.
ratingData = RatingData4Bert(
    path='../../DataSets/yf_dianping',
    tokenizer=tokenizer,
    max_row=200000,
    isTrain=True,
)
trainLoader = data.DataLoader(ratingData, batch_size=opt.batch_size, shuffle=True)

# Held-out split and its loader, built from the same directory.
ratingData2 = RatingData4Bert(
    path='../../DataSets/yf_dianping',
    tokenizer=tokenizer,
    max_row=200000,
    isTrain=False,
)
testLoader = data.DataLoader(ratingData2, batch_size=opt.batch_size, shuffle=True)

# Sample counts of the two splits.
print(len(ratingData))
print(len(ratingData2))

地址不应该包含 ratings.csv   path: ../../DataSets/yf_dianping


In [None]:
from torch.optim.optimizer import Optimizer
from torch.nn.modules.loss import MSELoss as Loss

def trainOneEpoch(epoch, model: 'Model', trainLoader, optimizer: Optimizer, opt, log_every=1000):
    """Run one optimisation pass over ``trainLoader``.

    Args:
        epoch: Epoch number, used only in the printed messages.
        model: Callable as ``model(input_ids=, labels=, position_ids=)`` whose
            first output is the loss.
        trainLoader: Iterable yielding ``(x, y, p)`` batches.
        optimizer: Optimizer stepping the model's parameters.
        opt: Settings object; ``opt.gpu`` decides whether batches are moved
            to the GPU.
        log_every: Print the current loss every ``log_every`` batches;
            a falsy value disables the periodic message.
    """
    model.train()

    startTime = time.time()
    for i, (x, y, p) in enumerate(trainLoader):
        if opt.gpu:
            x = x.cuda()
            y = y.cuda()
            p = p.cuda()

        optimizer.zero_grad()
        outputs = model(input_ids = x, labels = y, position_ids = p)
        loss = outputs[0]
        loss.backward()
        optimizer.step()

        if log_every and i % log_every == log_every - 1:
            # Tensor has no `.means()`; `.item()` yields the scalar loss as a float.
            print('Epoch %d, %d/%d, loss:%f ' % (epoch, i, len(trainLoader), loss.item()))
    print('Epoch %d cost time: %.3fs' % (epoch, time.time() - startTime))


def testModel(epoch, model:Model, testLoader, opt):
    model.eval()

    total = 0
    correct = 0

    startTime = time.time()
    for i, (x, y, p) in enumerate(testLoader):
        if opt.gpu:
            x = x.cuda()
            y = y.cuda()
            p = p.cuda()

        outputs = model(input_ids = x, labels = y, position_ids = p)
        loss = outputs[0]
        logits = outputs[1]
        _, predicted = torch.max(logits.data, 1)

        total += x.size(0)
        correct += predicted.data.eq(y.data).cpu().sum()

        if i % 1000 == 999:
            print('Epoch Test %d, %d/%d, loss:%f ' % (epoch, i, len(testLoader), loss))
    print('Epoch Test %d cost time: %.3fs' % (epoch, time.time() - startTime))
    print('准确率： %.3f' % (correct / total))


def train(nepoch, modelSavePath):
    """Train the notebook's global ``model`` for ``nepoch`` epochs, then save its weights.

    Every epoch is a training pass over ``trainLoader`` followed by an
    evaluation pass over ``testLoader``. The state dict is written to
    ``modelSavePath`` once, after the last epoch.
    """
    sgd = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    for epochIndex in range(nepoch):
        trainOneEpoch(epochIndex, model, trainLoader, sgd, opt)
        testModel(epochIndex, model, testLoader, opt)
    weights = model.state_dict()
    torch.save(weights, modelSavePath)

def eval(modelSavePath, isLoad = True):
    """Evaluate the notebook's global ``model`` on ``testLoader``.

    Args:
        modelSavePath: File holding a state dict written by ``torch.save``.
        isLoad: When True, load the weights from ``modelSavePath`` first;
            when False, evaluate the model as it currently is.

    NOTE(review): this name shadows Python's built-in ``eval`` for the rest
    of the notebook; it is kept for compatibility with existing callers.
    """
    if isLoad:
        # map_location lets a checkpoint saved from a GPU run load on a
        # CPU-only machine instead of failing while deserialising CUDA tensors.
        device = torch.device('cuda' if opt.gpu else 'cpu')
        model.load_state_dict(torch.load(modelSavePath, map_location=device))
    testModel(0, model, testLoader, opt)

# Fine-tune for 10 epochs and store the resulting weights.
train(nepoch=10, modelSavePath='EmotionAnalyzeModelData.model')

In [None]:
# The 1 is the dimension to reduce over; unpacking with * yields two values:
# the maximum values and their indices.
print(*torch.tensor([[5, 33, 2, 65, 4]]).max(dim=1))
