In [1]:
import joblib
import re
import string
import jieba
import pandas as pd
from tokenizers import (
    models,
    normalizers,
    pre_tokenizers,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm import tqdm
tqdm.pandas()
from transformers import PreTrainedTokenizerFast
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# 请修改测试集文件名
test_dir = 'test.csv'

In [3]:
# 如测试集中不存在topic列，请使用 test = df[['content']]
df = pd.read_csv(test_dir)
test = df[['content', 'topic']]
#test = df[['content']]

In [4]:
def get_stopword():
    with open('baidu_stopwords.txt', 'r', encoding='utf-8') as f:
        stopwords = set(line.strip() for line in f)
    stopwords.update(["年","月","日"])
    return stopwords

def wordopt_cn(text):
    # replace punctuation with space
    no_punct = re.sub(pattern, ' ', text)
    return str(no_punct)

def remove_stopwords(words):
    return (word for word in words if word not in stopwords)

def process_content(content):
    content = wordopt_cn(content)
    words = jieba.cut(content)
    words = remove_stopwords(words)
    content = ' '.join(words)
    content = re.sub(spaces_pattern, ' ', content)
    return content

# Precompile the regular expressions
punctuation = "！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏." + string.punctuation
pattern = re.compile('[%s]' % re.escape(punctuation))
spaces_pattern = re.compile(r'\s+')
stopwords = get_stopword()

In [5]:
test['content'] = test['content'].progress_apply(process_content)

  0%|          | 0/3405 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\shens\AppData\Local\Temp\jieba.cache
Loading model cost 0.349 seconds.
Prefix dict has been built successfully.
100%|██████████| 3405/3405 [00:11<00:00, 290.86it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['content'] = test['content'].progress_apply(process_content)


In [6]:
def dummy(text):
    return text
#test=pd.read_csv('test_jieba.csv')
vectorizer = joblib.load('vectorizer.pkl')
model = joblib.load('ensemble.pkl')

In [7]:
LOWERCASE = False
VOCAB_SIZE = 100_000
raw_tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
raw_tokenizer.normalizer = normalizers.Sequence([normalizers.NFC()] + [normalizers.Lowercase()] if LOWERCASE else [])
raw_tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.BpeTrainer(vocab_size=VOCAB_SIZE, special_tokens=special_tokens)

dataset = Dataset.from_pandas(test[['content']])


def train_corp_iter():
    for i in range(0, len(dataset), 1000):
        yield dataset[i: i + 1000]["content"]


raw_tokenizer.train_from_iterator(train_corp_iter(), trainer=trainer)
tokenizer = PreTrainedTokenizerFast(
    tokenizer_object=raw_tokenizer,
    unk_token="[UNK]",
    pad_token="[PAD]",
    cls_token="[CLS]",
    sep_token="[SEP]",
    mask_token="[MASK]",
)
tokenized_texts_test = []

for text in tqdm(test['content'].tolist()):
    tokenized_texts_test.append(tokenizer.tokenize(text))
    
tf_test = vectorizer.transform(tokenized_texts_test)

100%|██████████| 3405/3405 [00:01<00:00, 1825.90it/s]


In [8]:
pred = model.predict(tf_test)
prob = model.predict_proba(tf_test)
test["pred"] = pred
test["prob"] = prob.max(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["pred"] = pred
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["prob"] = prob.max(axis=1)


In [9]:
accuracy = accuracy_score(test['topic'], test['pred'])
precision = precision_score(test['topic'], test['pred'], average='weighted')
recall = recall_score(test['topic'], test['pred'], average='weighted')
f1 = f1_score(test['topic'], test['pred'], average='weighted')
print(f'Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1: {f1}')

Accuracy: 0.9750367107195301
Precision: 0.9752436834777642
Recall: 0.9750367107195301
F1: 0.9750324678322444


In [ ]:
#test.to_csv('test_result.csv', index=False)