In [63]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,load_metric
from transformers import BertTokenizerFast, AlbertForSequenceClassification, TrainingArguments, Trainer, AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import os
import wandb

In [64]:
wandb.login()

True

In [65]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AlbertForSequenceClassification.from_pretrained('ckiplab/albert-base-chinese', num_labels=13)

loading configuration file config.json from cache at C:\Users\User/.cache\huggingface\hub\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

load

Data Preprocessing

In [66]:
dataset = pd.read_csv('Bert_text_class_1600.csv')
print(dataset.shape[0])

20090


In [67]:
# #先將總資料集切成train&test(8:2)
train_data, test_data = train_test_split(dataset, random_state=777, train_size=0.2)

print('train : test =', train_data.shape[0], ':', test_data.shape[0])

train : test = 4018 : 16072


In [68]:
#train & eval的tag代換成類別代碼
train_data['tag'].replace({'政治':"0", '財經':"1", '運動':"2", '健康':"3", '旅遊':"4", '生活':"5", '娛樂影劇':"6", 
                           '國際':"7", '美容':"8", '藝文':"9", '銷售':"10", '食品':"11", '教育':"12"}, inplace=True)

In [69]:
#按照類別重新排列
train_data = train_data.sort_values('tag', key=pd.to_numeric)

In [70]:
#Cleaning Training data
print(train_data.keys())

train_data = train_data.drop(['h1', 'h2', 'time'], axis=1)

Index(['tag', 'h1', 'h2', 'text', 'time'], dtype='object')


In [71]:
train_data.to_csv('train_1600_cleaned.csv',index=False,encoding='utf-8-sig')

In [72]:
train_data = pd.read_csv('train_1600_cleaned.csv')
train_data.dropna()

Unnamed: 0,tag,text
0,0,食安專案報告明天下午再協商 國民黨團：盼朝野同意23日專案報告記者周志豪／台北報導國民黨團昨...
1,0,基隆人號召罷免！謝國樑指「民進黨操作」 藍主委：林右昌妒忌高滿意度記者陳怡潔／台北報導基隆市...
2,0,北市府小內閣3月改組？蔣萬安曝通盤考量中：每天持續對局處首長做考核記者許皓婷／台北報導台北市...
3,0,向中國低頭就能回到過去？矢板明夫：只有台灣親中派完全罔顧事實[Newtalk新聞] 中國百業...
4,0,為打擊網路霸凌與假新聞，菲律賓國會於3日通過法案，規定所有手機用戶在註冊社群媒體帳號時需提供...
...,...,...
4013,12,文： Steve Wallace提問：我讀了一篇文章，想將該文作者從其他來源引用的概念用於自...
4014,12,文：陳雅萍（人本教育中心主任）沒有人喜歡吼小孩，不過奇怪的是，沒有吼過小孩的，也是沒有人；如...
4015,12,（中央社）113學年度學測國文科的綜合能力測驗，全中教解題團隊認為難度屬「中偏易」，估5標與...
4016,12,文：李明憲教授、顏家棟研究員（國立東華大學台灣安全促進學校研究中心）一連串的學生自殺與自傷事...


In [73]:
#將train切成train&valid(8:2)
train_data, eval_data = train_test_split(train_data, random_state=777, train_size=0.8) 

print('train : valid =', train_data.shape[0], ':', eval_data.shape[0])


train : valid = 3214 : 804


In [74]:
train_data = train_data.dropna()
eval_data = eval_data.dropna()

In [75]:
train_data, eval_data = Dataset.from_pandas(train_data), Dataset.from_dict(eval_data)

In [76]:
def tokenize( data ):
    return tokenizer( data['text'], truncation=True, padding="max_length" )

In [77]:
# 對每筆資料進行分詞處理
train_data = train_data.map( tokenize, batched=True )
eval_data = eval_data.map( tokenize, batched=True )

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [78]:
# 將預測目標設為 labels
train_data = train_data.map( lambda data : {'labels' : data['tag'] }, batched=True )
eval_data = eval_data.map( lambda data : {'labels' : data['tag'] }, batched=True )

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [79]:
# 將資料集格式化為 torch 格式，以訓練 pyTorch 模型
columns = [ 'input_ids', 'token_type_ids', 'attention_mask', 'labels' ]
train_data.set_format( type = 'torch', columns = columns )
eval_data.set_format( type = 'torch', columns = columns )

Define evaluate baseline

In [80]:
# 使用 F1 指標
metric = load_metric("f1")
# 定義評價指標
def compute_metrics( eval_pred ):
    predictions, labels = eval_pred
    return metric.compute(predictions=np.argmax(predictions, axis=1), references=labels, average="micro")

Build model

In [81]:
# 定義訓練參數，預設使用AdamW最佳化器
args = TrainingArguments (
    output_dir= "./title_bert_test",        # 輸出路徑
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",      # 每輪結束後評價
    learning_rate = 2e-5,               # 定義初始學習率
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,
    num_train_epochs=12,                 # 定義訓練幾輪
    report_to="wandb",
)

os.environ["WANDB_PROJECT"] = "<Bert Text Classification>"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

PyTorch: setting up devices


In [82]:
# 定義訓練器， 指定模型、參數、訓練資料集、評價資料集、分詞器、評價指標
trainer = Trainer(
    model,
    args,
    train_dataset = train_data,
    eval_dataset = eval_data,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

Train & Evaluate

In [83]:
#開始訓練
trainer.train()
wandb.finish()

The following columns in the training set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, __index_level_0__, tag. If text, __index_level_0__, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3212
  Num Epochs = 12
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 64
  Total optimization steps = 600
  Number of trainable parameters = 10557965
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011111111111111112, max=1.0…

  0%|          | 0/600 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 1.9091088771820068, 'eval_f1': 0.6119402985074627, 'eval_runtime': 32.6824, 'eval_samples_per_second': 24.6, 'eval_steps_per_second': 24.6, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 1.2353811264038086, 'eval_f1': 0.7512437810945274, 'eval_runtime': 38.7129, 'eval_samples_per_second': 20.768, 'eval_steps_per_second': 20.768, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.9154615998268127, 'eval_f1': 0.8109452736318408, 'eval_runtime': 31.6851, 'eval_samples_per_second': 25.375, 'eval_steps_per_second': 25.375, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.7706443071365356, 'eval_f1': 0.8271144278606966, 'eval_runtime': 32.0836, 'eval_samples_per_second': 25.06, 'eval_steps_per_second': 25.06, 'epoch': 4.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.709685742855072, 'eval_f1': 0.8296019900497511, 'eval_runtime': 31.9756, 'eval_samples_per_second': 25.144, 'eval_steps_per_second': 25.144, 'epoch': 5.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.6469682455062866, 'eval_f1': 0.8507462686567164, 'eval_runtime': 31.3582, 'eval_samples_per_second': 25.639, 'eval_steps_per_second': 25.639, 'epoch': 6.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.6210914850234985, 'eval_f1': 0.8495024875621892, 'eval_runtime': 31.4435, 'eval_samples_per_second': 25.57, 'eval_steps_per_second': 25.57, 'epoch': 7.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.5985808372497559, 'eval_f1': 0.8532338308457711, 'eval_runtime': 32.4973, 'eval_samples_per_second': 24.741, 'eval_steps_per_second': 24.741, 'epoch': 8.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.5857416391372681, 'eval_f1': 0.8569651741293532, 'eval_runtime': 40.0801, 'eval_samples_per_second': 20.06, 'eval_steps_per_second': 20.06, 'epoch': 9.0}
{'loss': 0.8663, 'learning_rate': 3.3333333333333333e-06, 'epoch': 10.0}


Saving model checkpoint to ./title_bert_test\checkpoint-500
Configuration saved in ./title_bert_test\checkpoint-500\config.json
Model weights saved in ./title_bert_test\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./title_bert_test\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./title_bert_test\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.5834717154502869, 'eval_f1': 0.8495024875621892, 'eval_runtime': 34.1929, 'eval_samples_per_second': 23.514, 'eval_steps_per_second': 23.514, 'epoch': 10.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.5731849074363708, 'eval_f1': 0.8569651741293532, 'eval_runtime': 34.473, 'eval_samples_per_second': 23.323, 'eval_steps_per_second': 23.323, 'epoch': 11.0}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

{'eval_loss': 0.5700264573097229, 'eval_f1': 0.8594527363184078, 'eval_runtime': 31.2828, 'eval_samples_per_second': 25.701, 'eval_steps_per_second': 25.701, 'epoch': 12.0}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 5078.4828, 'train_samples_per_second': 7.59, 'train_steps_per_second': 0.118, 'train_loss': 0.777147782643636, 'epoch': 12.0}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▅▇▇▇███████
eval/loss,█▄▃▂▂▁▁▁▁▁▁▁
eval/runtime,▂▇▁▂▂▁▁▂█▃▄▁
eval/samples_per_second,▇▂█▇▇██▇▁▅▅█
eval/steps_per_second,▇▂█▇▇██▇▁▅▅█
train/epoch,▁▂▂▃▄▄▅▅▆▇▇▇██
train/global_step,▁▂▂▃▄▄▅▅▆▇▇▇██
train/learning_rate,▁
train/loss,▁
train/total_flos,▁

0,1
eval/f1,0.85945
eval/loss,0.57003
eval/runtime,31.2828
eval/samples_per_second,25.701
eval/steps_per_second,25.701
train/epoch,12.0
train/global_step,600.0
train/learning_rate,0.0
train/loss,0.8663
train/total_flos,921842951417856.0


In [84]:
trainer.save_model()

Saving model checkpoint to ./title_bert_test
Configuration saved in ./title_bert_test\config.json
Model weights saved in ./title_bert_test\pytorch_model.bin
tokenizer config file saved in ./title_bert_test\tokenizer_config.json
Special tokens file saved in ./title_bert_test\special_tokens_map.json


In [85]:
trainer.evaluate()

The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 804
  Batch size = 1


  0%|          | 0/804 [00:00<?, ?it/s]

Error: You must call wandb.init() before wandb.log()

Output & Test

In [86]:
from transformers import pipeline,BertForSequenceClassification
# define all label
news_labels = ['政治','財經','運動','健康','旅遊','生活','娛樂影劇','國際','美容','藝文','銷售','食品','教育']
# set label reference
id2label = {idx:label for idx, label in enumerate(news_labels)}
label2id = {label:idx for idx, label in enumerate(news_labels)}
# load model
model = AutoModelForSequenceClassification.from_pretrained('./title_bert_test', num_labels=13, label2id=label2id, id2label=id2label)
# init pipline
pipe = pipeline('text-classification', model=model, tokenizer='bert-base-chinese', device='cuda:0')

loading configuration file ./title_bert_test\config.json
Model config AlbertConfig {
  "_name_or_path": "./title_bert_test",
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0,
  "bos_token_id": 101,
  "classifier_dropout_prob": 0.1,
  "down_scale_factor": 1,
  "embedding_size": 128,
  "eos_token_id": 102,
  "gap_size": 0,
  "hidden_act": "relu",
  "hidden_dropout_prob": 0,
  "hidden_size": 768,
  "id2label": {
    "0": "\u653f\u6cbb",
    "1": "\u8ca1\u7d93",
    "2": "\u904b\u52d5",
    "3": "\u5065\u5eb7",
    "4": "\u65c5\u904a",
    "5": "\u751f\u6d3b",
    "6": "\u5a1b\u6a02\u5f71\u5287",
    "7": "\u570b\u969b",
    "8": "\u7f8e\u5bb9",
    "9": "\u85dd\u6587",
    "10": "\u92b7\u552e",
    "11": "\u98df\u54c1",
    "12": "\u6559\u80b2"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "\u5065\u5eb7": 3,
    "\u570b\u969b": 7,
    "\u5a1b\u6a02\u5f71\u5287": 6,
    "\u653f\u

loading weights file ./title_bert_test\pytorch_model.bin
All model checkpoint weights were used when initializing AlbertForSequenceClassification.

All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at ./title_bert_test.
If your task is similar to the task the model of the checkpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
loading configuration file config.json from cache at C:\Users\User/.cache\huggingface\hub\models--bert-base-chinese\snapshots\c30a6ed22ab4564dc1e3b2ecbf6e766b0611a33f\config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_posi

In [87]:
from tqdm import tqdm
import pandas as pd
tqdm.pandas()

In [88]:
def classfic( title ):
    result = pipe(str(title), max_length=510)
    return result[0]['label']

In [89]:
test_data = pd.read_csv('text/分類_test_result.csv')
len(test_data)
test_data.keys()

Index(['h1', 'h2', 'text', 'tag', 'predicted_tag'], dtype='object')

In [90]:
new_data = pd.DataFrame({
    'h1' : test_data.h1,
    'h2' : test_data.h2,
    'text' : test_data['text'],
    'tag' : test_data.tag,
    'predicted_tag' : ''
})

In [91]:
new_data['predicted_tag'] = new_data.text.progress_apply(classfic)




  0%|          | 0/4018 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



  0%|          | 2/4018 [00:00<08:14,  8.12it/s]


  0%|          | 4/4018 [00:00<05:26, 12.29it/s]


  0%|          | 7/4018 [00:00<04:07, 16.24it/s]





  0%|          | 13/4018 [00:00<03:18, 20.13it/s]


  0%|          | 16/4018 [00:00<03:24, 19.58it/s]


  0%|          | 19/4018 [00:01<03:11, 20.89it/s]


  1%|          | 22/4018 [00:01<03:06, 21.45it/s]


  1%|          | 25/4018 [00:01<03:02, 21.85it/s]


  1%|          | 28/4018 [00:01<02:55, 22.69it/s]


  1%|          | 31/4018 [00:01<02:59, 22.15it/s]


  1%|          | 34/4018 [00:01<03:00, 22.05it/s]


  1%|    

In [93]:
new_data.head()

Unnamed: 0,h1,h2,text,tag,predicted_tag
0,《傳產》年度平價宅釋出 恐掀搶標風潮,其他人也在看熱門新聞編輯精選,《傳產》年度平價宅釋出 恐掀搶標風潮【時報記者王逸芯台北報導】台灣金聯今天公布113年度的平...,財經,財經
1,NBA》恩比德調侃灌籃大賽：這種分數我也能摘冠,其他人也在看熱門新聞編輯精選,NBA》恩比德調侃灌籃大賽：這種分數我也能摘冠The defending #ATTSlamD...,運動,運動
2,半導體人才戰：劉德音示警，大學理工科學生三大危機，2030年台灣科技霸業能否延續？,,文：劉煥彥台灣半導體霸業眾所周知，在IC製造及IC封裝都是全球第一，IC設計則是全球第二位，...,教育,財經
3,高美館、衛武營首次跨界合作 《老鷹的眼淚—劉冠妏個展》12/30展演,,爆高美館、衛武營首次跨界合作 《老鷹的眼淚—劉冠妏個展》12/30展演劉冠妏出身藝術世家，...,藝文,藝文
4,《農產品》巴西天氣不利 CBOT農產品全面上漲,其他人也在看熱門新聞編輯精選,《農產品》巴西天氣不利 CBOT農產品全面上漲MoneyDJ新聞 2024-02-21 06...,國際,財經


In [94]:
#將test data分類結果輸出
new_data.to_csv('text/分類_test_result2.csv',index=False,encoding='utf-8-sig')

In [95]:
#calculate test accuracy
import csv
i=0
count=0
csv_w = open('text/missort.csv', "w", encoding='utf-8-sig', newline='')    # 開啟csv
writer = csv.writer(csv_w)
writer.writerow(['text', 'tag', 'predicted_tag'])
for tag in new_data.tag:
    if tag == new_data.predicted_tag[i]:
        count+=1
    else: 
        writer.writerow([new_data.text[i], new_data.tag[i], new_data.predicted_tag[i]])
    i+=1
print("Test Accuracy = ", count/len(new_data))

Test Accuracy =  0.8464410154305625


----用模型直接分類----

In [None]:
# new_data = pd.read_csv('h2/result_target-4.csv',encoding='utf-8-sig')

In [None]:
# news_labels = ['娛樂','國際','政治','社會','財經','生活','運動','科技','健康']
# for label in news_labels:
#     topic_data = new_data[ new_data['topic'] == label ]
#     topic_data.to_csv(f'topic/h2/{label}.csv',index=False,encoding='utf-8-sig')

In [None]:
# new_data