In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from datasets import Dataset,load_metric
from transformers import BertTokenizerFast, AlbertForSequenceClassification, TrainingArguments, Trainer, AutoModel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
import os
import wandb

In [2]:
wandb.login() 

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
wandb: Currently logged in as: monicachen0331. Use `wandb login --relogin` to force relogin


True

In [3]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
model = AlbertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased', num_labels=13)

You are using a model of type bert to instantiate a model of type albert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing AlbertForSequenceClassification: ['cls.seq_relationship.weight', 'bert.encoder.layer.2.output.LayerNorm.weight', 'bert.encoder.layer.8.output.LayerNorm.weight', 'bert.encoder.layer.5.attention.self.value.bias', 'bert.encoder.layer.8.attention.self.value.weight', 'bert.encoder.layer.10.attention.self.key.weight', 'bert.encoder.layer.4.output.LayerNorm.weight', 'bert.encoder.layer.11.attention.output.LayerNorm.bias', 'bert.encoder.layer.3.attention.self.key.bias', 'bert.encoder.layer.5.attention.self.key.weight', 'bert.encoder.layer.4.attention.self.value.weight', 'cls.predictions.bias', 'bert.encoder.layer.9.attention.self.value.weight', 'bert.encoder.layer.7.output.dense.bias', 'bert.encoder.layer.9.attention.self.key.weight', 'bert.enc

Data Preprocessing

In [4]:
dataset = pd.read_csv('Bert_data.csv')
print(dataset.shape[0])

28289


In [5]:
#dropna
dataset = pd.DataFrame({
    'tag' : dataset.tag,
    'text' : dataset.text
})
dataset = dataset.dropna()
print(dataset.shape[0])

28238


In [6]:
# small sample
# import random
# dataset = dataset.sample(n=10000, random_state=1)
# print(dataset.shape[0])

In [7]:
#tag代換成類別代碼
dataset['tag'].replace({'Politics':"0", 'Finance':"1", 'Sports':"2", 'Health':"3", 'Travel':"4", 'Life':"5", 'Video':"6", 
                           'World':"7", 'Beauty':"8", 'Art':"9", 'Shopping':"10", 'Food':"11", 'Education':"12"}, inplace=True)

#按照類別重新排列
dataset = dataset.sort_values(by='tag', key=pd.to_numeric)

In [8]:
#將總資料集切成train&test(2:8)
train_data, test_data = train_test_split(dataset, random_state=777, train_size=0.2)
print('train : test =', train_data.shape[0], ':', test_data.shape[0])

train : test = 5647 : 22591


In [9]:
#為解決不明格式問體，先輸出再讀入
train_data.to_csv('train.csv',index=False,encoding='utf-8-sig')
test_data.to_csv('test.csv',index=False,encoding='utf-8-sig')

In [10]:
#重新讀入資料
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [11]:
#將train切成train&valid(8:2)
train_data, eval_data = train_test_split(train_data, random_state=777, train_size=0.8) 

print('train : valid =', train_data.shape[0], ':', eval_data.shape[0])

train : valid = 4517 : 1130


In [12]:
train_data, eval_data = Dataset.from_pandas(train_data), Dataset.from_dict(eval_data)
# test_data = Dataset.from_pandas(test_data)

In [13]:
def tokenize( data ):
    return tokenizer( data['text'], truncation=True, padding="max_length" )

In [14]:
# 對每筆資料進行分詞處理
train_data = train_data.map( tokenize, batched=True )
eval_data = eval_data.map( tokenize, batched=True )

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [15]:
# 將預測目標設為 labels
train_data = train_data.map( lambda data : {'labels' : data['tag'] }, batched=True )
eval_data = eval_data.map( lambda data : {'labels' : data['tag'] }, batched=True )

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [16]:
# 將資料集格式化為 torch 格式，以訓練 pyTorch 模型
columns = [ 'input_ids', 'token_type_ids', 'attention_mask', 'labels' ]
train_data.set_format( type = 'torch', columns = columns )
eval_data.set_format( type = 'torch', columns = columns )

Define evaluate baseline

In [17]:
# 使用 F1 指標
metric = load_metric("f1")
# 定義評價指標
def compute_metrics( eval_pred ):
    predictions, labels = eval_pred
    return metric.compute(predictions=np.argmax(predictions, axis=1), references=labels, average="micro")

  metric = load_metric("f1")


Build model

In [18]:
# 定義訓練參數，預設使用AdamW最佳化器
args = TrainingArguments (
    output_dir= "./bert_mullan_final",        # 輸出路徑
    overwrite_output_dir=True,
    evaluation_strategy = "epoch",      # 每輪結束後評價
    learning_rate = 3e-5,               # 定義初始學習率
    per_device_train_batch_size=1, 
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=64,
    num_train_epochs=20,                 # 定義訓練幾輪
    report_to='wandb',
)

os.environ["WANDB_PROJECT"] = "<Bert Text Classification>"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

In [19]:
# 定義訓練器， 指定模型、參數、訓練資料集、評價資料集、分詞器、評價指標
trainer = Trainer(
    model,
    args,
    train_dataset = train_data,
    eval_dataset = eval_data,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
)

Train & Evaluate

In [20]:
#開始訓練
trainer.train()

The following columns in the training set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag, __index_level_0__. If text, tag, __index_level_0__ are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 4517
  Num Epochs = 20
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 64
  Total optimization steps = 1400
  Number of trainable parameters = 21406093
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/1400 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.5661449432373047, 'eval_f1': 0.07964601769911504, 'eval_runtime': 45.446, 'eval_samples_per_second': 24.865, 'eval_steps_per_second': 24.865, 'epoch': 0.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.577064275741577, 'eval_f1': 0.09646017699115045, 'eval_runtime': 45.3751, 'eval_samples_per_second': 24.904, 'eval_steps_per_second': 24.904, 'epoch': 1.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.5540921688079834, 'eval_f1': 0.0920353982300885, 'eval_runtime': 45.9596, 'eval_samples_per_second': 24.587, 'eval_steps_per_second': 24.587, 'epoch': 2.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.9820148944854736, 'eval_f1': 0.06371681415929203, 'eval_runtime': 45.7789, 'eval_samples_per_second': 24.684, 'eval_steps_per_second': 24.684, 'epoch': 3.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.0106027126312256, 'eval_f1': 0.268141592920354, 'eval_runtime': 46.2502, 'eval_samples_per_second': 24.432, 'eval_steps_per_second': 24.432, 'epoch': 4.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 2.0748648643493652, 'eval_f1': 0.26548672566371684, 'eval_runtime': 46.6342, 'eval_samples_per_second': 24.231, 'eval_steps_per_second': 24.231, 'epoch': 5.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.7047635316848755, 'eval_f1': 0.3867256637168142, 'eval_runtime': 46.3863, 'eval_samples_per_second': 24.361, 'eval_steps_per_second': 24.361, 'epoch': 6.99}
{'loss': 2.2848, 'learning_rate': 1.928571428571429e-05, 'epoch': 7.14}


Saving model checkpoint to ./bert_mullan_final\checkpoint-500
Configuration saved in ./bert_mullan_final\checkpoint-500\config.json
Model weights saved in ./bert_mullan_final\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./bert_mullan_final\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./bert_mullan_final\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.6006624698638916, 'eval_f1': 0.44424778761061945, 'eval_runtime': 45.6742, 'eval_samples_per_second': 24.74, 'eval_steps_per_second': 24.74, 'epoch': 7.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.4776300191879272, 'eval_f1': 0.5274336283185841, 'eval_runtime': 45.9892, 'eval_samples_per_second': 24.571, 'eval_steps_per_second': 24.571, 'epoch': 8.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.408602237701416, 'eval_f1': 0.5283185840707965, 'eval_runtime': 45.6748, 'eval_samples_per_second': 24.74, 'eval_steps_per_second': 24.74, 'epoch': 9.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.3531742095947266, 'eval_f1': 0.5424778761061947, 'eval_runtime': 45.8534, 'eval_samples_per_second': 24.644, 'eval_steps_per_second': 24.644, 'epoch': 10.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.2189048528671265, 'eval_f1': 0.620353982300885, 'eval_runtime': 46.2627, 'eval_samples_per_second': 24.426, 'eval_steps_per_second': 24.426, 'epoch': 11.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.1083253622055054, 'eval_f1': 0.6646017699115044, 'eval_runtime': 45.6815, 'eval_samples_per_second': 24.736, 'eval_steps_per_second': 24.736, 'epoch': 12.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.0566505193710327, 'eval_f1': 0.6707964601769911, 'eval_runtime': 45.9058, 'eval_samples_per_second': 24.616, 'eval_steps_per_second': 24.616, 'epoch': 13.99}
{'loss': 1.1874, 'learning_rate': 8.571428571428571e-06, 'epoch': 14.28}


Saving model checkpoint to ./bert_mullan_final\checkpoint-1000
Configuration saved in ./bert_mullan_final\checkpoint-1000\config.json
Model weights saved in ./bert_mullan_final\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./bert_mullan_final\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./bert_mullan_final\checkpoint-1000\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.102202296257019, 'eval_f1': 0.6389380530973451, 'eval_runtime': 45.5354, 'eval_samples_per_second': 24.816, 'eval_steps_per_second': 24.816, 'epoch': 14.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.0483438968658447, 'eval_f1': 0.6548672566371682, 'eval_runtime': 45.5033, 'eval_samples_per_second': 24.833, 'eval_steps_per_second': 24.833, 'epoch': 15.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 1.0365251302719116, 'eval_f1': 0.6734513274336283, 'eval_runtime': 47.3309, 'eval_samples_per_second': 23.874, 'eval_steps_per_second': 23.874, 'epoch': 16.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 0.9983851313591003, 'eval_f1': 0.6867256637168142, 'eval_runtime': 46.1984, 'eval_samples_per_second': 24.46, 'eval_steps_per_second': 24.46, 'epoch': 17.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 0.9672853946685791, 'eval_f1': 0.7070796460176991, 'eval_runtime': 45.1878, 'eval_samples_per_second': 25.007, 'eval_steps_per_second': 25.007, 'epoch': 18.99}


The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

{'eval_loss': 0.9847228527069092, 'eval_f1': 0.6911504424778762, 'eval_runtime': 45.3676, 'eval_samples_per_second': 24.908, 'eval_steps_per_second': 24.908, 'epoch': 19.99}




Training completed. Do not forget to share your model on huggingface.co/models =)




{'train_runtime': 11710.9409, 'train_samples_per_second': 7.714, 'train_steps_per_second': 0.12, 'train_loss': 1.438581324986049, 'epoch': 19.99}


TrainOutput(global_step=1400, training_loss=1.438581324986049, metrics={'train_runtime': 11710.9409, 'train_samples_per_second': 7.714, 'train_steps_per_second': 0.12, 'train_loss': 1.438581324986049, 'epoch': 19.99})

In [21]:
trainer.save_model()

Saving model checkpoint to ./bert_mullan_final
Configuration saved in ./bert_mullan_final\config.json
Model weights saved in ./bert_mullan_final\pytorch_model.bin
tokenizer config file saved in ./bert_mullan_final\tokenizer_config.json
Special tokens file saved in ./bert_mullan_final\special_tokens_map.json


In [22]:
trainer.evaluate()
wandb.finish()

The following columns in the evaluation set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1130
  Batch size = 1


  0%|          | 0/1130 [00:00<?, ?it/s]

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/f1,▁▁▁▁▃▃▅▅▆▆▆▇██▇▇█████
eval/loss,▇▇▇█▅▅▄▃▃▃▂▂▁▁▁▁▁▁▁▁▁
eval/runtime,▂▂▄▃▅▆▅▃▄▃▃▅▃▄▃▂█▅▁▂▁
eval/samples_per_second,▇▇▅▆▄▃▄▆▅▆▅▄▆▅▆▇▁▄█▇█
eval/steps_per_second,▇▇▅▆▄▃▄▆▅▆▅▄▆▅▆▇▁▄█▇█
train/epoch,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████
train/global_step,▁▁▂▂▂▃▃▃▄▄▄▅▅▅▆▆▆▇▇▇████
train/learning_rate,█▁
train/loss,█▁
train/total_flos,▁

0,1
eval/f1,0.69115
eval/loss,0.98472
eval/runtime,45.0349
eval/samples_per_second,25.092
eval/steps_per_second,25.092
train/epoch,19.99
train/global_step,1400.0
train/learning_rate,1e-05
train/loss,1.1874
train/total_flos,2160416901325824.0


Predict model

In [23]:
from transformers import pipeline,BertForSequenceClassification
from transformers import BertTokenizer
from tqdm import tqdm
import pandas as pd
tqdm.pandas()
import torch
import torch.nn.functional as F

In [24]:
# define all label
news_labels = ['Politics','Finance','Sports','Health','Travel','Life','Video','World','Beauty','Art','Shopping','Food','Education']
# set label reference
id2label = {idx:label for idx, label in enumerate(news_labels)}
label2id = {label:idx for idx, label in enumerate(news_labels)}
# load model
model = AutoModelForSequenceClassification.from_pretrained('./bert_mullan_final', num_labels=13, label2id=label2id, id2label=id2label)
tokenizer = BertTokenizer.from_pretrained("./bert_mullan_final")

loading configuration file ./bert_mullan_final\config.json
Model config AlbertConfig {
  "_name_or_path": "./bert_mullan_final",
  "architectures": [
    "AlbertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 2,
  "classifier_dropout_prob": 0.1,
  "directionality": "bidi",
  "embedding_size": 128,
  "eos_token_id": 3,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Politics",
    "1": "Finance",
    "2": "Sports",
    "3": "Health",
    "4": "Travel",
    "5": "Life",
    "6": "Video",
    "7": "World",
    "8": "Beauty",
    "9": "Art",
    "10": "Shopping",
    "11": "Food",
    "12": "Education"
  },
  "initializer_range": 0.02,
  "inner_group_num": 1,
  "intermediate_size": 3072,
  "label2id": {
    "Art": 9,
    "Beauty": 8,
    "Education": 12,
    "Finance": 1,
    "Food": 11,
    "Health": 3,
    "Life": 5,
    "Politics": 0,
    "Shopping": 10,
    "Sports": 2,
    "Travel": 4,
    "

loading weights file ./bert_mullan_final\pytorch_model.bin
All model checkpoint weights were used when initializing AlbertForSequenceClassification.

All the weights of AlbertForSequenceClassification were initialized from the model checkpoint at ./bert_mullan_final.
If your task is similar to the task the model of the checkpoint was trained on, you can already use AlbertForSequenceClassification for predictions without further training.
loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


In [25]:
test_data = pd.read_csv('test.csv')
test_data = test_data.dropna()
test_data.head()

Unnamed: 0,tag,text
0,9,文：莊冠群李安的電影《比利林恩的中場戰事》，改編自班方登（Ben Fountain）的同名小...
1,9,What would Mark Rothko make of the world today...
2,11,['台灣的美式炭烤餐廳不知凡幾？但真正道地的有幾間？就在板橋車水馬龍的中山路二段 77 號 ...
3,9,文：賴文堅1993年的寒假，一群仍顯稚氣的大學生，半夜裡從台北車站搭著往台東火車，凜冽空氣中...
4,0,最近又有親共的退役將領發表狂言，前國防管理學院院長，立法委員帥化民中將聲稱F-16是美國空軍...


In [26]:
test_data = Dataset.from_pandas(test_data)
test_data = test_data.map( tokenize, batched=True )
test_data = test_data.map( lambda data : {'labels' : data['tag'] }, batched=True )
test_data.set_format( type = 'torch', columns = columns )

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/23 [00:00<?, ?ba/s]

In [27]:
#predict & softmax for few input of test data
# y_prob = []
# for text in test_data.text:
#     encoding = tokenizer(text, return_tensors="pt")
#     logits = model(**encoding)
    # prob = F.softmax(logits, dim=-1)
    # y_prob.append(prob.detach().numpy()[0])

In [28]:
#predict test data
logits = trainer.predict(test_data)

The following columns in the test set don't have a corresponding argument in `AlbertForSequenceClassification.forward` and have been ignored: text, tag. If text, tag are not expected by `AlbertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 22591
  Batch size = 1


  0%|          | 0/22591 [00:00<?, ?it/s]

In [29]:
#softmax
logits = torch.from_numpy(logits[0])
y_prob = F.softmax(logits, dim=-1)
y_prob = y_prob.numpy()

In [30]:
#show all the probabilities
for i in range(0, 5):
    print(y_prob[i])

[0.03333062 0.00842803 0.11808243 0.00107023 0.017169   0.0067964
 0.07151329 0.0320967  0.00149787 0.1593068  0.00399898 0.01025133
 0.5364584 ]
[0.01988744 0.20229118 0.03788507 0.00199494 0.0188586  0.01127079
 0.08973163 0.03348115 0.00140502 0.52560776 0.03922483 0.00428871
 0.01407292]
[3.4736753e-03 1.4394970e-03 1.7745224e-03 1.5690863e-03 4.6364017e-02
 7.9456932e-04 1.7269204e-03 1.7084248e-03 2.1820672e-02 4.3325061e-03
 2.7238589e-03 9.0881807e-01 3.4541949e-03]
[0.02614006 0.00487297 0.10661872 0.00119004 0.01371314 0.00688748
 0.04840862 0.02471075 0.00154073 0.07364974 0.00328488 0.00919605
 0.6797868 ]
[0.38101515 0.04513416 0.06421798 0.00262486 0.00237971 0.00402636
 0.00680682 0.30228448 0.00207736 0.01287866 0.00156024 0.00533434
 0.16965988]


In [31]:
def predict(y_prob, threshold):
    y_pred = []
    for prob in y_prob:
        temp = []
        prob_list = prob.tolist()
        prob_list.sort(reverse=True)
        sort_index = np.argsort(-prob)
        for i in range(0, 13):
            if prob_list[i] > threshold:
                temp.append(sort_index[i])
            else: temp.append('Nan')
        y_pred.append(temp)
    return(y_pred)

def predict_class(y_pred):
    predict_tag=[]
    for i in range(0, len(y_pred)):
        predict_list = y_pred[i] #a list
        temp = []
        for j in range(0, 3): #收錄分數最高的前三名
            temp.append(predict_list[j])
        predict_tag.append(temp)
        i += 1
    return predict_tag

def accuracy(y_true, predict_tag):
    count = 0
    for i in range(0, len(y_true)):
        tag = y_true[i]
        for pre_tag in predict_tag[i]:
            if pre_tag == tag:
                count += 1
        accuracy = count/len(y_true)
    return(accuracy)

In [32]:
#threshold = 0.1
y_pred = predict(y_prob, 0.2)
for i in range(0, 5):
    print(y_pred[i])

[12, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan']
[9, 1, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan']
[11, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan']
[12, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan']
[0, 7, 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan', 'Nan']


In [33]:
#top 3
predict_tag=[]
for i in range(0, len(y_pred)):
    predict_list = y_pred[i] #a list
    temp = []
    for j in range(0, 3): #收錄分數最高的前三名
        temp.append(predict_list[j])
    predict_tag.append(temp)
    i += 1
print(predict_tag)

[[12, 'Nan', 'Nan'], [9, 1, 'Nan'], [11, 'Nan', 'Nan'], [12, 'Nan', 'Nan'], [0, 7, 'Nan'], [4, 'Nan', 'Nan'], [1, 'Nan', 'Nan'], [8, 'Nan', 'Nan'], [5, 3, 10], [2, 'Nan', 'Nan'], [4, 10, 'Nan'], [4, 9, 'Nan'], [9, 'Nan', 'Nan'], [4, 'Nan', 'Nan'], [6, 5, 'Nan'], [11, 'Nan', 'Nan'], [7, 0, 'Nan'], [12, 'Nan', 'Nan'], [4, 8, 'Nan'], [4, 8, 'Nan'], [11, 'Nan', 'Nan'], [6, 'Nan', 'Nan'], [2, 'Nan', 'Nan'], [7, 0, 'Nan'], [3, 5, 'Nan'], [9, 'Nan', 'Nan'], [7, 1, 'Nan'], [1, 'Nan', 'Nan'], [0, 7, 'Nan'], [5, 10, 3], [5, 'Nan', 'Nan'], [7, 2, 'Nan'], [0, 7, 'Nan'], [4, 'Nan', 'Nan'], [2, 5, 'Nan'], [8, 'Nan', 'Nan'], [3, 8, 'Nan'], [5, 2, 'Nan'], [10, 'Nan', 'Nan'], [1, 7, 'Nan'], [6, 9, 'Nan'], [10, 'Nan', 'Nan'], [2, 12, 'Nan'], [2, 5, 'Nan'], [4, 'Nan', 'Nan'], [9, 'Nan', 'Nan'], [3, 'Nan', 'Nan'], [9, 6, 'Nan'], [12, 'Nan', 'Nan'], [10, 'Nan', 'Nan'], [7, 0, 'Nan'], [9, 'Nan', 'Nan'], [4, 11, 'Nan'], [10, 'Nan', 'Nan'], [9, 'Nan', 'Nan'], [12, 'Nan', 'Nan'], [4, 9, 6], [11, 'Nan', 'Nan'],

In [34]:
#多標籤正確率
test_data = pd.read_csv('test.csv')
y_true = test_data.tag
y_pred = predict(y_prob, 0.1)
predict_tag = predict_class(y_pred)
accuracy = accuracy(y_true, predict_tag)
print('accuracy=', accuracy)

accuracy= 0.891328405117082


In [35]:
#Top1(單標籤)正確率
count=0
for i in range(0, len(y_true)):
    tag = y_true[i]
    pre_tag = predict_tag[i]
    if pre_tag[0] == tag:
        count += 1
    accuracy = count/len(y_true)
print(accuracy)

0.7073613385861627


輸出結果

In [36]:
df_predict_tag = pd.DataFrame(predict_tag, columns=["predict1", "predict2", "predict3"])
df_predict_tag.replace({0:'Politics', 1:'Finance', 2:'Sports', 3:'Health', 4:'Travel', 5:'Life', 6:'Video', 
                           7:'World', 8:'Beauty', 9:'Art', 10:'Shopping', 11:'Food', 12:'Education'}, inplace=True)


In [37]:
df_predict_tag

Unnamed: 0,predict1,predict2,predict3
0,Education,Art,Sports
1,Art,Finance,Nan
2,Food,Nan,Nan
3,Education,Sports,Nan
4,Politics,World,Education
...,...,...,...
22586,Art,Video,World
22587,World,Politics,Finance
22588,World,Finance,Politics
22589,Education,Sports,Politics


In [38]:
new_data = pd.DataFrame({
    'text' : test_data.text,
    'tag' : test_data.tag,
    'predict1' : df_predict_tag.predict1,
    'predict2' : df_predict_tag.predict2,
    'predict3' : df_predict_tag.predict3
})

new_data.tag.replace({0:'Politics', 1:'Finance', 2:'Sports', 3:'Health', 4:'Travel', 5:'Life', 6:'Video', 
                           7:'World', 8:'Beauty', 9:'Art', 10:'Shopping', 11:'Food', 12:'Education'}, inplace=True)

In [39]:
new_data.head(30)

Unnamed: 0,text,tag,predict1,predict2,predict3
0,文：莊冠群李安的電影《比利林恩的中場戰事》，改編自班方登（Ben Fountain）的同名小...,Art,Education,Art,Sports
1,What would Mark Rothko make of the world today...,Art,Art,Finance,Nan
2,['台灣的美式炭烤餐廳不知凡幾？但真正道地的有幾間？就在板橋車水馬龍的中山路二段 77 號 ...,Food,Food,Nan,Nan
3,文：賴文堅1993年的寒假，一群仍顯稚氣的大學生，半夜裡從台北車站搭著往台東火車，凜冽空氣中...,Art,Education,Sports,Nan
4,最近又有親共的退役將領發表狂言，前國防管理學院院長，立法委員帥化民中將聲稱F-16是美國空軍...,Politics,Politics,World,Education
5,20 Best Weekend Getaways in New EnglandFrom ch...,Travel,Travel,Food,Nan
6,"Sara Menker, CEO of Gro Intelligence, Warns th...",Finance,Finance,World,Nan
7,Dua Lipa’s Hair Transformation “Wasn’t Anythin...,Beauty,Beauty,Nan,Nan
8,'Made my daily commute safer': This bestsellin...,Shopping,Life,Health,Shopping
9,NBA／最佳五人未挑「字母哥」 里拉德選擇引發揣測本週收穫滿滿，成功衛冕三分球大賽冠軍，並拿...,Sports,Sports,Life,Nan


In [40]:
#將test data分類結果輸出
new_data.to_csv('result/multilan_final.csv',index=False,encoding='utf-8-sig')