In [None]:
import json
import numpy as np
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from tqdm import tqdm

In [None]:
# Data preprocessing: load the pretrained tokenizer that matches the
# 'bert-base-chinese' checkpoint.
tokenizer_checkpoint = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(tokenizer_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
# Load the preprocessed tokenizer output, which is stored on disk as six
# numbered parts per array (input_ids_part1.npy ... input_ids_part6.npy, and
# the same for attention_masks).
part_numbers = range(1, 7)
input_ids_files = [f'input_ids_part{n}.npy' for n in part_numbers]
attention_masks_files = [f'attention_masks_part{n}.npy' for n in part_numbers]

# Open every part read-only and memory-mapped.
input_ids_list = [np.load(path, mmap_mode='r') for path in input_ids_files]
attention_masks_list = [np.load(path, mmap_mode='r') for path in attention_masks_files]

# Stack the parts along the first axis, in part order, into a single array each.
input_ids = np.concatenate(input_ids_list, axis=0)
attention_masks = np.concatenate(attention_masks_list, axis=0)

In [None]:
# Training data: read the JSON-lines file and keep one (claim, label) pair
# per line, in file order.
with open('public_train.jsonl', 'r', encoding='utf-8') as f:
    records = (json.loads(raw_line) for raw_line in f)
    train_data = [(record['claim'], record['label']) for record in records]

In [None]:
# Split the (claim, label) pairs into the claim texts and an integer label array.
# Label strings are mapped to class ids; an unknown label raises KeyError.
label_map = {'supports': 0, 'refutes': 1, 'NOT ENOUGH INFO': 2}

combined_data = [pair[0] for pair in train_data]
labels = np.array([label_map[pair[1]] for pair in train_data])

In [None]:
# Trim input_ids, attention_masks and combined_data so that they contain
# exactly one row per label.
max_length = 128  # maximum sequence length (in tokens) kept per example

num_examples = len(labels)

# Slicing can only drop surplus rows. If the preprocessed arrays have fewer
# rows than there are labels, the data are misaligned; fail here with a clear
# message instead of letting model.fit fail later on mismatched lengths.
if len(input_ids) < num_examples or len(attention_masks) < num_examples:
    raise ValueError(
        'Preprocessed arrays have fewer rows than labels: '
        f'{len(input_ids)} input_ids, {len(attention_masks)} attention_masks, '
        f'{num_examples} labels'
    )

input_ids = input_ids[:num_examples]
attention_masks = attention_masks[:num_examples]
combined_data = combined_data[:num_examples]

In [None]:
# Keep only the first max_length columns (token positions) of both arrays.
input_ids, attention_masks = (
    array[:, :max_length] for array in (input_ids, attention_masks)
)

In [None]:
# Build the model: pretrained 'bert-base-chinese' with a 3-class
# sequence-classification head.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=3,
)

# Compile the model. The loss is configured with from_logits=True and the
# labels are integer class ids, hence the sparse loss and metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

model.compile(loss=loss, optimizer=optimizer, metrics=[metric])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Train the model on the full training set.
# NOTE(review): no validation data is passed to fit(), so overfitting across
# the 30 epochs cannot be observed from this run. Consider holding some out.
train_inputs = dict(input_ids=input_ids, attention_mask=attention_masks)
model.fit(x=train_inputs, y=labels, batch_size=32, epochs=30)

# Save the fine-tuned model to a local directory.
model_dir = 'bert_model'
model.save_pretrained(model_dir)
print('模型儲存完成')

# Load the saved model back from disk; predictions below use this copy.
loaded_model = TFBertForSequenceClassification.from_pretrained(model_dir)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
模型儲存完成


Some layers from the model checkpoint at bert_model were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at bert_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


### 與訓練模型無關，僅用於測試結果

In [None]:
# Test data preprocessing: read (id, claim) pairs from the JSON-lines file,
# one per line, in file order.
test_data = []
with open('public_test.jsonl', 'r', encoding='utf-8') as f:
    for line in tqdm(f):
        entry = json.loads(line)
        test_data.append((entry['id'], entry['claim']))

# Tokenize all claims in one batched call instead of one encode_plus call per
# claim. Sequences are padded/truncated to max_length, the same length the
# training arrays were cut to, rather than 512: this keeps inference inputs
# consistent with training and makes each example 4x smaller.
# With padding='max_length' every row has the same length, so the tokenizer
# returns 2-D NumPy arrays directly and no concatenation is needed.
test_encoded = tokenizer(
    [claim for _, claim in test_data],
    add_special_tokens=True,
    max_length=max_length,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='np',
)

test_input_ids = test_encoded['input_ids']
test_attention_masks = test_encoded['attention_mask']


989it [00:00, 109856.11it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 989/989 [00:00<00:00, 2731.29it/s]


In [None]:
# Run prediction on the tokenized test set with the reloaded model.
test_inputs = {
    'input_ids': test_input_ids,
    'attention_mask': test_attention_masks,
}
predictions = loaded_model.predict(x=test_inputs)



In [None]:
# Post-process the predictions: pair each test example with the index of its
# highest-scoring logit.
predicted_classes = np.argmax(predictions.logits, axis=-1)

# zip() would silently drop rows on a length mismatch, so check explicitly.
if len(predicted_classes) != len(test_data):
    raise ValueError(
        f'Got {len(predicted_classes)} predictions for {len(test_data)} test examples'
    )

# `claim_id` is used instead of `id` so the builtin id() is not shadowed.
output_data = [
    {
        'id': claim_id,
        'claim': claim,
        'prediction': int(predicted_class),  # plain int so json.dumps can serialize it
    }
    for (claim_id, claim), predicted_class in zip(test_data, predicted_classes)
]

In [None]:
# Sanity check: show the raw logits of the last test example.
# Index -1 replaces the hard-coded 988, which was only valid for a test set
# of exactly 989 rows (presumably the intent was "the last row" - confirm).
prediction = predictions.logits[-1]
print(prediction)

[-0.67270195  1.2710423   0.4669395 ]


In [None]:
# Write the predictions as JSON lines: one record per line, with non-ASCII
# characters kept as-is rather than escaped.
output_file = 'predictions.jsonl'
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + '\n' for record in output_data
    )

print('預測結果輸出完成')

預測結果輸出完成
