In [1]:
from pathlib import Path

import gdown
import pandas as pd

# Google Drive file ID of the dataset (file name suggests preprocessed arXiv CS papers)
file_id = "1jkuOxJ2zl-3bbXPE4UXELKGoisgKC8bx"
url = f"https://drive.google.com/uc?id={file_id}"

# Temporary location of the file on the Google Colab VM
output_path = "/content/processed_arxiv_cs_papers_2.csv"

# Only download when the file is not already present, so re-running this cell
# does not fetch the ~311 MB CSV again.
if not Path(output_path).exists():
    downloaded_path = gdown.download(url, output_path, quiet=False)
    # Some gdown versions return None instead of raising when the download
    # fails; stop here with a clear message rather than in pd.read_csv below.
    if downloaded_path is None:
        raise RuntimeError(f"Failed to download {url} to {output_path}")

# Read the CSV file
raw_df = pd.read_csv(output_path)

# Show the first rows of the data
print(raw_df.head())

Downloading...
From (original): https://drive.google.com/uc?id=1jkuOxJ2zl-3bbXPE4UXELKGoisgKC8bx
From (redirected): https://drive.google.com/uc?id=1jkuOxJ2zl-3bbXPE4UXELKGoisgKC8bx&confirm=t&uuid=88e42ea5-ed62-4c61-83d5-348ca4f63e8d
To: /content/processed_arxiv_cs_papers_2.csv
100%|██████████| 311M/311M [00:03<00:00, 92.4MB/s]


                                                text  cs.AI  cs.CL  cs.CR  \
0  gaussianworld gaussian world model streaming o...      1      0      0   
1  gaussianad gaussiancentric endtoend autonomous...      1      0      0   
2  inverse reinforcement learning estimating expe...      1      0      0   
3  apollo exploration video understanding large m...      1      0      0   
4  library learning neural operator present neura...      1      0      0   

   cs.CV  cs.CY  cs.DC  cs.DM  cs.DS  cs.HC  ...  cs.IT  cs.LG  cs.LO  cs.NA  \
0      1      0      0      0      0      0  ...      0      1      0      0   
1      1      0      0      0      0      0  ...      0      1      0      0   
2      0      0      0      0      0      0  ...      0      1      0      0   
3      1      0      0      0      0      0  ...      0      0      0      0   
4      0      0      0      0      0      0  ...      0      1      0      0   

   cs.NE  cs.NI  cs.RO  cs.SE  cs.SI  cs.SY  
0      0  

In [2]:
# Sanity check: (number of rows, number of columns) of the loaded dataset
raw_df.shape

(314049, 21)

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the saved model is read from a path
# under this mount point in a later cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

In [5]:
# Path where the model was saved (on the mounted Google Drive)
model_save_path = '/content/drive/MyDrive/updated_model_NMKHDL/'

# Reload the saved model. The Hugging Face class is passed via custom_objects,
# presumably so Keras can resolve it while deserializing -- TODO confirm it is required.
model = tf.keras.models.load_model(model_save_path, custom_objects={"TFBertForSequenceClassification": TFBertForSequenceClassification})
print("Model loaded successfully!")

Model loaded successfully!


In [6]:
# Model input: the "text" column.
text = raw_df["text"]

# Targets: every column after the first one, as a 2-D array of label indicators.
labels = raw_df.iloc[:, 1:].to_numpy()

# 70/30 split with a fixed seed.
# NOTE(review): the split must match the one used when the model was trained,
# otherwise "test" rows may have been seen during training -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    text,
    labels,
    test_size=0.3,
    random_state=42,
)

In [7]:
# BERT tokenizer loaded with the SciBERT vocabulary ('allenai/scibert_scivocab_uncased')
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

# Tokenization
def tokenize_data(texts, max_length=100):
    """Tokenize a collection of texts with the notebook-level ``tokenizer``.

    Args:
        texts: Iterable of strings (e.g. a pandas Series). It is materialized
            into a list before being handed to the tokenizer.
        max_length: Maximum sequence length passed to the tokenizer; longer
            inputs are truncated (``truncation=True``). Defaults to 100, the
            value that was previously hard-coded.

    Returns:
        The tokenizer's output for ``texts``, padded (``padding=True``) and
        requested as TensorFlow tensors (``return_tensors='tf'``).
    """
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='tf',
    )

# Convert the text data into tokens.
# Only the test split is tokenized; tokenization of the training split is left disabled:
# train_encodings = tokenize_data(X_train)
test_encodings = tokenize_data(X_test)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

In [8]:
# Evaluate the model on the test split.
# NOTE(review): only 'input_ids' is fed to the model; the tokenizer's attention
# mask is not passed, so padded positions are not masked here. Confirm this
# matches how the model was trained before changing it.
results = model.evaluate(test_encodings['input_ids'], y_test, batch_size=8)
# Assumes results is [loss, accuracy], i.e. the model was compiled with exactly
# one accuracy metric -- TODO confirm which accuracy variant it is.
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

Test Loss: 0.09130296856164932, Test Accuracy: 0.6238921880722046


In [9]:
# Extract the label names from the column headers (every column after the first)
label_names = raw_df.columns[1:]

# Initialize the MultiLabelBinarizer with the label names as its fixed class
# list, then fit it on a single sample that contains every label.
mlb = MultiLabelBinarizer(classes=label_names)
mlb.fit([label_names])  # Fit with all of the labels

In [16]:

# Predict labels for the test set.
# Each stage keeps its own name (logits -> probabilities -> binary labels) so
# the probabilities stay available, e.g. for trying a different threshold.
DECISION_THRESHOLD = 0.5  # a label is assigned when its probability exceeds this value

y_logits = model.predict(test_encodings['input_ids'], batch_size=8).logits
y_prob = tf.nn.sigmoid(y_logits)  # Convert logits to per-label probabilities
y_pred = tf.cast(y_prob > DECISION_THRESHOLD, tf.int32)  # Convert probabilities to binary labels

# Decode the binary predictions back into tuples of label names
predicted_labels = mlb.inverse_transform(y_pred.numpy())
print(predicted_labels[:5])  # Look at the first 5 results

[('cs.CL', 'cs.CV'), ('cs.AI', 'cs.IR', 'cs.LG'), ('cs.DC',), ('cs.CV', 'cs.LG'), ('cs.AI', 'cs.CL')]


In [17]:
from sklearn.metrics import hamming_loss, classification_report

# Convert the predicted-label tensor to a NumPy array once and reuse it below.
y_pred_np = y_pred.numpy()

print('Hamming Loss:', hamming_loss(y_test, y_pred_np))
# zero_division=0 reports undefined precision/recall/F-score values as 0
# (the same values the default "warn" setting produces) without emitting the
# undefined-metric warning that showed up in the recorded output of this cell.
print('Classification Report:\n', classification_report(y_test, y_pred_np, target_names=mlb.classes_, zero_division=0))

Hamming Loss: 0.0356015496470838
Classification Report:
               precision    recall  f1-score   support

       cs.AI       0.67      0.24      0.35     14597
       cs.CL       0.84      0.81      0.82      9178
       cs.CR       0.84      0.84      0.84      7475
       cs.CV       0.81      0.85      0.83      9718
       cs.CY       0.74      0.56      0.63      5727
       cs.DC       0.73      0.78      0.75      5450
       cs.DM       0.84      0.58      0.69      3278
       cs.DS       0.88      0.71      0.79      5845
       cs.HC       0.85      0.62      0.71      5444
       cs.IR       0.89      0.68      0.77      5031
       cs.IT       0.87      0.81      0.84      8047
       cs.LG       0.75      0.65      0.70     20720
       cs.LO       0.91      0.87      0.89      3442
       cs.NA       0.89      0.91      0.90      6803
       cs.NE       0.75      0.76      0.75      4029
       cs.NI       0.78      0.79      0.78      6350
       cs.RO       0.85 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
