In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Location of the AG News CSV files and their column layout. The files are
# read with header=None, so the column names are supplied here.
DATA_DIR = "/kaggle/input/ta-nlp/ag_news_csv"
AG_NEWS_COLUMNS = ["label", "title", "description"]


def _load_ag_news(csv_path):
    """Read one AG News split and add a combined ``text`` column.

    Parameters
    ----------
    csv_path : str
        Path to a header-less CSV with columns label, title, description.

    Returns
    -------
    pandas.DataFrame
        The parsed rows plus ``text`` = title + " " + description.
    """
    frame = pd.read_csv(csv_path, header=None, names=AG_NEWS_COLUMNS)
    # read_csv parses empty / "NA"-like fields as NaN, and str + NaN is NaN.
    # Substitute "" so every row of `text` is a real string, never a float NaN.
    title = frame["title"].fillna("")
    description = frame["description"].fillna("")
    return frame.assign(text=title + " " + description)


train = _load_ag_news(f"{DATA_DIR}/train.csv")
test = _load_ag_news(f"{DATA_DIR}/test.csv")

# Plain Python lists of texts and labels, as handed to CreateModel below.
x_train = train['text'].tolist()
y_train = train['label'].tolist()

x_test = test['text'].tolist()
y_test = test['label'].tolist()

In [3]:
import torch
import torch.nn as nn
import numpy as np
from databits import CreateModel
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# --- Data pipeline (forwarded to CreateModel as `batch` / `seq`) ---
BATCH_SIZE = 32
SEQUENCE_LENGTH = 100

# --- Model shape (forwarded to CreateModel) ---
EMBED_DIM = 512
N_LAYERS = 2
DROPOUT_RATE = 0.1
# Number of distinct label values present in the training labels.
NUM_CLASSES = np.unique(y_train).size

# --- Training (forwarded to model.fit) ---
EPOCHS = 5
# OPTIMIZER and LOSS are the classes themselves, not instances; they are
# passed uninstantiated to model.fit together with LR.
OPTIMIZER = torch.optim.Adam
LR = 0.001
LOSS = nn.CrossEntropyLoss

In [4]:
# Build the databits model wrapper from the raw text/label lists and the
# hyperparameters defined in the configuration cell.
# NOTE(review): the test split is passed in the second (x, y) slot, which the
# loader log reports as "val data", and fit() restores the best state based on
# validation loss. The metrics reported later are therefore computed on the
# same data used for model selection -- consider a separate validation split.
model = CreateModel(
    x_train,
    y_train,
    x_test,
    y_test,
    batch=BATCH_SIZE,
    seq=SEQUENCE_LENGTH,
    embedding_dim=EMBED_DIM,
    n_layers=N_LAYERS,
    dropout_rate=DROPOUT_RATE,
    num_classes=NUM_CLASSES,
)

Loading setup data ...
Loading train data ...
Loading val data ...
Successful load model


In [5]:
# Select the BERT architecture on the wrapper. As the last expression of the
# cell, its return value is displayed (the module repr shown below).
# NOTE(review): the printed repr lists only an embedding, a positional
# encoding and two Linear layers -- confirm how databits applies
# n_layers / dropout_rate, since no matching submodules appear there.
model.BERT() # bert model

BERT(
  (embedding): Embedding(98639, 512)
  (positional_encoding): PositionalEncoding()
  (fc): Linear(in_features=512, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=4, bias=True)
)

In [6]:
# Select the BERT architecture again right before training.
# NOTE(review): presumably this re-creates the network so that re-running the
# cell starts from fresh weights -- confirm against the databits docs.
model.BERT()

# Train for EPOCHS epochs; the optimizer and loss are passed as classes and
# the learning rate separately. The returned value is kept in `history`.
history = model.fit(
    epochs=EPOCHS,
    optimizer=OPTIMIZER,
    lr=LR,
    loss=LOSS,
)

Training: 100%|██████████| 3750/3750 [02:35<00:00, 24.18batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 88.72batch/s]


Epoch 1/5 | Train Loss: 0.5346 | Train Acc: 0.7882 | Val Loss: 0.3113 | Val Acc: 0.9000



Training: 100%|██████████| 3750/3750 [02:37<00:00, 23.82batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 89.05batch/s]


Epoch 2/5 | Train Loss: 0.2739 | Train Acc: 0.9090 | Val Loss: 0.2810 | Val Acc: 0.9058



Training: 100%|██████████| 3750/3750 [02:37<00:00, 23.78batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 88.58batch/s]


Epoch 3/5 | Train Loss: 0.2156 | Train Acc: 0.9270 | Val Loss: 0.2769 | Val Acc: 0.9134



Training: 100%|██████████| 3750/3750 [02:37<00:00, 23.77batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 89.61batch/s]


Epoch 4/5 | Train Loss: 0.1777 | Train Acc: 0.9395 | Val Loss: 0.2774 | Val Acc: 0.9129



Training: 100%|██████████| 3750/3750 [02:36<00:00, 23.96batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 89.22batch/s]

Epoch 5/5 | Train Loss: 0.1481 | Train Acc: 0.9487 | Val Loss: 0.3029 | Val Acc: 0.9068

Restored model to the best state based on validation loss.





In [7]:
# Run the trained model over the validation loader (the progress bar below is
# labelled "Validation") and collect two label sequences: ground truth and
# predictions. Both are fed to the sklearn metrics in the next cell.
# NOTE(review): this loader was built from x_test / y_test -- see CreateModel.
y_true, y_pred = model.eval()

Validation: 100%|██████████| 238/238 [00:02<00:00, 89.67batch/s]


In [8]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Macro averaging: each score is computed per class and then averaged with
# equal weight, so every class counts the same regardless of its support.
precision, recall, f1 = (
    score_fn(y_true, y_pred, average='macro')
    for score_fn in (precision_score, recall_score, f1_score)
)
accuracy = accuracy_score(y_true, y_pred)

# Rows are true classes, columns are predicted classes (sklearn convention).
cm = confusion_matrix(y_true, y_pred)

# Report the scalar scores first, then the confusion matrix.
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Akurasi: {accuracy:.4f}")
print(cm)

Precision: 0.9129
Recall: 0.9097
F1 Score: 0.9101
Akurasi: 0.9097
[[1703   33   64  100]
 [  38 1790   38   34]
 [  57    3 1623  217]
 [  25    5   72 1798]]


In [26]:
# Classify a single raw sentence with the trained model.
text = "Richard Faulds and Stephen Parry are going for gold for Great Britain on day four in Athens."
pred = model.predict(text) # takes the raw string; tokenisation is not done here
# The result prints as a one-element tensor holding an integer class id.
# NOTE(review): whether this id is 0-based or equals the CSV `label` value is
# not visible in this notebook -- confirm before mapping it to a class name.
print(pred) # predicted class as an integer, not a class name

tensor([1], device='cuda:0')
