In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import re
from transformers import AutoTokenizer ,BertForSequenceClassification, AdamW
from sklearn.metrics import classification_report

In [33]:
# read train data
train_data=pd.read_csv("new_train.csv")
# read test data
test_data=pd.read_csv("new_test.csv")

In [34]:
train_data.head()

Unnamed: 0,text,label
0,our deed reason earthquak may ah forgiv us,1
1,forest fire near la rong sask canada,1
2,all resid ask shelter place beg notifi off...,1
3,peopl receiv wildfir evacu order california,1
4,just got sent photo rubi alka smoke wildfi...,1


In [35]:
test_data.head()

Unnamed: 0,id,text
0,0,just happen terribl car crash
1,2,heard earthquak differ citi stay safe everyon
2,3,forest fire spot pond gees flee across st...
3,9,apocalyps light spokan wildfir
4,11,typhoon soudelor kill cha taiwan


In [36]:
train_data=train_data.values.tolist()
test_data=test_data.values.tolist()

### Transform data

In [37]:
class Mydata(torch.utils.data.Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

In [38]:
train, valid = train_test_split(train_data, test_size = 0.24)

In [39]:
train=Mydata(train)
valid=Mydata(valid)

In [40]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [41]:
print("Model max_length: ", tokenizer.model_max_length)

Model max_length:  512


In [42]:
def collate_fn(data: list[tuple[str, int]]):
    texts = []
    labels = []
    for content, label in data:
        texts.append(content)
        labels.append(label)    
    input_ids = tokenizer.batch_encode_plus(texts, padding = True, truncation = True)['input_ids']
    input_ids = torch.tensor(input_ids)
    labels = torch.tensor(labels)
    return input_ids, labels

In [43]:
train_dataloader = torch.utils.data.DataLoader(dataset = train, batch_size = 32, collate_fn = collate_fn, shuffle = True)
valid_dataloader = torch.utils.data.DataLoader(dataset = valid, batch_size = 32, collate_fn = collate_fn)

In [44]:
for input_ids, labels in train_dataloader:
    print(input_ids, labels)
    break

tensor([[  101,  2128,  4168,  ...,     0,     0,     0],
        [  101, 19387,  3602,  ...,     0,     0,     0],
        [  101,  8108,  2377,  ...,     0,     0,     0],
        ...,
        [  101,  9389,  2063,  ...,     0,     0,     0],
        [  101,  5034,  2213,  ...,  8909,   102,     0],
        [  101,  1058,  2419,  ...,  2615,  7186,   102]]) tensor([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0,
        1, 0, 0, 0, 0, 0, 1, 0])


### Model

In [45]:
class MyBertForClassification(torch.nn.Module):
    def __init__(self, num_labels) -> None:
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
    def forward(self, input_ids, labels):
        outputs = self.bert(input_ids=input_ids, labels=labels)
        probs = torch.softmax(outputs['logits'], dim = -1)
        preds = torch.argmax(probs, dim = -1)
        outputs['preds'] = preds
        return outputs

In [46]:
model = MyBertForClassification(2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

## Train

In [47]:
LR = 4e-6
EPOCH = 4
LOG_STEP = 30

In [48]:
optimizer = AdamW(model.parameters(), lr=LR)



In [49]:
print(f"Train Loader Step: {len(train_dataloader)}")
print(f"Valid Loader Step: {len(valid_dataloader)}")

Train Loader Step: 181
Valid Loader Step: 58


In [50]:
for epoch in range(EPOCH):
    running_loss = 0.0
    print(f"Epoch {epoch}:")
    model.train()  # Chuyển sang chế độ huấn luyện
    for i, (input_id, label) in enumerate(train_dataloader):
        input_ids = input_id
        labels = label

        optimizer.zero_grad()

        outputs = model(input_ids, labels)
        loss = outputs['loss']
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % LOG_STEP == LOG_STEP-1:
            print('[Epoch %d, Batch %d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / LOG_STEP))
            running_loss = 0.0

    model.eval()
    valid_loss = 0
    with torch.no_grad():
        for step, (input_ids, labels) in enumerate(valid_dataloader):
            outputs = model(input_ids, labels)
            loss = outputs['loss']
            valid_loss += loss.item()
    print(f"Valid loss: {valid_loss / len(valid_dataloader)}")

# Lưu mô hình
torch.save(model.state_dict(), 'bert_classifier.pth')
# model.load_state_dict(torch.load('bert_classifier.pth'))
# model.eval()

Epoch 0:
[Epoch 1, Batch 30] loss: 0.670
[Epoch 1, Batch 60] loss: 0.587
[Epoch 1, Batch 90] loss: 0.528
[Epoch 1, Batch 120] loss: 0.537
[Epoch 1, Batch 150] loss: 0.510
[Epoch 1, Batch 180] loss: 0.463
Valid loss: 0.48113364084013577
Epoch 1:
[Epoch 2, Batch 30] loss: 0.381
[Epoch 2, Batch 60] loss: 0.423
[Epoch 2, Batch 90] loss: 0.439
[Epoch 2, Batch 120] loss: 0.408
[Epoch 2, Batch 150] loss: 0.417
[Epoch 2, Batch 180] loss: 0.406
Valid loss: 0.42330224889105766
Epoch 2:
[Epoch 3, Batch 30] loss: 0.293
[Epoch 3, Batch 60] loss: 0.315
[Epoch 3, Batch 90] loss: 0.283
[Epoch 3, Batch 120] loss: 0.334
[Epoch 3, Batch 150] loss: 0.360
[Epoch 3, Batch 180] loss: 0.322
Valid loss: 0.44283307934629507
Epoch 3:
[Epoch 4, Batch 30] loss: 0.223
[Epoch 4, Batch 60] loss: 0.244
[Epoch 4, Batch 90] loss: 0.207
[Epoch 4, Batch 120] loss: 0.223
[Epoch 4, Batch 150] loss: 0.223
[Epoch 4, Batch 180] loss: 0.254
Valid loss: 0.5008486488769794


### Test

In [69]:
def collate_fn(data: list[tuple[str, int]]):
    texts = []
    ids = []
    for id, content in data:
        texts.append(content)
        ids.append(id)    
    input_ids = tokenizer.batch_encode_plus(texts, padding = True, truncation = True)['input_ids']
    input_ids = torch.tensor(input_ids)
    ids = torch.tensor(ids)
    return input_ids, ids

In [70]:
inputs,ids=collate_fn(test_data)

In [71]:
test=Mydata(test_data)

In [72]:
test_dataloader = torch.utils.data.DataLoader(dataset = test_data, batch_size = 1, collate_fn = collate_fn)

In [73]:
result_test={
    'id':[],
    'target':[]
}

In [74]:
with torch.no_grad():
    for step, (input_ids, idx) in enumerate(test_dataloader):
        outputs = model(input_ids=input_ids, labels=None)
        preds = outputs['preds']
        span = preds[0].item()
        result_test['id'].append(idx.item())
        result_test['target'].append(span)

In [75]:
df=pd.DataFrame(result_test)

In [76]:
df.to_csv("result1.csv",index=False)