In [1]:

import os
from datetime import datetime

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, TrainingArguments, Trainer

metric = evaluate.load("accuracy")
%load_ext autoreload
%autoreload 2


In [2]:
# Every path is resolved relative to the directory the notebook kernel was started in.
cwd = os.getcwd()
model_dir = os.path.join(cwd, "model")
data_dir = os.path.join(cwd, "dataset", "disaster_tweets")

# The three CSV files all live directly inside data_dir.
train_path, test_path, submission_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [3]:
# Read the labelled training file and the unlabelled test file into DataFrames.
train_df, test_df = map(pd.read_csv, (train_path, test_path))

In [4]:
# Number of whitespace-separated words per text, computed with pandas' vectorised
# string accessor instead of a Python-level lambda; a missing text yields NaN rather
# than raising.
train_df["text_length"] = train_df["text"].str.split().str.len()
test_df["text_length"] = test_df["text"].str.split().str.len()

# Longest text, in words, for each of the two files.
train_df["text_length"].max(), test_df["text_length"].max()

(31, 31)

In [5]:
# Display the training frame to eyeball its columns and a few sample rows.
train_df

Unnamed: 0,id,keyword,location,text,target,text_length
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,7
2,5,,,All residents asked to 'shelter in place' are ...,1,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,16
...,...,...,...,...,...,...
7608,10869,,,Two giant cranes holding a bridge collapse int...,1,11
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1,20
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1,8
7611,10872,,,Police investigating after an e-bike collided ...,1,19


In [6]:
# Keep only the columns the model needs, then carve a 10% validation split out of the
# labelled data. The same fixed seed drives both the shuffle and the split.
# Note: after this cell `train_dataset` is a DatasetDict with "train" and "test" splits.
train_dataset = Dataset.from_pandas(train_df[["text", "target"]])
train_dataset = train_dataset.shuffle(seed=118010142)
train_dataset = train_dataset.train_test_split(test_size=0.1, seed=118010142)

# The unlabelled file only has text to offer.
test_dataset = Dataset.from_pandas(test_df[["text"]])

train_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'target'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['text', 'target'],
        num_rows: 762
    })
})

In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


def tokenize(batch):
    """Encode a batch of texts for BERT.

    Every text in ``batch['text']`` is truncated and padded to exactly 64 tokens.
    When the batch also carries a ``'target'`` column, it is copied into the
    encoding under the key ``'labels'``.

    Returns the tokenizer output, with ``'labels'`` added when targets were present.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=64,
    )
    if 'target' not in batch:
        return encoded
    encoded['labels'] = batch['target']
    return encoded

# Tokenize in batches and drop the original columns, leaving only the tokenizer's
# output (plus 'labels' for the labelled splits).
train_dataset = train_dataset.map(
    tokenize,
    batched=True,
    remove_columns=train_dataset['train'].column_names,
)
test_dataset = test_dataset.map(
    tokenize,
    batched=True,
    remove_columns=test_dataset.column_names,
)

Map:   0%|          | 0/6851 [00:00<?, ? examples/s]

Map:   0%|          | 0/762 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

In [8]:
# Return torch tensors for the model inputs and labels on both splits of the
# DatasetDict (set_format on the dict applies to every split it holds).
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
train_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6851
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 762
    })
})

In [9]:
# Load the pretrained BERT encoder and attach a two-class classification head.
# The head's weights are not part of the checkpoint, which is what the
# "not initialized from the model checkpoint" message below refers to.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [10]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# CPU count reported by the OS.
num_workers = os.cpu_count()

device, num_workers

(device(type='cuda', index=0), 12)

In [11]:
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Turn an evaluation batch into an accuracy score.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is the
    argmax over the last axis of the logits.

    Returns a dict with the single key ``"accuracy"``.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    outcome = metric.compute(predictions=predicted_classes, references=references)
    return {"accuracy": outcome["accuracy"]}

In [12]:
# Evaluate and checkpoint once per epoch so the best checkpoint can be reloaded when
# training ends.
# NOTE(review): no metric_for_best_model is given, so the library default decides which
# checkpoint counts as "best" — confirm that is the intended criterion.
training_args = TrainingArguments(
    output_dir="logs",
    num_train_epochs=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    data_seed=118010142,
)


In [13]:
# Bundle the model, hyper-parameters, data splits and metric callback into one Trainer.
# The "test" split of train_dataset is the held-out validation data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["train"],
    eval_dataset=train_dataset["test"],
    compute_metrics=compute_metrics,
)

In [14]:
# Run fine-tuning; the returned object is kept so its metrics can be inspected afterwards.
train_result = trainer.train()



  0%|          | 0/3428 [00:00<?, ?it/s]

{'loss': 0.5084, 'learning_rate': 4.2707117852975496e-05, 'epoch': 0.58}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.45667776465415955, 'eval_accuracy': 0.8044619422572179, 'eval_runtime': 1.5694, 'eval_samples_per_second': 485.536, 'eval_steps_per_second': 61.17, 'epoch': 1.0}
{'loss': 0.4315, 'learning_rate': 3.541423570595099e-05, 'epoch': 1.17}
{'loss': 0.3665, 'learning_rate': 2.8121353558926487e-05, 'epoch': 1.75}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.4341958463191986, 'eval_accuracy': 0.8188976377952756, 'eval_runtime': 1.5181, 'eval_samples_per_second': 501.941, 'eval_steps_per_second': 63.237, 'epoch': 2.0}
{'loss': 0.3096, 'learning_rate': 2.0828471411901985e-05, 'epoch': 2.33}
{'loss': 0.2588, 'learning_rate': 1.353558926487748e-05, 'epoch': 2.92}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.686164140701294, 'eval_accuracy': 0.8320209973753281, 'eval_runtime': 1.61, 'eval_samples_per_second': 473.286, 'eval_steps_per_second': 59.627, 'epoch': 3.0}
{'loss': 0.2036, 'learning_rate': 6.2427071178529756e-06, 'epoch': 3.5}


  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.8756476044654846, 'eval_accuracy': 0.821522309711286, 'eval_runtime': 1.4971, 'eval_samples_per_second': 508.979, 'eval_steps_per_second': 64.123, 'epoch': 4.0}
{'train_runtime': 272.0972, 'train_samples_per_second': 100.714, 'train_steps_per_second': 12.598, 'train_loss': 0.3218648859333408, 'epoch': 4.0}


In [65]:
# Preview the compact timestamp (YYYYMMDD-HHMMSS) used in the saved model's filename.
f"{datetime.now():%Y%m%d-%H%M%S}"

'20230529-214837'

In [48]:
# Preview the full path the model file would be written to.
# NOTE(review): `modelname` is only assigned in the save cell further down, so this cell
# raises NameError on a fresh kernel unless that cell has already been run — confirm
# whether this scratch cell is still needed.
os.path.join(model_dir, modelname)

'd:\\GitHub\\HF-NLP-Startup\\model\\bert2023-05-29-21:45.pt'

In [66]:
# Timestamped filename (one-second resolution) so successive exports get distinct names.
modelname = "bert" + datetime.now().strftime("%Y%m%d-%H%M%S") + ".pt"

# Nothing earlier in the notebook creates model_dir, and torch.save does not create
# missing parent folders, so make sure it exists first.
os.makedirs(model_dir, exist_ok=True)

# NOTE(review): this pickles the whole module object rather than its state_dict —
# confirm whichever code loads the file expects that format.
torch.save(model, os.path.join(model_dir, modelname))

In [15]:
# Summary of the finished training run: runtime, throughput, training loss and epochs.
train_result.metrics

{'train_runtime': 272.0972,
 'train_samples_per_second': 100.714,
 'train_steps_per_second': 12.598,
 'train_loss': 0.3218648859333408,
 'epoch': 4.0}

In [16]:
# Score the model currently held by the trainer on the split passed as eval_dataset.
trainer.evaluate()

  0%|          | 0/96 [00:00<?, ?it/s]

{'eval_loss': 0.4341958463191986,
 'eval_accuracy': 0.8188976377952756,
 'eval_runtime': 1.6205,
 'eval_samples_per_second': 470.236,
 'eval_steps_per_second': 59.242,
 'epoch': 4.0}

In [17]:
# Run inference on the unlabelled test set; the output's `.predictions` holds one row of
# per-class scores per example, and there are no label_ids because the set has no targets.
result = trainer.predict(test_dataset)
result

  0%|          | 0/408 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[-0.98553145,  0.63314605],
       [-1.886865  ,  1.4082017 ],
       [-2.0321603 ,  1.5700531 ],
       ...,
       [-2.0505126 ,  1.61204   ],
       [-0.8815236 ,  0.58445   ],
       [-1.1728534 ,  0.72372985]], dtype=float32), label_ids=None, metrics={'test_runtime': 6.3372, 'test_samples_per_second': 514.897, 'test_steps_per_second': 64.382})

In [18]:
# Collapse the per-class scores to a single predicted class id per example.
# NOTE(review): `result` is rebound from the prediction output to a plain array here, so
# this cell cannot be re-run without first re-running the predict cell.
result = result.predictions.argmax(axis=1)
result

array([1, 1, 1, ..., 1, 1, 1], dtype=int64)

In [19]:
from datetime import time  # NOTE(review): not used by any cell visible here — confirm before removing


# Start from the sample submission so the file keeps its expected layout, overwrite the
# target column with the predicted class ids, and write the result next to the input data.
submission_df = pd.read_csv(submission_path).assign(target=result)
submission_df.to_csv(os.path.join(data_dir, "submission.csv"), index=False)