In [5]:
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [19]:
df = pd.read_csv("C:\\Users\\m'j\\Desktop\\data_train_preprocessed_harmful.csv")

# BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

df['processed_text_alt'] = df['processed_text_alt'].fillna('')

from sklearn.preprocessing import LabelEncoder

# create LabelEncoder instance
label_encoder = LabelEncoder()

# fit and convert labels
df['encoded_labels'] = label_encoder.fit_transform(df['labels'])



# Tokenization
input_ids = []
attention_masks = []

for text in df['processed_text_alt']:
    encoded_dict = tokenizer.encode_plus(
                        text,                      # input text
                        add_special_tokens = True, # add special characters '[CLS]' and '[SEP]'
                        max_length = 64,           
                        pad_to_max_length = True,
                        return_attention_mask = True,   # build attention masks
                        return_tensors = 'pt',     
                   )
    
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

# convert list into tensor
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# labels = torch.tensor(df['encoded_labels'].values)
labels = torch.tensor(df['encoded_labels'].values, dtype=torch.long)


# 训练集和验证集划分
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, labels, random_state=2018, test_size=0.1)

# Create DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=32)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
# Define BERT model
from sklearn.metrics import classification_report

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels = 4, 
    output_attentions = False, 
    output_hidden_states = False, 
)

# GPU or CPU device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 4  

optimizer = AdamW(model.parameters(),
                  lr = 2e-5, 
                  eps = 1e-8 
                )

# total trainning steps
total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value
                                            num_training_steps = total_steps)

# Trainning
def train(epoch):
    model.train()
    total_loss = 0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        model.zero_grad()        

        outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
        
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)            
    print(f"  Average training loss: {avg_train_loss:.2f}")

# Validation
def evaluate():
    model.eval()
    predictions , true_labels = [], []
    for batch in validation_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        
        logits = outputs.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        predictions.extend(np.argmax(logits, axis=1).flatten())
        true_labels.extend(label_ids.flatten())

    print('DONE.')
    print(classification_report(true_labels, predictions, target_names=['individual', 'organization', 'community', 'society']))



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
epochs = 4

for epoch in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
    train(epoch)
    evaluate()


  Average training loss: 1.26
DONE.
              precision    recall  f1-score   support

  individual       0.65      0.81      0.72        37
organization       0.73      0.89      0.80        46
   community       0.00      0.00      0.00         4
     society       0.20      0.05      0.08        20

    accuracy                           0.67       107
   macro avg       0.40      0.44      0.40       107
weighted avg       0.58      0.67      0.61       107



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Average training loss: 1.04
DONE.
              precision    recall  f1-score   support

  individual       0.73      0.86      0.79        37
organization       0.93      0.85      0.89        46
   community       0.00      0.00      0.00         4
     society       0.57      0.60      0.59        20

    accuracy                           0.78       107
   macro avg       0.56      0.58      0.57       107
weighted avg       0.76      0.78      0.76       107



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Average training loss: 0.87
DONE.
              precision    recall  f1-score   support

  individual       0.79      0.84      0.82        37
organization       0.91      0.87      0.89        46
   community       0.00      0.00      0.00         4
     society       0.58      0.70      0.64        20

    accuracy                           0.79       107
   macro avg       0.57      0.60      0.59       107
weighted avg       0.77      0.79      0.78       107



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  Average training loss: 0.79
DONE.
              precision    recall  f1-score   support

  individual       0.79      0.84      0.82        37
organization       0.91      0.87      0.89        46
   community       0.00      0.00      0.00         4
     society       0.58      0.70      0.64        20

    accuracy                           0.79       107
   macro avg       0.57      0.60      0.59       107
weighted avg       0.77      0.79      0.78       107



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
