In [None]:
# Mount Google Drive into the Colab runtime; the dataset files and the saved
# weights used below are read from paths under /content/gdrive/My Drive/.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
pip install transformers



In [None]:
import re
import os

import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import RandomSampler
from tqdm import tqdm, trange
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer


In [None]:
# Label id that CrossEntropyLoss ignores when computing the loss (its default ignore_index).
PAD_TOKEN_LABEL_ID = CrossEntropyLoss().ignore_index

# Number of examples per batch.
BATCH_SIZE = 16

# Learning rate for the main (encoder) part of the model.
LEARNING_RATE_MODEL = 1e-5
# Learning rate for the classifier on top of the model.
LEARNING_RATE_CLASSIFIER = 1e-3

# Number of steps over which the learning rate is gradually increased at the start of
# training; 0 means no warm-up is used. Warm-up can stabilise the early phase of training.
WARMUP_STEPS = 0
# Number of batches whose gradients are accumulated before one optimizer step;
# 1 means no gradient accumulation is used.
GRADIENT_ACCUMULATION_STEPS = 1
# Threshold for gradient clipping, used to prevent exploding gradients.
MAX_GRAD_NORM = 1.0

# Seed for the random number generators.
SEED = 42
# When True, CUDA (GPU acceleration) is not used even if it is available.
NO_CUDA = False


In [None]:
# rpad makes every sequence the same length: a sequence longer than the requested
# length (256 in this notebook) is truncated; a shorter one is right-padded with zeros.

def rpad(array, n):
    """Return `array` cut or right-padded with zeros to exactly `n` items.

    Args:
        array: a list (it must support slicing and `+` with a list).
        n: target length.

    Returns:
        The first `n` items when `array` is longer than `n`; otherwise
        `array` followed by as many zeros as are needed to reach `n`.
    """
    missing = n - len(array)
    if missing < 0:
        # Too long: keep only the leading n items.
        return array[:n]
    return array + [0] * missing

# convert_to_embedding turns text into the input format expected by the BERT model.
# It tokenizes the text with a tokenizer (e.g. BertTokenizer), adds the special tokens
# [CLS] and [SEP] at the beginning and end of the sequence, and converts the tokens
# into the ids understood by the model.
# `tokens = tokens[:250]` keeps only the first 250 tokens, so that the sequence, with
# the two special tokens added, always fits within the 256-id padded length.

def convert_to_embedding(tokenizer, sentences_with_labels):
    """Yield `(input_ids, label)` tensor pairs for each `(sentence, label)` pair.

    Each sentence is tokenized, cut to its first 250 tokens, wrapped in the
    tokenizer's classification/separator special tokens, converted to ids and
    padded with `rpad` to 256 ids.

    Args:
        tokenizer: object providing `tokenize`, `convert_tokens_to_ids`,
            `cls_token` and `sep_token` (e.g. a `BertTokenizer`).
        sentences_with_labels: iterable of `(sentence, label)` pairs.

    Yields:
        A tuple of `torch.tensor(ids)` (256 ids) and the label as an int64 tensor.
    """
    for sentence, label in sentences_with_labels:
        tokens = tokenizer.tokenize(sentence)[:250]
        # Use the tokenizer's own special tokens ("[CLS]" / "[SEP]" for BERT). The bare
        # strings "CLS" / "SEP" are not the special tokens, so they would be looked up
        # as ordinary - and for bert-base-uncased unknown - tokens.
        wrapped = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
        bert_sent = rpad(tokenizer.convert_tokens_to_ids(wrapped), n=256)
        yield torch.tensor(bert_sent), torch.tensor(label, dtype=torch.int64)



# parse_line cleans and normalises one line of text: it lowercases it, replaces
# &nbsp; entities and HTML <br> tags with spaces, and merges repeated spaces.
def parse_line(line):
    """Lowercase a raw review line and replace HTML line breaks and `&nbsp;` with spaces.

    Args:
        line: one raw line of text.

    Returns:
        The lowercased line with `&nbsp;` and `<br>`, `<br/>`, `<br />` replaced by
        a space, runs of spaces merged into one, and surrounding whitespace removed.
    """
    line = line.lower().replace("&nbsp;", " ")
    # `\s*/?` also matches the compact `<br/>` form, not only `<br>` and `<br />`.
    line = re.sub(r'<br\s*/?>', ' ', line)
    line = re.sub(r' +', ' ', line)  # merge multiple spaces into one
    # Strip last, so that a space introduced by a leading/trailing tag or entity
    # does not survive at the edges.
    return line.strip()

# read_imdb_data reads text data from a file, applying parse_line to clean each line.
def read_imdb_data(filename):
    """Read a UTF-8 text file and return its lines cleaned with `parse_line`.

    Args:
        filename: path of the text file to read.

    Returns:
        A list with one cleaned string per line of the file, in file order.
    """
    # The context manager closes the file handle even if parsing a line raises.
    with open(filename, 'r', encoding="utf-8") as handle:
        return [parse_line(line) for line in handle]


def prepare_dataloader(tokenizer, sampler=RandomSampler, train=False):
    """Build a `DataLoader` over the IMDB train or test file.

    Args:
        tokenizer: tokenizer handed to `convert_to_embedding`.
        sampler: sampler class called with the dataset, or `None` to iterate
            in file order.
        train: read the training file when True, the test file otherwise.

    Returns:
        A `DataLoader` yielding batches of `BATCH_SIZE` `(input_ids, label)` pairs.
    """
    if train:
        filename = "/content/gdrive/My Drive/bert/imdb_train.txt"
    else:
        filename = "/content/gdrive/My Drive/bert/imdb_test.txt"

    reviews = read_imdb_data(filename)
    # Labels are positional: 12500 zeros (negative) followed by 12500 ones (positive).
    # This assumes the file lists the reviews in that order - verify against the data.
    labels = np.concatenate((np.zeros(12500), np.ones(12500))).tolist()

    # Materialise the generator so the dataset can be indexed and sampled.
    dataset = list(convert_to_embedding(tokenizer, zip(reviews, labels)))

    sampler_func = None if sampler is None else sampler(dataset)
    return DataLoader(dataset, sampler=sampler_func, batch_size=BATCH_SIZE)


In [None]:
# The Transformers class is used to load, train, and evaluate the BERT model.

class Transformers:
    """Fine-tune, load and run a BERT sequence-classification model.

    The instance keeps the tokenizer, the torch device and, once `train` or
    `load` has run, the model. Inputs are the fixed-length id tensors produced
    by `convert_to_embedding`; padding positions are masked out through
    `_attention_mask`.

    NOTE(review): weights fine-tuned before the attention mask was introduced
    were trained with padding visible to the model - re-train for best results.
    """

    # Class-level default so `self.model is None` holds until train() or load() sets it.
    model = None

    def __init__(self, tokenizer):
        """Store the tokenizer and select CUDA when available and not disabled by NO_CUDA."""
        self.pad_token_label_id = PAD_TOKEN_LABEL_ID
        self.device = torch.device("cuda" if torch.cuda.is_available() and not NO_CUDA else "cpu")
        self.tokenizer = tokenizer

    def predict(self, sentence):
        """Predict the class of a single sentence.

        Loads the model and tokenizer with `load()` first when either is missing.

        Args:
            sentence: raw text to classify.

        Returns:
            A list with one predicted class index.
        """
        if self.model is None or self.tokenizer is None:
            self.load()

        # The label is only a placeholder; it is not used for prediction.
        embeddings = list(convert_to_embedding(self.tokenizer, [(sentence, -1)]))
        # Go through a DataLoader so the single example gets the batch dimension
        # that the model expects.
        dataloader = DataLoader(embeddings, batch_size=BATCH_SIZE)
        preds = self._predict_tags_batched(dataloader)  # predict whether the sentiment is positive or negative
        return preds

    def evaluate(self, dataloader):
        """Print a scikit-learn classification report for `dataloader`.

        The true labels are taken from the batches themselves, in the same pass
        as the predictions, so the report is valid for any dataset size and
        iteration order.
        """
        from sklearn.metrics import classification_report

        y_pred, y_true = self._predict_tags_batched(dataloader, return_labels=True)

        score = classification_report(y_true, y_pred)
        print(score)

    def _attention_mask(self, input_ids):
        """Return a 0/1 int64 mask that is 0 wherever `input_ids` holds the padding id.

        The padding id is the tokenizer's `pad_token_id` when it has one,
        otherwise 0, the value `rpad` pads with.
        """
        pad_id = getattr(self.tokenizer, "pad_token_id", None)
        if pad_id is None:
            pad_id = 0
        return (input_ids != pad_id).long()

    def _predict_tags_batched(self, dataloader, return_labels=False):
        """Run the model in eval mode over `dataloader` without tracking gradients.

        Args:
            dataloader: iterable of `(input_ids, labels)` batches.
            return_labels: when True, also return the labels seen in the batches.

        Returns:
            The list of predicted class indices, or `(predictions, labels)` when
            `return_labels` is True; both lists follow the iteration order.
        """
        preds = []
        labels = []
        self.model.eval()
        # NOTE(review): the progress label says "NER tags" although this notebook
        # uses the class for sequence classification - consider renaming it.
        for batch in tqdm(dataloader, desc="Computing NER tags"):
            batch = tuple(t.to(self.device) for t in batch)
            input_ids = batch[0]

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=self._attention_mask(input_ids))
                # outputs[0] holds the logits; the index of the largest one is the predicted class.
                _, predicted = torch.max(outputs[0], 1)
                preds.extend(predicted.cpu().detach().numpy())

            if return_labels:
                labels.extend(batch[1].cpu().numpy())

        if return_labels:
            return preds, labels
        return preds

    def train(self, dataloader, model, epochs):
        """Fine-tune `model` on `dataloader` for `epochs` epochs.

        Sets up the optimizer (separate learning rates for `model.bert` and
        `model.classifier`), a linear schedule with warm-up, gradient
        accumulation and gradient clipping, then trains the whole model.

        Args:
            dataloader: iterable of `(input_ids, labels)` batches with a length.
            model: model exposing `.bert` and `.classifier` sub-modules and
                returning the loss first when called with `labels`.
            epochs: number of passes over `dataloader`.

        Returns:
            `(global_step, average_loss)`; the average is 0.0 when no
            optimization step was performed.
        """
        assert self.model is None  # make sure we are not replacing an already trained model
        model.to(self.device)
        self.model = model

        t_total = len(dataloader) // GRADIENT_ACCUMULATION_STEPS * epochs  # total number of optimization steps

        # Prepare optimizer and schedule (linear warmup and decay)
        optimizer_grouped_parameters = [
            {"params": model.bert.parameters(), "lr": LEARNING_RATE_MODEL},
            {"params": model.classifier.parameters(), "lr": LEARNING_RATE_CLASSIFIER}
        ]
        optimizer = AdamW(optimizer_grouped_parameters)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=t_total)

        # Train!
        print("***** Running training *****")
        # len(dataloader) counts batches, not examples.
        print("Training on %d batches per epoch" % len(dataloader))
        print("Num Epochs = %d" % epochs)
        print("Total optimization steps = %d" % t_total)

        global_step = 0
        tr_loss = 0.0
        model.zero_grad()
        train_iterator = trange(epochs, desc="Epoch")
        self._set_seed()
        for _ in train_iterator:
            epoch_iterator = tqdm(dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                model.train()
                batch = tuple(t.to(self.device) for t in batch)
                input_ids, labels = batch[0], batch[1]
                outputs = model(input_ids, attention_mask=self._attention_mask(input_ids), labels=labels)  # forward pass
                loss = outputs[0]  # with labels given, the first output is the loss

                if GRADIENT_ACCUMULATION_STEPS > 1:
                    loss = loss / GRADIENT_ACCUMULATION_STEPS

                loss.backward()  # compute gradients by backpropagation

                tr_loss += loss.item()
                if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

                    # Update the weights first, then advance the learning rate schedule.
                    optimizer.step()
                    scheduler.step()
                    model.zero_grad()
                    global_step += 1

        self.model = model
        # Guard against an empty dataloader or epochs == 0.
        average_loss = tr_loss / global_step if global_step else 0.0
        return global_step, average_loss

    def _set_seed(self):
        """Seed torch, and every CUDA device when running on CUDA, with SEED."""
        torch.manual_seed(SEED)
        # The device type is "cuda"; the previous comparison against 'gpu' never matched.
        if self.device.type == "cuda":
            torch.cuda.manual_seed_all(SEED)

    def load(self, model_dir='weights/'):
        """Load tokenizer and model from `model_dir` and move the model to the device."""
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.model = BertForSequenceClassification.from_pretrained(model_dir)
        self.model.to(self.device)


In [None]:
# This training function is specific to fine-tuning a BERT model for the task of sentiment analysis.
def train(epochs=20, output_dir="weights/"):
    """Fine-tune `bert-base-uncased` on the IMDB training file and save the result.

    Args:
        epochs: number of training epochs.
        output_dir: directory that receives the fine-tuned model and the tokenizer.
    """
    num_labels = 2  # negative and positive reviews

    # Create the target directory up front, so a missing or unwritable path
    # fails before the long training run instead of after it.
    os.makedirs(output_dir, exist_ok=True)

    config = BertConfig.from_pretrained('bert-base-uncased', num_labels=num_labels)
    # The pre-trained BERT model is loaded through from_pretrained, which means it has
    # already been pre-trained on a large corpus (e.g. Wikipedia).
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # Select the pre-trained BERT model that is about to be fine-tuned.
    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', config=config)

    dataloader = prepare_dataloader(tokenizer, train=True)  # prepare the data for fine-tuning
    predictor = Transformers(tokenizer)
    predictor.train(dataloader, model, epochs)  # fine-tune the model

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

def evaluate(model_dir="weights/"):
    """Evaluate the model saved in `model_dir` on the IMDB test file and print a report.

    Args:
        model_dir: directory holding the saved model and tokenizer.
    """
    # Load the tokenizer saved next to the weights (the same files `Transformers.load`
    # reads), so the test data is encoded with the tokenizer that ships with the model.
    tokenizer = BertTokenizer.from_pretrained(model_dir, do_lower_case=True)

    # sampler=None keeps the file order.
    dataloader = prepare_dataloader(tokenizer, train=False, sampler=None)
    predictor = Transformers(tokenizer)
    predictor.load(model_dir=model_dir)
    predictor.evaluate(dataloader)



# Directory on the mounted Drive where the fine-tuned weights are stored.
path = '/content/gdrive/My Drive/bert/weights/'

# Set to True to (re)train before evaluating; False only evaluates the saved weights.
RUN_TRAINING = False

if RUN_TRAINING:
    os.makedirs(path, exist_ok=True)
    train(epochs=10, output_dir=path)

evaluate(model_dir=path)


Computing NER tags: 100%|██████████| 1563/1563 [05:48<00:00,  4.48it/s]


              precision    recall  f1-score   support

         0.0       0.86      0.87      0.86     12500
         1.0       0.87      0.85      0.86     12500

    accuracy                           0.86     25000
   macro avg       0.86      0.86      0.86     25000
weighted avg       0.86      0.86      0.86     25000

