In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import os
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from packaging import version
from scipy.stats import pearsonr
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
from transformers.models.bert.modeling_bert import *
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


### Set GPU

In [2]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device}")

Using mps


### Set parameters

In [3]:
model_name = "ckiplab/bert-base-chinese"
data_path = "data"
batch_size = 16
hidden_dim = 384
hidden_acti = nn.Tanh()
dropout_rate = 0.2
learning_rate = 2e-5
epochs = 10

### preprocessing

In [4]:
def load_df(csv):
    df = pd.read_csv(csv, sep="\t", encoding="utf-8")
    df.drop("No.", axis=1, inplace=True)
    return df

In [5]:
tokenizer = BertTokenizerFast.from_pretrained(model_name)
def get_max_sentence_len(data_path):
    max_tokens = 0
    for csv in os.listdir(data_path):
        sentences = load_df(os.path.join(data_path, csv))["Text"]
        for sentence in sentences:
            ids = tokenizer.encode(sentence)
            max_tokens = max(max_tokens, len(ids))
    return max_tokens


max_tokens = get_max_sentence_len(data_path)
print(max_tokens)

214


### data preparation


In [6]:
# TO-DO: 研究一下padding要用longest, max_length, or True
class ValenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_tokens):
        self.df = df
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        return len(self.df)

    # encode : transform sentence to vocab no., tokenize : transform sentence to text
    def tokenize(self, sentence, max_tokens):
        encoded_dict = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_tokens,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # one sample per call, so need to sqeeze
        input_ids = encoded_dict["input_ids"].squeeze()
        token_type_ids = encoded_dict["token_type_ids"].squeeze()
        attention_mask = encoded_dict["attention_mask"]
        return input_ids, token_type_ids, attention_mask

    def __getitem__(self, index):
        sentence = self.df.iloc[index]["Text"]
        label = self.df.iloc[index]["Valence_Mean"]
        input_ids, token_by_ids, attention_mask = self.tokenize(
            sentence, self.max_tokens
        )
        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.float32),
        }
        return sample


class ArousalDataset(Dataset):
    def __init__(self, df, tokenizer, max_tokens):
        self.df = df
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        return len(self.df)

    # encode : transform sentence to vocab no., tokenize : transform sentence to text
    def tokenize(self, sentence, max_tokens):
        encoded_dict = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_tokens,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # one sample per call, so need to sqeeze
        input_ids = encoded_dict["input_ids"].squeeze()
        token_type_ids = encoded_dict["token_type_ids"].squeeze()
        attention_mask = encoded_dict["attention_mask"]
        return input_ids, token_type_ids, attention_mask

    def __getitem__(self, index):
        sentence = self.df.iloc[index]["Text"]
        label = self.df.iloc[index]["Arousal_Mean"]
        input_ids, token_by_ids, attention_mask = self.tokenize(
            sentence, self.max_tokens
        )
        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.float32),
        }
        return sample

In [7]:
v_datasets = []
a_datasets = []
df_list = [load_df(os.path.join(data_path, csv)) for csv in os.listdir(data_path)]
for fold_df in df_list:
    v_dataset = ValenceDataset(fold_df, tokenizer, max_tokens)
    a_dataset = ArousalDataset(fold_df, tokenizer, max_tokens)
    v_datasets.append(v_dataset)
    a_datasets.append(a_dataset)

In [8]:
os.listdir(data_path)

['CVAT_4_SD.csv',
 'CVAT_2_SD.csv',
 'CVAT_5_SD.csv',
 'CVAT_1_SD.csv',
 'CVAT_3_SD.csv']

### model (BERT)

In [9]:
class BERTForRegression(nn.Module):
    def __init__(self, pm, hidden_dim, hidden_acti, dropout_rate, output_dim):
        super(BERTForRegression, self).__init__()
        self.pm = AutoModel.from_pretrained(
            pm, output_attentions=False, output_hidden_states=False
        )
        self.hidden = nn.Linear(self.pm.config.hidden_size, hidden_dim)
        self.activation = hidden_acti
        self.dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(384, output_dim)
    
    def forward(self, input_sentence, input_mask):
        # [0]:last_hidden_state (batch_size, max_length, 768), [1]:pooling layer
        pm_output = self.pm(input_sentence, attention_mask=input_mask)[1] #(batch_size, 768)
        hidden = self.hidden(pm_output) 
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden) # (batch_size, 384)
        output = self.output_layer(hidden) # (batch_size, 1)
        return output

### model(CNNBERT)

In [10]:
class CustomWordEmbedding(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""
    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.conv = nn.Conv1d(in_channels=hidden_size, out_channels=hidden_size, kernel_size=3, padding=1)
        self.batch_norm = nn.BatchNorm1d(hidden_size) 
        self.embed = nn.Linear(64, hidden_size)
        self.dropout = nn.Dropout(0.4)

    def forward(self, embeddings):
        # (batch, seq_len, hidden_size) to (batch, hidden_size, seq_len)
        embeddings = nn.functional.tanh(embeddings)
        embeddings = embeddings.permute(0, 2, 1)
        embeddings = self.dropout(self.batch_norm(nn.functional.tanh(self.conv(embeddings))))
        # back to (batch, hidden_size, seq_len)
        embeddings = embeddings.permute(0, 2, 1)
        embeddings = embeddings/10
        return embeddings
    
class CustomBertEmbedding(BertEmbeddings):
    def __init__(self, config):
        super().__init__(config)
        # change the word_embedding that defined by BertEmbeddings
        self.word_embeddings1 = CustomWordEmbedding(config.hidden_size)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        past_key_values_length: int = 0,
        ) -> torch.Tensor:
            if input_ids is not None:
                input_shape = input_ids.size()
            else:
                input_shape = inputs_embeds.size()[:-1]

            seq_length = input_shape[1]

            if position_ids is None:
                position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]

            # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
            # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
            # issue #5664
            if token_type_ids is None:
                if hasattr(self, "token_type_ids"):
                    buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                    token_type_ids = buffered_token_type_ids_expanded
                else:
                    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

            if inputs_embeds is None:
                inputs_embeds = self.word_embeddings(input_ids)
                # print("\nBefore CNN\n\n", inputs_embeds)
                inputs_embeds = self.word_embeddings1(inputs_embeds)
                # print("\nAfter CNN\n\n", inputs_embeds)
            token_type_embeddings = self.token_type_embeddings(token_type_ids)

            embeddings = inputs_embeds + token_type_embeddings
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids)
                embeddings += position_embeddings
            embeddings = self.LayerNorm(embeddings)
            embeddings = self.dropout(embeddings)
            # print(embeddings)
            return embeddings

class CustomBertModel(BertModel):
    def __init__(self, config, add_pooling_layer=True):
        super().__init__(config, add_pooling_layer)
        self.embeddings = CustomBertEmbedding(config)

class CNNBERT(nn.Module):
    def __init__(self, pm, hidden_dim, hidden_acti, dropout_rate, output_dim):
        super(CNNBERT, self).__init__()
        self.pm = CustomBertModel.from_pretrained(
            pm, output_attentions=False, output_hidden_states=False
        )
        self.hidden = nn.Linear(self.pm.config.hidden_size, hidden_dim)
        self.activation = hidden_acti
        self.dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(384, output_dim)
    
    def forward(self, input_sentence, input_mask):
        # [0]:last_hidden_state (batch_size, max_length, 768), [1]:pooling layer
        pm_output = self.pm(input_sentence, attention_mask=input_mask)[1] #(batch_size, 768)
        hidden = self.hidden(pm_output) 
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden) # (batch_size, 384)
        output = self.output_layer(hidden) # (batch_size, 1)
        return output


model = CNNBERT(model_name, hidden_dim, hidden_acti, dropout_rate, 1)
print(model)


Some weights of CustomBertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.embeddings.word_embeddings1.batch_norm.bias', 'bert.embeddings.word_embeddings1.batch_norm.num_batches_tracked', 'bert.embeddings.word_embeddings1.batch_norm.running_mean', 'bert.embeddings.word_embeddings1.batch_norm.running_var', 'bert.embeddings.word_embeddings1.batch_norm.weight', 'bert.embeddings.word_embeddings1.conv.bias', 'bert.embeddings.word_embeddings1.conv.weight', 'bert.embeddings.word_embeddings1.embed.bias', 'bert.embeddings.word_embeddings1.embed.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CNNBERT(
  (pm): CustomBertModel(
    (embeddings): CustomBertEmbedding(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (word_embeddings1): CustomWordEmbedding(
        (conv): Conv1d(768, 768, kernel_size=(3,), stride=(1,), padding=(1,))
        (batch_norm): BatchNorm1d(768, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (embed): Linear(in_features=64, out_features=768, bias=True)
        (dropout): Dropout(p=0.4, inplace=False)
      )
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, ou

### training

In [11]:
# format time to hh:mm:ss
def format_time(time):
    time_round = int(round((time)))
    return str(datetime.timedelta(seconds=time_round))

def to_excel(save_string, training_stats):
    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('Fold')
    df_stats.to_csv(save_string, index=False)
    return df_stats

# save model to path
def save_checkpoint(save_path, model):
    if save_path == None:
        return
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to ==> {save_path}')

# load model from path
def load_checkpoint(load_path, model, device):
    if load_path==None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'\nModel loaded from <== {load_path}')

    model.load_state_dict(state_dict)
    return model

In [12]:
def Trainer(model, fold_index, train_loader, val_loader, training_stats):
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-8)
    torch.autograd.set_detect_anomaly(True)
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-8)
    # MAE
    loss_function = torch.nn.L1Loss()
    # Measure the total training time for the whole run.
    total_t0 = time.time()
    for epoch in range(epochs):
    # for epoch in range(5):
        start_time = time.time()
        total_loss = 0
        total_val_outputs = []
        total_val_labels = []
        print(f"\n---------------EPOCH {epoch+1}-----------------\n")
        # print(f"\n---------------EPOCH {epoch+16}-----------------\n")
        model.train()
        # model.eval()
        for step, batch in enumerate(train_loader):
            if step % 40 == 0 and not step == 0:
                pass_time = format_time(time.time() - start_time)
                print(f'\nComplete Batch {step}  of  {len(train_loader)}.   Time: {pass_time}\n')

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            
            # with torch.no_grad():
            #     outputs = model(input_ids, attention_mask)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs.view(-1), labels.view(-1))
            total_loss += loss.item()
            # calculate the gradients
            loss.backward()
            # gradient descent
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        training_time = format_time(time.time() - start_time)
        
        print(f"\nTraining loss: {avg_train_loss}\n")
        print(f"\nTraining time: {training_time}\n")
        print(f"\nValidation ........\n")
        valid_start_time = time.time()
        model.eval()
        total_valid_loss = 0

        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs.view(-1), labels.view(-1))
            total_valid_loss += loss.item()

            total_val_outputs = total_val_outputs + outputs.view(-1).detach().cpu().numpy().tolist()
            print(total_val_outputs)
            total_val_labels = total_val_labels + labels.view(-1).detach().cpu().numpy().tolist()

        avg_val_loss = total_valid_loss / len(val_loader)
        validation_time = format_time(time.time() - valid_start_time)

        print(f"\nValidation Loss: {avg_val_loss}\n")
        print(f"\nValidation time: {validation_time}\n")
    # print(total_val_outputs)
    # print(total_val_labels)
    val_r, _ = pearsonr(total_val_outputs, total_val_labels)

    print(f'\nFinal validation correlation: {val_r}\n')
    training_stats.append(
    {
        'Fold': fold_index+1,
        'Training Loss': avg_train_loss,
        'Valid Loss': avg_val_loss,
        'Valid correlation' : val_r
    })
    
    print("\nTraining complete!\n")
    print(f"\nTotal training took {format_time(time.time()-total_t0)} (hh:mm:ss)")
    save_string = f'bert-base-chinese' + '_'+f'{fold_index+1}' + '_' + 'V'+ '_' + '0503' + '.csv'
    return training_stats, model

In [13]:
training_stats = []

In [14]:
# train for Arousal
for fold_index, dataset in enumerate(a_datasets):
    if fold_index == 0:
       continue
    if fold_index == 1:
       continue
    if fold_index == 2:
       continue
    print(f"\n========Training fold {fold_index + 1}===========\n")
    train_loaders = [
        data for index, data in enumerate(a_datasets) if index != fold_index
    ]
    val_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # concat multiple dataloader to oen dataloader
    train_loader = torch.utils.data.ConcatDataset(train_loaders)
    train_loader = DataLoader(train_loader, batch_size=batch_size, shuffle=True)

    print(f"TrainDataLoader contains {len(train_loader.dataset)} samples")
    batch = next(iter(train_loader))
    print("Sample data from the first batch:")
    print("Input IDs:", batch["input_ids"].shape)  # (batch_size, max_length)
    print("Attention Mask:", batch["attention_mask"].shape)  # (batch_size, max_length)
    print("Label:", batch["label"].shape)  # (batch_size, )
    print("Example sentence:", batch["input_ids"][0])
    print("Example label:", batch["label"][0].item())

    model = CNNBERT(model_name, hidden_dim, hidden_acti, dropout_rate, 1)
    model.to(device)

    # model = load_checkpoint(f'checkpoint/bert-base-chinese_{fold_index+1}_A_0504.pt', model, device)
     
    # freeze some layers (top | middle | bottom):
    bottom = range(2, 12)
    middle = list(range(0,5))+list(range(7,12))
    top = range(0, 10)
    freeze_layers = top
    for i in freeze_layers:
    #   print(i)
      for param in model.pm.encoder.layer[i].parameters():
        param.requires_grad = False
    
    training_stat, model = Trainer(model, fold_index, train_loader, val_loader, training_stats)
    save_checkpoint(os.path.join('checkpoint',f'cnnBERT_{fold_index+1}_A_0507.pt'), model)

save_string = f"cnnBERT" + "_" + "A" + "_" + "0507" + ".csv"
to_excel(save_string, training_stats)



TrainDataLoader contains 2360 samples
Sample data from the first batch:
Input IDs: torch.Size([16, 214])
Attention Mask: torch.Size([16, 1, 214])
Label: torch.Size([16])
Example sentence: tensor([ 101, 6973, 2233, 3085, 2552, 2658, 5219, 4080, 1240, 4638, 3295, 3378,
        3298, 3300, 1927, 2971, 5647, 1240, 8024, 6634, 5215, 1841, 6356,  511,
         102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          

Some weights of CustomBertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.embeddings.word_embeddings1.batch_norm.bias', 'bert.embeddings.word_embeddings1.batch_norm.num_batches_tracked', 'bert.embeddings.word_embeddings1.batch_norm.running_mean', 'bert.embeddings.word_embeddings1.batch_norm.running_var', 'bert.embeddings.word_embeddings1.batch_norm.weight', 'bert.embeddings.word_embeddings1.conv.bias', 'bert.embeddings.word_embeddings1.conv.weight', 'bert.embeddings.word_embeddings1.embed.bias', 'bert.embeddings.word_embeddings1.embed.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  148.   Time: 0:00:42


Complete Batch 80  of  148.   Time: 0:01:22


Complete Batch 120  of  148.   Time: 0:02:03


Training loss: 1.1841472127550356


Training time: 0:02:31


Validation ........

[5.134739875793457, 4.947647571563721, 5.256758689880371, 4.88532018661499, 5.207157135009766, 5.128076553344727, 5.2203593254089355, 4.719804763793945, 5.277894973754883, 5.257504463195801, 5.2892913818359375, 5.195303916931152, 5.229870319366455, 5.090665817260742, 4.924123764038086, 5.114296913146973]
[5.134739875793457, 4.947647571563721, 5.256758689880371, 4.88532018661499, 5.207157135009766, 5.128076553344727, 5.2203593254089355, 4.719804763793945, 5.277894973754883, 5.257504463195801, 5.2892913818359375, 5.195303916931152, 5.229870319366455, 5.090665817260742, 4.924123764038086, 5.114296913146973, 5.254773139953613, 5.073484420776367, 4.88151741027832, 5.253045082092285, 5.039278984069824, 4.663527488708496, 5.183251380

Some weights of CustomBertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.embeddings.word_embeddings1.batch_norm.bias', 'bert.embeddings.word_embeddings1.batch_norm.num_batches_tracked', 'bert.embeddings.word_embeddings1.batch_norm.running_mean', 'bert.embeddings.word_embeddings1.batch_norm.running_var', 'bert.embeddings.word_embeddings1.batch_norm.weight', 'bert.embeddings.word_embeddings1.conv.bias', 'bert.embeddings.word_embeddings1.conv.weight', 'bert.embeddings.word_embeddings1.embed.bias', 'bert.embeddings.word_embeddings1.embed.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  148.   Time: 0:00:41


Complete Batch 80  of  148.   Time: 0:01:22


Complete Batch 120  of  148.   Time: 0:02:03


Training loss: 1.2429128099937696


Training time: 0:02:31


Validation ........

[4.412953853607178, 4.767043113708496, 4.42141056060791, 4.306179046630859, 4.349179267883301, 5.219668388366699, 4.443629264831543, 4.503210067749023, 4.532857894897461, 4.454028129577637, 4.407139778137207, 5.19264030456543, 5.258548736572266, 4.395338535308838, 4.474963188171387, 5.117334365844727]
[4.412953853607178, 4.767043113708496, 4.42141056060791, 4.306179046630859, 4.349179267883301, 5.219668388366699, 4.443629264831543, 4.503210067749023, 4.532857894897461, 4.454028129577637, 4.407139778137207, 5.19264030456543, 5.258548736572266, 4.395338535308838, 4.474963188171387, 5.117334365844727, 4.440983772277832, 4.409388542175293, 4.416461944580078, 4.459528923034668, 4.436205863952637, 4.806233882904053, 4.43002510070800

Unnamed: 0_level_0,Training Loss,Valid Loss,Valid correlation
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.573546,0.918121,0.401056
5,0.520255,0.771235,0.443454


In [15]:
training_stats = []

In [16]:
# train for Valence
training_stats = []
for fold_index, dataset in enumerate(v_datasets):
    print(f"\n========Training fold {fold_index + 1}===========\n")
    train_loaders = [
        data for index, data in enumerate(v_datasets) if index != fold_index
    ]
    val_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # concat multiple dataloader to oen dataloader
    train_loader = torch.utils.data.ConcatDataset(train_loaders)
    train_loader = DataLoader(train_loader, batch_size=batch_size, shuffle=True)

    print(f"TrainDataLoader contains {len(train_loader.dataset)} samples")
    batch = next(iter(train_loader))
    print("Sample data from the first batch:")
    print("Input IDs:", batch["input_ids"].shape)  # (batch_size, max_length)
    print("Attention Mask:", batch["attention_mask"].shape)  # (batch_size, max_length)
    print("Label:", batch["label"].shape)  # (batch_size, )
    print("Example sentence:", batch["input_ids"][0])
    print("Example label:", batch["label"][0].item())

    model = CNNBERT(model_name, hidden_dim, hidden_acti, dropout_rate, 1)
    model.to(device)

    # model = load_checkpoint(f'checkpoint/bert-base-chinese_{fold_index+1}_V_0504.pt', model, device)
    
    # freeze some layers (top | middle | bottom):
    bottom = range(2, 12)
    middle = list(range(0,5))+list(range(7,12))
    top = range(0, 10)
    freeze_layers = top
    for i in freeze_layers:
    #   print(i)
      for param in model.pm.encoder.layer[i].parameters():
        param.requires_grad = False
    
    training_stat, model = Trainer(model, fold_index, train_loader, val_loader, training_stats)
    save_checkpoint(os.path.join('checkpoint',f'cnnBERT_{fold_index+1}_V_0507.pt'), model)

save_string = f"cnnBERT" + "_" + "V" + "_" + "0507" + ".csv"
to_excel(save_string, training_stats)



TrainDataLoader contains 2360 samples
Sample data from the first batch:
Input IDs: torch.Size([16, 214])
Attention Mask: torch.Size([16, 1, 214])
Label: torch.Size([16])
Example sentence: tensor([ 101,  800,  947, 3300, 1048, 6527, 3582, 1842, 2970, 6843, 3302, 1243,
        8024, 5018,  671, 3613, 5481, 6303, 6857, 4934, 3302, 1243, 8024,  679,
        7097, 8024, 1962, 1008, 2802, 6722, 6206, 8164, 1914, 1846, 1435, 3800,
        2692, 1168, 6983, 2421, 6257, 3300, 3582, 1842, 1545, 4873, 5993, 8024,
         124, 1146, 7132, 3018, 2137, 3582, 4873, 8024, 3175,  912, 8013,  102,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          

Some weights of CustomBertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.embeddings.word_embeddings1.batch_norm.bias', 'bert.embeddings.word_embeddings1.batch_norm.num_batches_tracked', 'bert.embeddings.word_embeddings1.batch_norm.running_mean', 'bert.embeddings.word_embeddings1.batch_norm.running_var', 'bert.embeddings.word_embeddings1.batch_norm.weight', 'bert.embeddings.word_embeddings1.conv.bias', 'bert.embeddings.word_embeddings1.conv.weight', 'bert.embeddings.word_embeddings1.embed.bias', 'bert.embeddings.word_embeddings1.embed.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  148.   Time: 0:00:41


Complete Batch 80  of  148.   Time: 0:01:21


Complete Batch 120  of  148.   Time: 0:02:02


Training loss: 1.4357613107642613


Training time: 0:02:30


Validation ........

[4.975551128387451, 4.69728422164917, 5.357269763946533, 3.7180185317993164, 3.7670624256134033, 5.0015106201171875, 5.474505424499512, 5.660491466522217, 5.782063007354736, 4.423458576202393, 5.009364604949951, 4.470387935638428, 3.836327075958252, 3.872788906097412, 5.626951694488525, 5.071282386779785]
[4.975551128387451, 4.69728422164917, 5.357269763946533, 3.7180185317993164, 3.7670624256134033, 5.0015106201171875, 5.474505424499512, 5.660491466522217, 5.782063007354736, 4.423458576202393, 5.009364604949951, 4.470387935638428, 3.836327075958252, 3.872788906097412, 5.626951694488525, 5.071282386779785, 4.475772380828857, 4.9368577003479, 4.440558910369873, 5.393732070922852, 3.72066593170166, 4.11434268951416, 4.0181970596