In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import os
import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from scipy.stats import pearsonr
from transformers import AutoTokenizer, AutoModel, BertTokenizerFast
import sklearn

  from .autonotebook import tqdm as notebook_tqdm


### Set GPU

In [2]:
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device}")

Using mps


### Set parameters

In [3]:
model_name = "ckiplab/bert-base-chinese"
data_path = "data"
batch_size = 32
hidden_dim = 384
hidden_acti = nn.Tanh()
dropout_rate = 0.2
learning_rate = 2e-5
epochs = 15

### preprocessing

In [4]:
def load_df(csv):
    df = pd.read_csv(csv, sep="\t", encoding="utf-8")
    df.drop("No.", axis=1, inplace=True)
    return df

In [5]:
tokenizer = BertTokenizerFast.from_pretrained(model_name)
def get_max_sentence_len(data_path):
    max_tokens = 0
    for csv in os.listdir(data_path):
        sentences = load_df(os.path.join(data_path, csv))["Text"]
        for sentence in sentences:
            ids = tokenizer.encode(sentence)
            max_tokens = max(max_tokens, len(ids))
    return max_tokens


max_tokens = get_max_sentence_len(data_path)
print(max_tokens)

214


### data preparation


In [6]:
# TO-DO: 研究一下padding要用longest, max_length, or True
class ValenceDataset(Dataset):
    def __init__(self, df, tokenizer, max_tokens):
        self.df = df
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        return len(self.df)

    # encode : transform sentence to vocab no., tokenize : transform sentence to text
    def tokenize(self, sentence, max_tokens):
        encoded_dict = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_tokens,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # one sample per call, so need to sqeeze
        input_ids = encoded_dict["input_ids"].squeeze()
        token_type_ids = encoded_dict["token_type_ids"].squeeze()
        attention_mask = encoded_dict["attention_mask"]
        return input_ids, token_type_ids, attention_mask

    def __getitem__(self, index):
        sentence = self.df.iloc[index]["Text"]
        label = self.df.iloc[index]["Valence_Mean"]
        input_ids, token_by_ids, attention_mask = self.tokenize(
            sentence, self.max_tokens
        )
        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.float32),
        }
        return sample


class ArousalDataset(Dataset):
    def __init__(self, df, tokenizer, max_tokens):
        self.df = df
        self.tokenizer = tokenizer
        self.max_tokens = max_tokens

    def __len__(self):
        return len(self.df)

    # encode : transform sentence to vocab no., tokenize : transform sentence to text
    def tokenize(self, sentence, max_tokens):
        encoded_dict = self.tokenizer.encode_plus(
            sentence,
            add_special_tokens=True,
            max_length=max_tokens,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )
        # one sample per call, so need to sqeeze
        input_ids = encoded_dict["input_ids"].squeeze()
        token_type_ids = encoded_dict["token_type_ids"].squeeze()
        attention_mask = encoded_dict["attention_mask"]
        return input_ids, token_type_ids, attention_mask

    def __getitem__(self, index):
        sentence = self.df.iloc[index]["Text"]
        label = self.df.iloc[index]["Arousal_Mean"]
        input_ids, token_by_ids, attention_mask = self.tokenize(
            sentence, self.max_tokens
        )
        sample = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.float32),
        }
        return sample

In [7]:
v_datasets = []
a_datasets = []
df_list = [load_df(os.path.join(data_path, csv)) for csv in os.listdir(data_path)]
for fold_df in df_list:
    v_dataset = ValenceDataset(fold_df, tokenizer, max_tokens)
    a_dataset = ArousalDataset(fold_df, tokenizer, max_tokens)
    v_datasets.append(v_dataset)
    a_datasets.append(a_dataset)

In [8]:
os.listdir(data_path)

['CVAT_4_SD.csv',
 'CVAT_2_SD.csv',
 'CVAT_5_SD.csv',
 'CVAT_1_SD.csv',
 'CVAT_3_SD.csv']

### model (BERT)

In [9]:
class BERTForRegression(nn.Module):
    def __init__(self, pm, hidden_dim, hidden_acti, dropout_rate, output_dim):
        super(BERTForRegression, self).__init__()
        self.pm = AutoModel.from_pretrained(
            pm, output_attentions=False, output_hidden_states=False
        )
        self.hidden = nn.Linear(self.pm.config.hidden_size, hidden_dim)
        self.activation = hidden_acti
        self.dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(384, output_dim)
    
    def forward(self, input_sentence, input_mask):
        # [0]:last_hidden_state (batch_size, max_length, 768), [1]:pooling layer
        pm_output = self.pm(input_sentence, attention_mask=input_mask)[1] #(batch_size, 768)
        hidden = self.hidden(pm_output) 
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden) # (batch_size, 384)
        output = self.output_layer(hidden) # (batch_size, 1)
        return output

### model (CNN + BERT)

### training

In [10]:
# format time to hh:mm:ss
def format_time(time):
    time_round = int(round((time)))
    return str(datetime.timedelta(seconds=time_round))

def to_excel(save_string, training_stats):
    df_stats = pd.DataFrame(data=training_stats)
    df_stats = df_stats.set_index('Fold')
    df_stats.to_csv(save_string, index=False)
    return df_stats

# save model to path
def save_checkpoint(save_path, model):
    if save_path == None:
        return
    torch.save(model.state_dict(), save_path)
    print(f'Model saved to ==> {save_path}')

# load model from path
def load_checkpoint(load_path, model, device):
    if load_path==None:
        return
    state_dict = torch.load(load_path, map_location=device)
    print(f'\nModel loaded from <== {load_path}')

    model.load_state_dict(state_dict)
    return model

In [11]:
def Trainer(model, fold_index, train_loader, val_loader, training_stats):
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-8)
    # MAE
    loss_function = torch.nn.L1Loss()
    # Measure the total training time for the whole run.
    total_t0 = time.time()
    for epoch in range(epochs):
    # for epoch in range(5):
        start_time = time.time()
        total_loss = 0
        total_val_outputs = []
        total_val_labels = []
        print(f"\n---------------EPOCH {epoch+1}-----------------\n")
        # print(f"\n---------------EPOCH {epoch+16}-----------------\n")
        model.train()
        # model.eval()
        for step, batch in enumerate(train_loader):
            if step % 40 == 0 and not step == 0:
                pass_time = format_time(time.time() - start_time)
                print(f'\nComplete Batch {step}  of  {len(train_loader)}.   Time: {pass_time}\n')

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            
            # with torch.no_grad():
            #     outputs = model(input_ids, attention_mask)
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs.view(-1), labels.view(-1))
            total_loss += loss.item()
            # calculate the gradients
            loss.backward()
            # gradient descent
            optimizer.step()

        avg_train_loss = total_loss / len(train_loader)
        training_time = format_time(time.time() - start_time)
        
        print(f"\nTraining loss: {avg_train_loss}\n")
        print(f"\nTraining time: {training_time}\n")
        print(f"\nValidation ........\n")
        valid_start_time = time.time()
        model.eval()
        total_valid_loss = 0

        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)
            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            loss = loss_function(outputs.view(-1), labels.view(-1))
            total_valid_loss += loss.item()

            total_val_outputs = total_val_outputs + outputs.view(-1).detach().cpu().numpy().tolist()
            total_val_labels = total_val_labels + labels.view(-1).detach().cpu().numpy().tolist()

        avg_val_loss = total_valid_loss / len(val_loader)
        validation_time = format_time(time.time() - valid_start_time)

        print(f"\nValidation Loss: {avg_val_loss}\n")
        print(f"\nValidation time: {validation_time}\n")
        # 
        # break
    # print(total_val_outputs)
    # print(total_val_labels)
    val_r, _ = pearsonr(total_val_outputs, total_val_labels)

    print(f'\nFinal validation correlation: {val_r}\n')
    training_stats.append(
    {
        'Fold': fold_index+1,
        'Training Loss': avg_train_loss,
        'Valid Loss': avg_val_loss,
        'Valid correlation' : val_r
    })
    
    print("\nTraining complete!\n")
    print(f"\nTotal training took {format_time(time.time()-total_t0)} (hh:mm:ss)")
    save_string = f'bert-base-chinese' + '_'+f'{fold_index+1}' + '_' + 'V'+ '_' + '0503' + '.csv'
    return training_stats, model

In [12]:
training_stats = []

In [13]:
# train for Arousal
for fold_index, dataset in enumerate(a_datasets):
    if fold_index == 0:
        continue
    if fold_index == 1:
        continue
    print(f"\n========Training fold {fold_index + 1}===========\n")
    train_loaders = [
        data for index, data in enumerate(a_datasets) if index != fold_index
    ]
    val_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    # concat multiple dataloader to oen dataloader
    train_loader = torch.utils.data.ConcatDataset(train_loaders)
    train_loader = DataLoader(train_loader, batch_size=batch_size, shuffle=True)

    print(f"TrainDataLoader contains {len(train_loader.dataset)} samples")
    batch = next(iter(train_loader))
    print("Sample data from the first batch:")
    print("Input IDs:", batch["input_ids"].shape)  # (batch_size, max_length)
    print("Attention Mask:", batch["attention_mask"].shape)  # (batch_size, max_length)
    print("Label:", batch["label"].shape)  # (batch_size, )
    print("Example sentence:", batch["input_ids"][0])
    print("Example label:", batch["label"][0].item())

    model = BERTForRegression(model_name, hidden_dim, hidden_acti, dropout_rate, 1)
    model.to(device)

    # model = load_checkpoint(f'checkpoint/bert-base-chinese_{fold_index+1}_A_0504.pt', model, device)
    
    # freeze some layers (top | middle | bottom):
    # bottom = range(2, 12)
    # middle = list(range(0,5))+list(range(7,12))
    # top = range(0, 10)
    # freeze_layers = top
    # for i in freeze_layers:
    # #   print(i)
    #   for param in model.pm.encoder.layer[i].parameters():
    #     param.requires_grad = False
    
    training_stat, model = Trainer(model, fold_index, train_loader, val_loader, training_stats)
    save_checkpoint(os.path.join('checkpoint',f'bert-base-chinese_{fold_index+1}_A_0504.pt'), model)

save_string = f"bert-base-chinese" + "_" + "A2" + "_" + "0504" + ".csv"
# to_excel(save_string, training_stats)



TrainDataLoader contains 2376 samples
Sample data from the first batch:
Input IDs: torch.Size([32, 214])
Attention Mask: torch.Size([32, 1, 214])
Label: torch.Size([32])
Example sentence: tensor([ 101,  671, 7274, 1993, 3976, 3173, 2023, 1372, 3221, 3382, 2595, 2867,
        5179, 6857,  763, 6206, 3724, 8024, 2361, 3307, 2042, 2527, 1086, 2537,
        7269, 6243, 6359, 8024, 3760, 2682, 1168, 2849, 2113, 2552, 1147, 4638,
        3976, 2038, 2038, 4684, 2970, 7300, 1062, 1385, 8024, 6206, 3724, 1961,
        4989, 1174, 6894, 1139, 6798, 1439, 8024,  699,  684, 1762, 4707, 4680,
        4727, 4727,  678, 2200, 1961, 2870, 1343, 7015, 7368, 2970, 1358,  978,
        2434, 3596, 3389,  511,  102,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          

Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  75.   Time: 0:00:54


Training loss: 1.3746547444661459


Training time: 0:01:40


Validation ........


Validation Loss: 0.8107217863986367


Validation time: 0:00:07


---------------EPOCH 2-----------------


Complete Batch 40  of  75.   Time: 0:00:55


Training loss: 0.8365668924649556


Training time: 0:01:43


Validation ........


Validation Loss: 0.8192062095591897


Validation time: 0:00:07


---------------EPOCH 3-----------------


Complete Batch 40  of  75.   Time: 0:00:55


Training loss: 0.7908128380775452


Training time: 0:01:42


Validation ........


Validation Loss: 0.8097955553155196


Validation time: 0:00:07


---------------EPOCH 4-----------------


Complete Batch 40  of  75.   Time: 0:00:56


Training loss: 0.7467973426977793


Training time: 0:01:44


Validation ........


Validation Loss: 0.733122235850284


Validation time: 0:00:07


---------------EPOCH 5-----------------


Complete Batch 40 

Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  74.   Time: 0:00:56


Training loss: 1.2882628247544572


Training time: 0:01:43


Validation ........


Validation Loss: 0.8646664242995413


Validation time: 0:00:08


---------------EPOCH 2-----------------


Complete Batch 40  of  74.   Time: 0:00:55


Training loss: 0.8293853587395436


Training time: 0:01:41


Validation ........


Validation Loss: 0.7534728332569725


Validation time: 0:00:08


---------------EPOCH 3-----------------


Complete Batch 40  of  74.   Time: 0:00:54


Training loss: 0.7546451277024037


Training time: 0:01:40


Validation ........


Validation Loss: 0.7387192468894156


Validation time: 0:00:08


---------------EPOCH 4-----------------


Complete Batch 40  of  74.   Time: 0:00:55


Training loss: 0.6518057516297778


Training time: 0:01:41


Validation ........


Validation Loss: 0.6963475346565247


Validation time: 0:00:08


---------------EPOCH 5-----------------


Complete Batch 40

Some weights of BertModel were not initialized from the model checkpoint at ckiplab/bert-base-chinese and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



---------------EPOCH 1-----------------


Complete Batch 40  of  74.   Time: 0:01:01


Training loss: 1.2912166899925954


Training time: 0:01:49


Validation ........


Validation Loss: 0.8612936390073676


Validation time: 0:00:08


---------------EPOCH 2-----------------


Complete Batch 40  of  74.   Time: 0:00:55


Training loss: 0.824120828428784


Training time: 0:01:41


Validation ........


Validation Loss: 0.8158869021817258


Validation time: 0:00:08


---------------EPOCH 3-----------------


Complete Batch 40  of  74.   Time: 0:00:54


Training loss: 0.7282856090648754


Training time: 0:01:40


Validation ........


Validation Loss: 0.7300203850394801


Validation time: 0:00:08


---------------EPOCH 4-----------------


Complete Batch 40  of  74.   Time: 0:00:55


Training loss: 0.6308715693853997


Training time: 0:01:41


Validation ........


Validation Loss: 0.8165470706789117


Validation time: 0:00:08


---------------EPOCH 5-----------------


Complete Batch 40 

In [14]:
# # train for Valence
# training_stats = []
# for fold_index, dataset in enumerate(v_datasets):
#     if fold_index == 0:
#         continue
#     print(f"\n========Training fold {fold_index + 1}===========\n")
#     train_loaders = [
#         data for index, data in enumerate(v_datasets) if index != fold_index
#     ]
#     val_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
#     # concat multiple dataloader to oen dataloader
#     train_loader = torch.utils.data.ConcatDataset(train_loaders)
#     train_loader = DataLoader(train_loader, batch_size=batch_size, shuffle=True)

#     print(f"TrainDataLoader contains {len(train_loader.dataset)} samples")
#     batch = next(iter(train_loader))
#     print("Sample data from the first batch:")
#     print("Input IDs:", batch["input_ids"].shape)  # (batch_size, max_length)
#     print("Attention Mask:", batch["attention_mask"].shape)  # (batch_size, max_length)
#     print("Label:", batch["label"].shape)  # (batch_size, )
#     print("Example sentence:", batch["input_ids"][0])
#     print("Example label:", batch["label"][0].item())

#     model = BERTForRegression(model_name, hidden_dim, hidden_acti, dropout_rate, 1)
#     model.to(device)

#     model = load_checkpoint(f'checkpoint/bert-base-chinese_{fold_index+1}_V_0504.pt', model, device)
    
#     # freeze some layers (top | middle | bottom):
#     # bottom = range(2, 12)
#     # middle = list(range(0,5))+list(range(7,12))
#     # top = range(0, 10)
#     # freeze_layers = top
#     # for i in freeze_layers:
#     # #   print(i)
#     #   for param in model.pm.encoder.layer[i].parameters():
#     #     param.requires_grad = False
    
#     training_stat, model = Trainer(model, fold_index, train_loader, val_loader, training_stats)
#     # save_checkpoint(os.path.join('checkpoint',f'bert-base-chinese_{fold_index+1}_V_0504.pt'), model)

# save_string = f"bert-base-chinese" + "_" + "V" + "_" + "0504" + ".csv"
# # to_excel(save_string, training_stats)