# In [None]:
import os
import random
import functools
import csv
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import f1_score
from datasets import Dataset, DatasetDict
# from peft import (
#     LoraConfig,
#     prepare_model_for_kbit_training,
#     get_peft_model
# )
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)
import json
# Load the training records. The file is treated as JSON Lines (one JSON object
# per line). Each line is parsed on its own instead of gluing the lines together
# with commas, so blank lines or a missing trailing newline no longer corrupt
# the payload.
with open('/mnt/bn/data-tns-live-llm/leon/experiments/llm/audioslice/20240103/dataset_train_pos_tcs_neg_pp/data', 'r', encoding='utf-8') as f:
    # NOTE: the name `json_str` is kept for compatibility with the rest of the
    # script; it holds the list of parsed records, not a string.
    json_str = [json.loads(line) for line in f if line.strip()]
print(len(json_str))

import pandas as pd
df = pd.DataFrame(json_str)
print(df.columns)
# Only the label and the raw text are used downstream.
data = df[["label","text"]]
print(data.columns)
# Call head() so the first rows are printed rather than the bound method object.
print(data.head())
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("/mnt/bn/data-tns-algo-nlp/pretrain_models/mistral")

# Encode every text without padding or truncation so the raw token-length
# distribution can be inspected before choosing a maximum length.
tokenized_feature_raw = tokenizer.batch_encode_plus(
    df.text.values.tolist(),
    # Let the tokenizer add its own special tokens (the original note named
    # BERT's '[CLS]' / '[SEP]'; which tokens are added depends on the tokenizer).
    add_special_tokens=True,
)

# Number of token ids per encoded sentence.
token_sentence_length = list(map(len, tokenized_feature_raw['input_ids']))
print('max: ', max(token_sentence_length))
print('min: ', min(token_sentence_length))

# Histogram of the sequence lengths.
import matplotlib.pyplot as plt
plt.figure(figsize=(20, 8))
plt.hist(token_sentence_length, rwidth=0.9)
plt.xlabel('Sequence Length', fontsize=18)
plt.ylabel('# of Samples', fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
# Model inputs: the raw texts and their labels.
features = df.text.values.tolist()
# Per the original note, the labels arrive as strings, so cast each one to int.
target = list(map(int, df.label.values.tolist()))

# Original note: Mistral has no padding token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Pad / truncate every sentence to a fixed length and return PyTorch tensors
# together with the attention masks.
MAX_LEN = 256
tokenized_feature = tokenizer.batch_encode_plus(
    features,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# 80% training / 20% validation, stratified on the label.
# train_test_split returns a (train, test) pair for each array, in argument order.
from sklearn.model_selection import train_test_split
(
    train_inputs, validation_inputs,
    train_labels, validation_labels,
    train_masks, validation_masks,
) = train_test_split(
    tokenized_feature['input_ids'],
    target,
    tokenized_feature['attention_mask'],
    test_size=0.2,
    random_state=2018,
    stratify=target,
)

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

batch_size = 8

# Training loader: samples are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, torch.tensor(train_labels))
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

# Validation loader: samples are visited in their stored order.
validation_data = TensorDataset(validation_inputs, validation_masks, torch.tensor(validation_labels))
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)
# Model setup: 4-bit quantized Mistral sequence classifier with LoRA adapters,
# plus the optimizer and learning-rate scheduler used by the manual loop below.
from transformers import XLMRobertaForSequenceClassification, get_linear_schedule_with_warmup, AutoModelForSequenceClassification
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated
# in favour of torch.optim.AdamW.
from torch.optim import AdamW
# Earlier alternative kept for reference:
# model = XLMRobertaForSequenceClassification.from_pretrained(
#     "xlm-roberta-large",
#     # Specify number of classes
#     num_labels = len(set(target)),
#     # Whether the model returns attentions weights
#     output_attentions = False,
#     # Whether the model returns all hidden-states
#     output_hidden_states = False
# )

from peft import (
    LoraConfig,
    prepare_model_for_kbit_training,
    get_peft_model,
    TaskType
)
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer
)

# quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization
    bnb_4bit_quant_type = 'nf4', # information theoretically optimal dtype for normally distributed weights
    bnb_4bit_use_double_quant = True, # also quantize the quantization constants
    bnb_4bit_compute_dtype = torch.bfloat16 # compute in bfloat16
)

# lora config
lora_config = LoraConfig(
    r = 16, # the dimension of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)


model = AutoModelForSequenceClassification.from_pretrained(
    "/mnt/bn/data-tns-algo-nlp/pretrain_models/mistral/models--mistralai--Mistral-7B-v0.1/snapshots/26bca36bde8333b5d7f72e9ed20ccda6a618af24/", 
    # Specify number of classes
    num_labels = len(set(target)), 
    # Whether the model returns attentions weights
    output_attentions = False,
    # Whether the model returns all hidden-states 
    output_hidden_states = False,
    quantization_config=quantization_config
)

# The tokenizer pads with its EOS token (set earlier in this script); tell the
# model which id is padding so the classification head can handle padded
# batches of more than one sequence.
model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the quantized base model with trainable LoRA adapters. Without this the
# `print_trainable_parameters()` call below does not exist on the model, and
# the optimizer would be handed the frozen quantized weights.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
print(model)
model.print_trainable_parameters()
# Example output recorded in the original notebook (presumably from a
# different model — the numbers do not match a 7B checkpoint):
# output: trainable params: 786,432
#      || all params: 331,982,848
#      || trainable%: 0.2368893467652883

# Optimizer & Learning Rate Scheduler
# Only parameters that require gradients are optimized. weight_decay=0.0 matches
# the default of the transformers AdamW this call previously used.
optimizer = AdamW([p for p in model.parameters() if p.requires_grad],
                  lr = 2e-5, 
                  eps = 1e-8,
                  weight_decay = 0.0
                )
# ---------------- hyperparameters ---------------- #
eval_step = 1
epochs = 50
# ---------------- hyperparameters ---------------- #
from torch.optim.lr_scheduler import CosineAnnealingLR
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler (T_max is expressed in optimizer steps).
# scheduler = get_linear_schedule_with_warmup(optimizer,
#                                             num_warmup_steps = 0,
#                                             num_training_steps = total_steps)
scheduler = CosineAnnealingLR(optimizer, total_steps)
# tell pytorch to run this model on GPU
# import torch.nn as nn
# import os

# os.environ['CUDA_VISIBLE_DEVICES'] = '0,1'
# torch.distributed.init_process_group(backend='nccl')
# # model = nn.DataParallel(model, device_ids=[0, 1])
model.cuda()
# model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True)

# List every parameter name for inspection.
print(model.named_parameters())
for name, param in model.named_parameters():
    print(name)
# compute metrics
import numpy as np
def compute_metrics(logits, labels):
    """Compute accuracy, recall and mean class confidences for a binary batch.

    :param logits: PyTorch tensor of model output logits; column 0 is read as
        the negative class and column 1 as the positive class.
    :param labels: PyTorch tensor of ground-truth labels (0 or 1).
    :return: tuple ``(accuracy, recall, confidence_pos, confidence_neg)``.
        Recall is 0 when the batch has no positive labels; a confidence is 0
        when the batch has no sample of that class.
    :raises AssertionError: if predictions and labels do not all fall into the
        four binary confusion-matrix cells.
    """
    predicted = torch.argmax(logits, dim=1)
    is_pos = labels == 1
    is_neg = labels == 0
    said_pos = predicted == 1
    said_neg = predicted == 0

    # Confusion-matrix cells, kept as float tensors.
    tp = torch.sum(said_pos & is_pos).float()
    fp = torch.sum(said_pos & is_neg).float()
    tn = torch.sum(said_neg & is_neg).float()
    fn = torch.sum(said_neg & is_pos).float()
    n_counted = tp + fp + tn + fn
    assert n_counted == labels.shape[0]

    accuracy = ((tp + tn) / n_counted).item()

    # Recall is undefined without positives; report 0 in that case.
    n_actual_pos = tp + fn
    recall = (tp / n_actual_pos).item() if n_actual_pos else 0

    def _mean_or_zero(values):
        # The mean of an empty selection is NaN; map it to 0.
        avg = values.mean().item()
        return 0 if np.isnan(avg) else avg

    # Mean probability assigned to the true class, per class.
    probabilities = torch.softmax(logits, dim=1)
    confidence_pos = _mean_or_zero(probabilities[:, 1][is_pos])
    confidence_neg = _mean_or_zero(probabilities[:, 0][is_neg])

    return (accuracy, recall, confidence_pos, confidence_neg)
import torch.nn.functional as F
# Focal-loss hyperparameters: focusing exponent and class-balance weight.
focalloss_gamma = 2
focalloss_alpha = 0.25


def compute_loss(model, input_ids, input_mask, labels, return_outputs=False):
    """Run a forward pass and compute a sigmoid focal loss on the logits.

    :param model: callable invoked as
        ``model(input_ids, attention_mask=..., labels=...)``; its result must
        expose ``.get("logits")``.
    :param input_ids: token ids forwarded to the model.
    :param input_mask: attention mask forwarded to the model.
    :param labels: integer class ids; one-hot encoded to the logits' last dim.
    :param return_outputs: unused; the function always returns both values.
    :return: ``(loss, outputs)`` where ``loss`` is the mean focal loss over
        all elements and ``outputs`` is the raw model result.
    """
    # Labels are also passed to the model: callers index the returned outputs
    # positionally (outputs[1] as logits), which relies on the loss coming first.
    outputs = model(input_ids, attention_mask=input_mask, labels=labels)
    logits = outputs.get("logits")
    one_hot = F.one_hot(labels, num_classes=logits.shape[-1])

    # Element-wise binary cross-entropy against the one-hot targets.
    bce = F.binary_cross_entropy_with_logits(logits, one_hot.float(), reduction="none")

    # Probability assigned to the correct side of every binary decision.
    prob = torch.sigmoid(logits)
    p_t = prob * one_hot + (1 - prob) * (1 - one_hot)

    # Down-weight well-classified elements.
    focal = bce * ((1 - p_t) ** focalloss_gamma)

    # Optional class balancing; a negative alpha disables it.
    if focalloss_alpha >= 0:
        balance = focalloss_alpha * one_hot + (1 - focalloss_alpha) * (1 - one_hot)
        focal = balance * focal

    return focal.mean(), outputs

def evaluate(model, validation_dataloader):
    """Evaluate the model on a validation loader.

    Switches the model to evaluation mode and runs every batch through
    ``compute_loss`` / ``compute_metrics`` with gradient tracking disabled, so
    no autograd graph is built during validation.

    :param model: model to evaluate; it is left in evaluation mode.
    :param validation_dataloader: iterable of ``(input_ids, attention_mask,
        labels)`` batches.
    :return: ``(val_loss, val_accuracy, val_recall, val_pos_prob,
        val_neg_prob)``. Each value is the mean of the per-batch values, not a
        metric recomputed over the whole set.
    """
    # Set model to evaluation mode
    model.eval()
    device = torch.device("cuda")
    # Per-batch metrics
    val_loss_values = []
    val_accuracy_values = []
    val_recall_values = []
    val_confidence_pos_values = []
    val_confidence_neg_values = []

    # Validation needs no gradients; disabling them avoids building a graph
    # for every batch.
    with torch.no_grad():
        for batch in validation_dataloader:
            # Load batch to GPU
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Forward pass + focal loss
            loss, outputs = compute_loss(model, b_input_ids, b_input_mask, b_labels)
            val_loss_values.append(loss.item())

            # Read the logits by name, the same way compute_loss does.
            logits = outputs.get("logits")
            accuracy, recall, confidence_pos, confidence_neg = compute_metrics(logits, b_labels)

            val_accuracy_values.append(accuracy)
            val_recall_values.append(recall)
            val_confidence_pos_values.append(confidence_pos)
            val_confidence_neg_values.append(confidence_neg)

    # Average the per-batch values
    val_loss = np.mean(val_loss_values)
    val_accuracy = np.mean(val_accuracy_values)
    val_recall = np.mean(val_recall_values)
    val_pos_prob = np.mean(val_confidence_pos_values)
    val_neg_prob = np.mean(val_confidence_neg_values)

    return val_loss, val_accuracy, val_recall, val_pos_prob, val_neg_prob

from tqdm import tqdm
# Training
import time
import torch
from torch.utils.tensorboard import SummaryWriter
import transformers

# Manual training loop: focal-loss training with per-epoch TensorBoard logging
# and periodic validation; the checkpoint with the best validation recall is
# saved to "best_model".
writer = SummaryWriter("/opt/tiger/leon/runs/")
device = torch.device("cuda")
# Average training loss of each finished epoch
loss_values = []
# len(train_dataloader) is already the number of batches, i.e. steps per epoch
print('total steps per epoch: ',  len(train_dataloader))

# Train only the classification head (disabled)
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False

best_val_recall = 0
for epoch_i in range(epochs):
    # Per-batch metrics collected during this epoch
    train_loss_values = []
    train_accuracy_values = []
    train_recall_values = []
    train_confidence_pos_values = []
    train_confidence_neg_values = []

    # Set model to training mode (evaluate() switches it to eval mode)
    model.train()

    # Learning rate at the start of the epoch (the scheduler moves it every step)
    lr = optimizer.param_groups[0]['lr']

    # Loop through batches of the training dataset using tqdm for progress tracking
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch_i+1}/{epochs}", unit="batch")
    for batch in progress_bar:
        # Load batch to GPU
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear gradients
        optimizer.zero_grad()

        # Forward pass + focal loss
        loss, outputs = compute_loss(model,b_input_ids,b_input_mask,b_labels)
        train_loss_values.append(loss.item())

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()
        # Advance the schedule once per optimizer step: its horizon
        # (total_steps) is counted in batches, so without this call the
        # learning rate would never change.
        scheduler.step()

        # Batch metrics; detach so no graph is kept alive through the logits
        logits = outputs.get("logits").detach()
        accuracy, recall, confidence_pos, confidence_neg = compute_metrics(logits, b_labels)
        train_accuracy_values.append(accuracy)
        train_recall_values.append(recall)
        train_confidence_pos_values.append(confidence_pos)
        train_confidence_neg_values.append(confidence_neg)

    # Epoch averages of the per-batch values
    train_loss = np.mean(train_loss_values)
    accuracy = np.mean(train_accuracy_values)
    recall = np.mean(train_recall_values)
    pos_prob = np.mean(train_confidence_pos_values)
    neg_prob = np.mean(train_confidence_neg_values)
    loss_values.append(train_loss)

    # Write the epoch averages (not the last batch's values) to TensorBoard
    writer.add_scalar("Training Loss", train_loss, epoch_i)
    writer.add_scalar("Training Accuracy", accuracy, epoch_i)
    writer.add_scalar("Training Recall", recall, epoch_i)
    writer.add_scalar("Positive Confidence", pos_prob, epoch_i)
    writer.add_scalar("Negative Confidence", neg_prob, epoch_i)
    writer.add_scalar("Learning Rate", lr, epoch_i)

    # Evaluate the model on the validation set every `eval_step` epochs
    if (epoch_i + 1) % eval_step == 0:
        val_loss, val_accuracy, val_recall, val_confidence_pos, val_confidence_neg = evaluate(model, validation_dataloader)
        print(f"Epoch {epoch_i+1}/{epochs} validation loss: {val_loss:.4f}; accuracy: {val_accuracy:.4f}; recall: {val_recall:.4f}; pos confidence: {val_confidence_pos:.4f}; neg confidence: {val_confidence_neg:.4f}")
        # Write validation metrics to TensorBoard
        writer.add_scalar("Validation Loss", val_loss, epoch_i)
        writer.add_scalar("Validation Accuracy", val_accuracy, epoch_i)
        writer.add_scalar("Validation Recall", val_recall, epoch_i)
        writer.add_scalar("Validation Positive Confidence", val_confidence_pos, epoch_i)
        writer.add_scalar("Validation Negative Confidence", val_confidence_neg, epoch_i)

        # Keep the checkpoint with the highest validation recall seen so far
        if val_recall > best_val_recall:
            model.save_pretrained("best_model")
            best_val_recall = val_recall
    # Print metrics for this epoch
    print(f"Epoch {epoch_i+1}/{epochs} training loss: {train_loss:.4f}; accuracy: {accuracy:.4f}; recall: {recall:.4f}; pos confidence: {pos_prob:.4f}; neg confidence: {neg_prob:.4f}")

# Make sure buffered scalars reach disk
writer.flush()

# Output visualisation: print shapes, the model's own loss, the logits and the
# batch metrics for the first 10 training batches.
cnt = 0
# Inspection only, so gradients are not tracked.
with torch.no_grad():
    for batch in train_dataloader:
        cnt += 1
        print(batch[0].shape) #input
        print(batch[1].shape) #mask
        print(batch[2].shape) #label
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Same call signature as compute_loss in this file; the BERT-style
        # token_type_ids argument is not passed.
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        print(outputs[0]) #loss
        print(outputs[1]) #logits
        logits = outputs[1]
        # Batch accuracy, recall, and positive / negative confidence values
        accuracy, recall, confidence_pos, confidence_neg = compute_metrics(logits, b_labels)
        print(f"training loss: {outputs[0]:.4f}; accuracy: {accuracy:.4f}; recall: {recall:.4f}; pos confidence: {confidence_pos:.4f}; neg confidence: {confidence_neg:.4f}")
        if cnt == 10: break
from sklearn.metrics import f1_score
# define custom batch preprocessor
def collate_fn(batch, tokenizer):
    """Collate samples into a padded batch dictionary.

    Each sample may be a mapping with the keys ``input_ids``,
    ``attention_mask`` and ``labels``, or a positional sequence in that same
    order (which is what the ``TensorDataset`` objects in this script yield).

    :param batch: list of samples.
    :param tokenizer: object exposing ``pad_token_id``, used to pad ``input_ids``.
    :return: dict with ``input_ids`` and ``attention_mask`` padded to the
        longest sample in the batch (mask padded with 0) and stacked ``labels``.
    """
    dict_keys = ['input_ids', 'attention_mask', 'labels']
    # Normalise positional samples to mappings so both layouts share one path.
    samples = [
        sample if hasattr(sample, 'keys') else dict(zip(dict_keys, sample))
        for sample in batch
    ]
    # as_tensor leaves tensors untouched and converts plain lists / ints.
    d = {k: [torch.as_tensor(sample[k]) for sample in samples] for k in dict_keys}
    d['input_ids'] = torch.nn.utils.rnn.pad_sequence(
        d['input_ids'], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    d['attention_mask'] = torch.nn.utils.rnn.pad_sequence(
        d['attention_mask'], batch_first=True, padding_value=0
    )
    d['labels'] = torch.stack(d['labels'])
    return d

# define which metrics to compute for evaluation
def compute_metrics(p):
    """Compute micro / macro / weighted F1 for a Trainer evaluation.

    NOTE: this rebinds the module-level name ``compute_metrics`` that the
    manual training loop earlier in this file also uses (there with a
    ``(logits, labels)`` signature).

    :param p: pair ``(predictions, labels)``; predictions are per-class logits.
    :return: dict with ``f1_micro``, ``f1_macro`` and ``f1_weighted``.
    """
    predictions, labels = p
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    if labels.ndim == 1 and predictions.ndim == 2:
        # Single-label case (class ids): pick the highest-scoring class so the
        # predictions have the same shape as the labels.
        predicted = predictions.argmax(axis=-1)
    else:
        # Multilabel case (indicator matrix): threshold each logit at 0.
        predicted = predictions > 0
    f1_micro = f1_score(labels, predicted, average = 'micro')
    f1_macro = f1_score(labels, predicted, average = 'macro')
    f1_weighted = f1_score(labels, predicted, average = 'weighted')
    return {
        'f1_micro': f1_micro,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    }
# create custom trainer class to be able to pass label weights and calculate mutilabel loss
class CustomTrainer(Trainer):
    """Trainer that replaces the model's built-in loss with a sigmoid focal loss.

    Reads the module-level ``focalloss_gamma`` and ``focalloss_alpha`` settings.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the focal loss for one batch.

        :param model: model invoked with every entry of ``inputs`` except
            ``labels``.
        :param inputs: batch mapping; must contain ``labels`` (class ids). It
            is not modified.
        :param return_outputs: when true return ``(loss, outputs)``, otherwise
            return only the loss, as the Trainer's training step expects.
        :param kwargs: extra keyword arguments are accepted and ignored
            (newer Trainer versions pass additional ones such as
            ``num_items_in_batch``).
        :return: the scalar loss, or ``(loss, outputs)``.
        """
        labels = inputs["labels"]
        # Labels are withheld from the model so it does not compute its own
        # loss; the caller's dict is left intact.
        model_inputs = {k: v for k, v in inputs.items() if k != "labels"}
        outputs = model(**model_inputs)
        logits = outputs.get("logits")
        targets = F.one_hot(labels, num_classes=logits.shape[-1])

        # Element-wise sigmoid focal loss against the one-hot targets.
        p = torch.sigmoid(logits)
        ce_loss = F.binary_cross_entropy_with_logits(logits, targets.float(), reduction="none")
        p_t = p * targets + (1 - p) * (1 - targets)
        loss = ce_loss * ((1 - p_t) ** focalloss_gamma)

        # Optional class balancing; a negative alpha disables it.
        if focalloss_alpha >= 0:
            alpha_t = focalloss_alpha * targets + (1 - focalloss_alpha) * (1 - targets)
            loss = alpha_t * loss

        loss = loss.mean()

        return (loss, outputs) if return_outputs else loss
import functools
#define training args
# Trainer-based run: evaluate and checkpoint once per epoch, then reload the
# best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir='multilabel_classification',
    num_train_epochs=10,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
)

# The collator needs the tokenizer for its pad id, so bind it up front.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=validation_data,
    compute_metrics=compute_metrics,
    data_collator=functools.partial(collate_fn, tokenizer=tokenizer),
)

trainer.train()

