In [1]:
import warnings
warnings.filterwarnings('ignore')
import os
import random
import torch
import torch.nn as nn
import numpy as np
import gc
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
from peft import LoraConfig, get_peft_model
from collections import Counter

from sklearn.utils.class_weight import compute_class_weight

In [2]:
loan_data_path = "example"
dataset = load_dataset(loan_data_path)

In [3]:
def preprocess_data(examples):
    examples["loan_data"] = examples.pop("text")
    examples["labels"] = int(examples.pop("label"))
    return examples

In [4]:
train_data = dataset["train"]
train_data = train_data.map(preprocess_data)
train_data = [data for data in train_data]

In [5]:
# 过采样
def oversample_data(data):
    labels = [example['labels'] for example in data]
    label_counts = Counter(labels)
    majority_class = label_counts.most_common(1)[0][0]
    minority_class = 1 - majority_class
    minority_data = [d for d in data if d['labels'] == minority_class]
    oversampled_minority_data = minority_data * (label_counts[majority_class] // label_counts[minority_class])
    return data + oversampled_minority_data

balanced_train_data = oversample_data(train_data)
random.shuffle(balanced_train_data)
dataset = Dataset.from_list(balanced_train_data)
print("dataset_size:", dataset)

dataset_size: Dataset({
    features: ['loan_data', 'labels'],
    num_rows: 333233
})


In [6]:
model_name = "model/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [7]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [8]:
# 8-bit量化
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto",
    num_labels=2,
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at model/Mistral-7B-Instruct-v0.3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.resize_token_embeddings(len(tokenizer))

Embedding(32769, 4096)

In [11]:
# pad_token_id
model.config.pad_token_id = tokenizer.pad_token_id

In [12]:
# LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout = 0.05,
    bias = "none",
    use_rslora = False,
    loftq_config = None,
    task_type="SEQ_CLS",
)
model = get_peft_model(model, lora_config)

In [13]:
model.gradient_checkpointing_enable()

In [14]:
def tokenize_function(examples):
    return tokenizer(
        examples["loan_data"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )

In [15]:
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/333233 [00:00<?, ? examples/s]

In [16]:
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [17]:
# 评估指标
from sklearn.metrics import classification_report, roc_auc_score, average_precision_score

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    labels = p.label_ids
    report = classification_report(labels, preds, output_dict=True)
    auc_roc = roc_auc_score(labels, p.predictions[:,1])
    auc_pr = average_precision_score(labels, p.predictions[:,1])
    return {
        'accuracy': report['accuracy'],
        'f1': report['1']['f1-score'],
        'auc_roc': auc_roc,
        'auc_pr': auc_pr
    }

In [18]:
training_args = TrainingArguments(
    output_dir="outputs/mistral-7b-instruct-v0.3-0926(augmented)",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    warmup_steps=5,
    learning_rate=2e-4,
    optim="adamw_8bit",
    lr_scheduler_type="linear",
    weight_decay=0.01,
    seed=11,
    logging_dir='./logs',
    logging_strategy='steps',
    logging_steps=10,
    save_strategy="steps",
    save_steps=5000,
    evaluation_strategy="no",
    report_to=[],
    fp16 = not torch.cuda.is_bf16_supported(),
    bf16 = torch.cuda.is_bf16_supported(),
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=None,
    compute_metrics=compute_metrics,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,1.4767
20,1.299
30,1.1274
40,1.0579
50,0.9952
60,0.8384
70,0.979
80,0.8654
90,0.7386
100,0.816
