In [1]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig, DataCollatorWithPadding
from peft import PeftModel
from datasets import Dataset, load_dataset
from torch.utils.data import DataLoader
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_auc_score,
)
import random
import csv
from tqdm import tqdm
import pandas as pd

In [2]:
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)
predictions_file = os.path.join(results_dir, "loan_predictions.csv")
metrics_file = os.path.join(results_dir, "metrics.txt")

if not os.path.exists(predictions_file):
    with open(predictions_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["Loan_ID", "Loan_Data", "True_Label", "Predicted_Label", "Probability"])
    print(f"Result file '{predictions_file}' initialized.")

In [3]:
# 基础模型名称和训练后的模型路径
base_model_name = "model/Mistral-7B-Instruct-v0.3"
model_path = "outputs/mistral-7b-instruct-v0.3-0926(augmented)/checkpoint-35000"

In [4]:
try:
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    print("Tokenizer loaded from model_path.")
except:
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)
    print("Tokenizer not found in model_path, loaded from base_model_name.")

Tokenizer loaded from model_path.


In [5]:
# 检查并添加pad_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    print("Added pad token to tokenizer.")
    tokenizer.save_pretrained(model_path)
else:
    print(f"Tokenizer already has pad_token: {tokenizer.pad_token}")

Tokenizer already has pad_token: [PAD]


In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

base_model = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    quantization_config=quantization_config,
    device_map={"": device}, 
    num_labels=2,
)

This can be used to load a bitsandbytes version that is different from the PyTorch CUDA version.
If this was unintended set the BNB_CUDA_VERSION variable to an empty string: export BNB_CUDA_VERSION=
If you use the manual override make sure the right libcudart.so is in your LD_LIBRARY_PATH
For example by adding the following to your .bashrc: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:<path_to_cuda_dir/lib64



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at model/Mistral-7B-Instruct-v0.3 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# 调整基础模型的嵌入层尺寸
base_model.resize_token_embeddings(len(tokenizer))
print(f"Resized model embeddings to match tokenizer vocab size: {len(tokenizer)}")

Resized model embeddings to match tokenizer vocab size: 32769


In [9]:
model = PeftModel.from_pretrained(
    base_model,
    model_path,
    device_map={"": device},
)
print("LoRA adapter loaded.")

LoRA adapter loaded.


In [10]:
model.config.pad_token_id = tokenizer.pad_token_id

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()
print(f"Model moved to device: {device}")

Model moved to device: cuda


In [12]:
loan_data_path = "example"
dataset = load_dataset(loan_data_path)
val_data = dataset["validation"]

label_1_data = [data for data in val_data if data['label'] == 1]
label_0_data = [data for data in val_data if data['label'] == 0]

num_label_1 = len(label_1_data)
balanced_label_0_data = random.sample(label_0_data, num_label_1)
balanced_data = label_1_data + balanced_label_0_data

# random data layout
random.shuffle(balanced_data)

dataset = Dataset.from_list(balanced_data)

In [13]:
def preprocess_data(examples):
    examples["loan_data"] = examples.pop("text")
    examples["labels"] = int(examples.pop("label"))
    return examples

test_data = dataset.map(preprocess_data)

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

In [14]:
def tokenize_function(examples):
    return tokenizer(
        examples["loan_data"],
        padding="longest",
        truncation=True,
        max_length=256,
    )

tokenized_test_data = test_data.map(tokenize_function, batched=True)
tokenized_test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

In [15]:
# DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

test_dataloader = DataLoader(
    tokenized_test_data,
    batch_size=32,           # 根据显存情况调整
    shuffle=False,
    collate_fn=data_collator,  # 使用 DataCollator 处理填充
)

print("Test DataLoader created.")

Test DataLoader created.


In [16]:
all_labels = []
all_preds = []
all_probs = []
all_ids = []  # 存储 Loan_ID 或其他唯一标识符

In [17]:
with torch.no_grad():

    with open(predictions_file, mode='a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        
        for batch in tqdm(test_dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            
            # softmax获取概率分布
            probs = torch.softmax(logits, dim=-1).cpu()
            preds = torch.argmax(probs, dim=-1).cpu().numpy()
            prob_positive = probs[:, 1].cpu().numpy()
            
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds)
            all_probs.extend(prob_positive)
            
            if 'id' in test_data.column_names:
                ids = batch['id'].cpu().numpy()
                texts = test_data['loan_data'][batch['input_ids'].indices]
            else:
                # 使用 DataLoader 的索引
                start_idx = len(all_labels) - len(labels)
                ids = list(range(start_idx, start_idx + len(labels)))
                texts = test_data['loan_data'][start_idx : start_idx + len(labels)]
            
            for i in range(len(labels)):
                writer.writerow([
                    ids[i],
                    texts[i],
                    labels[i].item(),
                    preds[i].item(),
                    prob_positive[i]
                ])
            
            print(f"Batch processed. Sample predictions:")
            for i in range(min(5, len(labels))):
                print(f"ID: {ids[i]}, True: {labels[i].item()}, Pred: {preds[i].item()}, Prob: {prob_positive[i]:.4f}")
            print("-" * 50)

print("Predictions completed.")

Evaluating:   0%|          | 0/17 [00:00<?, ?it/s]We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)
Evaluating:   6%|▌         | 1/17 [00:02<00:45,  2.83s/it]

Batch processed. Sample predictions:
ID: 0, True: 0, Pred: 0, Prob: 0.2737
ID: 1, True: 1, Pred: 1, Prob: 0.7339
ID: 2, True: 0, Pred: 1, Prob: 0.9336
ID: 3, True: 0, Pred: 0, Prob: 0.4111
ID: 4, True: 1, Pred: 0, Prob: 0.2043
--------------------------------------------------


Evaluating:  12%|█▏        | 2/17 [00:04<00:31,  2.08s/it]

Batch processed. Sample predictions:
ID: 32, True: 1, Pred: 1, Prob: 0.9976
ID: 33, True: 1, Pred: 1, Prob: 0.8613
ID: 34, True: 0, Pred: 1, Prob: 0.9136
ID: 35, True: 1, Pred: 0, Prob: 0.1571
ID: 36, True: 1, Pred: 1, Prob: 0.7607
--------------------------------------------------


Evaluating:  18%|█▊        | 3/17 [00:05<00:25,  1.84s/it]

Batch processed. Sample predictions:
ID: 64, True: 1, Pred: 1, Prob: 0.6548
ID: 65, True: 0, Pred: 0, Prob: 0.0368
ID: 66, True: 0, Pred: 0, Prob: 0.3850
ID: 67, True: 0, Pred: 1, Prob: 0.9570
ID: 68, True: 1, Pred: 1, Prob: 0.8740
--------------------------------------------------


Evaluating:  24%|██▎       | 4/17 [00:07<00:22,  1.73s/it]

Batch processed. Sample predictions:
ID: 96, True: 0, Pred: 0, Prob: 0.0003
ID: 97, True: 0, Pred: 0, Prob: 0.0474
ID: 98, True: 1, Pred: 1, Prob: 0.9663
ID: 99, True: 0, Pred: 1, Prob: 0.7280
ID: 100, True: 1, Pred: 0, Prob: 0.2018
--------------------------------------------------


Evaluating:  29%|██▉       | 5/17 [00:09<00:20,  1.67s/it]

Batch processed. Sample predictions:
ID: 128, True: 0, Pred: 0, Prob: 0.3557
ID: 129, True: 0, Pred: 1, Prob: 0.9775
ID: 130, True: 1, Pred: 0, Prob: 0.0000
ID: 131, True: 0, Pred: 0, Prob: 0.0039
ID: 132, True: 0, Pred: 0, Prob: 0.0100
--------------------------------------------------


Evaluating:  35%|███▌      | 6/17 [00:10<00:17,  1.62s/it]

Batch processed. Sample predictions:
ID: 160, True: 1, Pred: 1, Prob: 0.5698
ID: 161, True: 1, Pred: 1, Prob: 0.8115
ID: 162, True: 0, Pred: 1, Prob: 0.8232
ID: 163, True: 0, Pred: 1, Prob: 0.7373
ID: 164, True: 1, Pred: 0, Prob: 0.2539
--------------------------------------------------


Evaluating:  41%|████      | 7/17 [00:12<00:15,  1.60s/it]

Batch processed. Sample predictions:
ID: 192, True: 0, Pred: 0, Prob: 0.1251
ID: 193, True: 0, Pred: 0, Prob: 0.3557
ID: 194, True: 1, Pred: 1, Prob: 0.6152
ID: 195, True: 0, Pred: 0, Prob: 0.0321
ID: 196, True: 0, Pred: 0, Prob: 0.0000
--------------------------------------------------


Evaluating:  47%|████▋     | 8/17 [00:13<00:14,  1.58s/it]

Batch processed. Sample predictions:
ID: 224, True: 0, Pred: 0, Prob: 0.0863
ID: 225, True: 1, Pred: 1, Prob: 0.5776
ID: 226, True: 0, Pred: 0, Prob: 0.0032
ID: 227, True: 1, Pred: 1, Prob: 0.6372
ID: 228, True: 1, Pred: 0, Prob: 0.2583
--------------------------------------------------


Evaluating:  53%|█████▎    | 9/17 [00:15<00:12,  1.58s/it]

Batch processed. Sample predictions:
ID: 256, True: 1, Pred: 1, Prob: 0.7905
ID: 257, True: 0, Pred: 0, Prob: 0.0090
ID: 258, True: 1, Pred: 1, Prob: 0.8267
ID: 259, True: 1, Pred: 1, Prob: 0.8057
ID: 260, True: 0, Pred: 1, Prob: 0.5391
--------------------------------------------------


Evaluating:  59%|█████▉    | 10/17 [00:16<00:11,  1.57s/it]

Batch processed. Sample predictions:
ID: 288, True: 0, Pred: 0, Prob: 0.0478
ID: 289, True: 1, Pred: 1, Prob: 0.6226
ID: 290, True: 1, Pred: 0, Prob: 0.3040
ID: 291, True: 1, Pred: 1, Prob: 0.6704
ID: 292, True: 1, Pred: 0, Prob: 0.2310
--------------------------------------------------


Evaluating:  65%|██████▍   | 11/17 [00:18<00:09,  1.55s/it]

Batch processed. Sample predictions:
ID: 320, True: 0, Pred: 0, Prob: 0.0019
ID: 321, True: 1, Pred: 1, Prob: 0.9175
ID: 322, True: 0, Pred: 0, Prob: 0.2751
ID: 323, True: 1, Pred: 1, Prob: 0.8975
ID: 324, True: 0, Pred: 0, Prob: 0.2480
--------------------------------------------------


Evaluating:  71%|███████   | 12/17 [00:19<00:07,  1.56s/it]

Batch processed. Sample predictions:
ID: 352, True: 0, Pred: 0, Prob: 0.1755
ID: 353, True: 1, Pred: 1, Prob: 0.5156
ID: 354, True: 0, Pred: 0, Prob: 0.0100
ID: 355, True: 0, Pred: 1, Prob: 0.7090
ID: 356, True: 1, Pred: 0, Prob: 0.0002
--------------------------------------------------


Evaluating:  76%|███████▋  | 13/17 [00:21<00:06,  1.55s/it]

Batch processed. Sample predictions:
ID: 384, True: 1, Pred: 1, Prob: 0.8066
ID: 385, True: 0, Pred: 0, Prob: 0.0099
ID: 386, True: 1, Pred: 1, Prob: 0.9805
ID: 387, True: 0, Pred: 1, Prob: 0.9185
ID: 388, True: 0, Pred: 0, Prob: 0.2878
--------------------------------------------------


Evaluating:  82%|████████▏ | 14/17 [00:22<00:04,  1.55s/it]

Batch processed. Sample predictions:
ID: 416, True: 1, Pred: 0, Prob: 0.4148
ID: 417, True: 1, Pred: 1, Prob: 0.8740
ID: 418, True: 0, Pred: 0, Prob: 0.4961
ID: 419, True: 1, Pred: 1, Prob: 0.5176
ID: 420, True: 1, Pred: 1, Prob: 0.8428
--------------------------------------------------


Evaluating:  88%|████████▊ | 15/17 [00:24<00:03,  1.55s/it]

Batch processed. Sample predictions:
ID: 448, True: 1, Pred: 1, Prob: 0.9707
ID: 449, True: 0, Pred: 0, Prob: 0.4456
ID: 450, True: 1, Pred: 1, Prob: 0.9497
ID: 451, True: 1, Pred: 0, Prob: 0.1461
ID: 452, True: 0, Pred: 1, Prob: 0.6582
--------------------------------------------------


Evaluating:  94%|█████████▍| 16/17 [00:26<00:01,  1.53s/it]

Batch processed. Sample predictions:
ID: 480, True: 0, Pred: 0, Prob: 0.0804
ID: 481, True: 1, Pred: 1, Prob: 0.8481
ID: 482, True: 1, Pred: 1, Prob: 0.9658
ID: 483, True: 0, Pred: 0, Prob: 0.3040
ID: 484, True: 1, Pred: 1, Prob: 0.8198
--------------------------------------------------


Evaluating: 100%|██████████| 17/17 [00:26<00:00,  1.56s/it]

Batch processed. Sample predictions:
ID: 512, True: 1, Pred: 0, Prob: 0.4980
ID: 513, True: 0, Pred: 0, Prob: 0.0237
ID: 514, True: 0, Pred: 0, Prob: 0.2338
ID: 515, True: 1, Pred: 0, Prob: 0.1060
ID: 516, True: 0, Pred: 0, Prob: 0.0888
--------------------------------------------------
Predictions completed.





In [18]:
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, zero_division=0)
recall = recall_score(all_labels, all_preds, zero_division=0)
f1 = f1_score(all_labels, all_preds, zero_division=0)
tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
auc = roc_auc_score(all_labels, all_probs)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall (Sensitivity): {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"AUC: {auc:.4f}")

result_dir = 'results'
metrics_file_path = os.path.join(result_dir, "metrics.txt")
with open(metrics_file_path, mode='w', encoding='utf-8') as f:
    f.write(f"Accuracy: {accuracy:.4f}\n")
    f.write(f"Precision: {precision:.4f}\n")
    f.write(f"Recall (Sensitivity): {recall:.4f}\n")
    f.write(f"F1 Score: {f1:.4f}\n")
    f.write(f"Specificity: {specificity:.4f}\n")
    f.write(f"AUC: {auc:.4f}\n")

print(f"Metrics saved to '{metrics_file_path}'.")

Accuracy: 0.7181
Precision: 0.7165
Recall (Sensitivity): 0.7220
F1 Score: 0.7192
Specificity: 0.7143
AUC: 0.7647
Metrics saved to 'results/metrics.txt'.
