In [23]:
import warnings
warnings.filterwarnings('ignore')
import os
import re
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from unsloth import FastLanguageModel
import wandb
from datasets import load_dataset, Dataset, DatasetDict
from trl import SFTTrainer
from transformers import TrainingArguments

In [24]:
# Per-feature weights for the loan data fields. Keys are the field names
# that appear in the "Name: value" loan text.
feature_importance = dict([
    ("Loan amount", 1.2),
    ("DTI", 1.5),
    ("Employment Title", 0.8),
    ("Employment Length", 1.0),
    ("Home Ownership", 1.1),
    ("Annual Income", 1.6),
    ("Verification Status", 1.0),
    ("Grade", 2.0),
    ("Purpose", 0.9),
    ("Description", 0.7),
    ("Title", 0.8),
    ("Open Accounts", 1.3),
])

In [25]:
# Dataset location handed to `load_dataset`.
loan_data_path = "example"
dataset = load_dataset(path=loan_data_path)

In [26]:
# Keep a handle on the training split and display its summary.
train_data = dataset["train"]
print(train_data)

Dataset({
    features: ['text', 'label'],
    num_rows: 167153
})


In [27]:
# Balance the two classes by down-sampling to the size of the smaller one.
# A dedicated, seeded RNG makes the selected subset and its order
# reproducible across kernel restarts (same seed value as the training
# arguments and the LoRA `random_state`).
balance_rng = random.Random(11)

# Split the rows by label in a single pass over the training split.
label_1_data, label_0_data = [], []
for data in train_data:
    if data['label'] == 1:
        label_1_data.append(data)
    elif data['label'] == 0:
        label_0_data.append(data)

num_label_1 = len(label_1_data)
# Sampling more rows than a class contains would raise ValueError, so cap
# the per-class count at the smaller class instead of assuming label 1 is
# always the minority.
samples_per_class = min(num_label_1, len(label_0_data))
balanced_label_0_data = balance_rng.sample(label_0_data, samples_per_class)
if samples_per_class == num_label_1:
    balanced_label_1_data = label_1_data
else:
    balanced_label_1_data = balance_rng.sample(label_1_data, samples_per_class)
balanced_data = balanced_label_1_data + balanced_label_0_data

# Randomize the row order so the two classes are interleaved.
balance_rng.shuffle(balanced_data)

dataset = Dataset.from_list(balanced_data)

In [28]:
# Report the sizes before and after balancing, one line per figure.
print(
    f"Original dataset size: {len(train_data)}",
    f"Balanced dataset size: {len(balanced_data)}",
    f"Number of label 1 samples: {len(label_1_data)}",
    f"Number of label 0 samples: {len(balanced_label_0_data)}",
    sep="\n",
)

Original dataset size: 167153
Balanced dataset size: 1920
Number of label 1 samples: 960
Number of label 0 samples: 960


In [29]:
def rename_columns(example):
    """Rename "text" to "loan_data" and "label" to "labels" in one row.

    The row dict is modified in place and the same object is returned.
    """
    for old_key, new_key in (("text", "loan_data"), ("label", "labels")):
        example[new_key] = example.pop(old_key)
    return example

# Rename the columns on every row, then show the schema, the row count and
# the first row.
dataset = dataset.map(rename_columns)
print(dataset, f"dataset size: {len(dataset)}", dataset[0], sep="\n")

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

Dataset({
    features: ['loan_data', 'labels'],
    num_rows: 1920
})
dataset size: 1920
{'loan_data': 'Loan amount: 10000, DTI: 18.15, Employment Title: Sales Representative, Employment Length: 2 years, Home Ownership: RENT, Annual Income: 40000.0, Verification Status: Not Verified, Grade: E-E1, Purpose: debt_consolidation, Description: nan, Title: Debt consolidation, Open Accounts: 10', 'labels': 1}


In [9]:
# Model-loading settings.
# Set to True to use 4-bit quantization and reduce memory usage (e.g. on a T4 GPU).
load_in_4bit = False
# None means auto detection; Float16 for Tesla T4 / V100, Bfloat16 for Ampere+.
dtype = None
# Sequence length passed to both the model loader and the trainer.
max_seq_length = 2048

In [10]:
# Load the base model and its tokenizer through Unsloth.
# (alternative checkpoint name kept for reference: "unsloth/mistral-7b-v0.2-bnb-4bit")
model_load_options = {
    "model_name": "model/Mistral-7B-Instruct-v0.3",
    "max_seq_length": max_seq_length,
    "dtype": dtype,
    "load_in_4bit": load_in_4bit,
}
model, tokenizer = FastLanguageModel.from_pretrained(**model_load_options)

==((====))==  Unsloth 2024.8: Fast Mistral patching. Transformers = 4.44.0.
   \\   /|    GPU: NVIDIA A800-SXM4-80GB. Max memory: 79.325 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.1+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.27. FA2 = True]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

model/Mistral-7B-Instruct-v0.3 does not have a padding token! Will use pad_token = [control_768].


In [11]:
# Attach LoRA adapters to the loaded model.
# Attention and feed-forward projection layers that receive an adapter.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,                # LoRA rank
    lora_alpha=16,       # larger values move further from the pre-trained weights; watch for overfitting
    lora_dropout=0.05,   # Unsloth's log reports fast patching is only supported with dropout = 0
    bias="none",         # "none" is the performance-optimized setting
    use_gradient_checkpointing=True,
    random_state=11,
    use_rslora=False,    # rank-stabilized LoRA is off
    loftq_config=None,   # LoftQ is not used
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2024.8 patched 32 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [12]:
# Prompt template for the loan-default classification task.
# The `{loan_data}` and `{loan_status}` placeholders are filled with
# `prompt.format(...)` when the training text is built. The template text is
# a runtime string, so its wording and line breaks are part of what the model
# is trained on.
prompt = """You are a highly intelligent and detailed artificial intelligence assistant with a deep understanding of financial data, specifically in predicting loan defaults.
Your task is to accurately classify loan data into one of two possible outcomes:
- 0: The loan is fully paid off (no default).
- 1: The loan has defaulted (borrower failed to meet the repayment terms).

The input data will provide various details about the loan and the borrower's financial situation. Your goal is to carefully analyze this information and determine the loan's status based on the provided features.

You are expected to generate a response that is one of the following labels:
- 0: The loan is fully paid off.
- 1: The loan has defaulted.

Your classification must be precise and match the best possible outcome for the given loan data.

Here is the loan data you need to classify:
### Loan Data:
{loan_data}
### Loan Status:
{loan_status}"""


In [13]:
# End-of-sequence marker taken from the previously loaded tokenizer.
EOS_TOKEN = tokenizer.eos_token


def format_prompts(examples):
    """Render the prompt template for a batch of rows.

    Reads the "loan_data" and "labels" columns of `examples`, fills the
    `prompt` template pairwise, and appends the EOS token to each rendered
    prompt so generation has a stopping point.

    Returns a dict with a single "text" key holding the list of rendered
    prompts.
    """
    rendered = [
        prompt.format(loan_data=loan_text, loan_status=status) + EOS_TOKEN
        for loan_text, status in zip(examples["loan_data"], examples["labels"])
    ]
    return {"text": rendered}

In [14]:
# Render the prompt for every row; the batched map adds a "text" column.
dataset = dataset.map(format_prompts, batched=True)

# Show one rendered prompt as a sanity check.
sample_prompt = dataset[0]['text']
print(sample_prompt)

Map:   0%|          | 0/1920 [00:00<?, ? examples/s]

You are a highly intelligent and detailed artificial intelligence assistant with a deep understanding of financial data, specifically in predicting loan defaults.
Your task is to accurately classify loan data into one of two possible outcomes:
- 0: The loan is fully paid off (no default).
- 1: The loan has defaulted (borrower failed to meet the repayment terms).

The input data will provide various details about the loan and the borrower's financial situation. Your goal is to carefully analyze this information and determine the loan's status based on the provided features.

You are expected to generate a response that is one of the following labels:
- 0: The loan is fully paid off.
- 1: The loan has defaulted.

Your classification must be precise and match the best possible outcome for the given loan data.

Here is the loan data you need to classify:
### Loan Data:
Loan amount: 8400, DTI: 33.26, Employment Title: Police officer, Employment Length: 5 years, Home Ownership: RENT, Annual 

In [15]:
# Training configuration.
# Query bf16 support once: use bf16 where the GPU supports it, fp16 otherwise.
bf16_available = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    # Where checkpoints go and how the run is named.
    output_dir="outputs/mistral-7b-instruct-v0.3-0910",
    run_name="mistral-7b-instruct-v0.3",
    report_to="wandb",
    # Batching: 2 per device x 4 accumulation steps.
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule (a fixed `max_steps = 110` was the alternative to one epoch).
    num_train_epochs=1,
    warmup_steps=5,
    lr_scheduler_type="linear",
    # Optimizer (2e-5 was the alternative learning rate).
    learning_rate=2e-4,
    optim="adamw_8bit",
    weight_decay=0.01,
    # Mixed precision.
    bf16=bf16_available,
    fp16=not bf16_available,
    seed=11,
    # Log every step; checkpoint every 10 steps with save_total_limit = 2.
    logging_strategy='steps',
    logging_steps=1,
    save_strategy="steps",
    save_steps=10,
    save_total_limit=2,
)

In [19]:
# Build the supervised fine-tuning trainer over the rendered "text" column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,         # packing short sequences is faster; left off for now
    dataset_num_proc=2,    # used when packing = False, otherwise defaults to None
)

# Compare the first row as stored in `dataset` with the first row held by the trainer.
first_dataset_row = dataset[0]
first_trainer_row = trainer.train_dataset[0]
print("dataset[0]:", first_dataset_row)
print("trainer.train_dataset[0]:", first_trainer_row)

Map (num_proc=2):   0%|          | 0/1920 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


dataset[0]: {'loan_data': 'Loan amount: 8400, DTI: 33.26, Employment Title: Police officer, Employment Length: 5 years, Home Ownership: RENT, Annual Income: 49000.0, Verification Status: Source Verified, Grade: C-C1, Purpose: debt_consolidation, Description: nan, Title: Debt consolidation, Open Accounts: 12', 'labels': 1, 'text': "You are a highly intelligent and detailed artificial intelligence assistant with a deep understanding of financial data, specifically in predicting loan defaults.\nYour task is to accurately classify loan data into one of two possible outcomes:\n- 0: The loan is fully paid off (no default).\n- 1: The loan has defaulted (borrower failed to meet the repayment terms).\n\nThe input data will provide various details about the loan and the borrower's financial situation. Your goal is to carefully analyze this information and determine the loan's status based on the provided features.\n\nYou are expected to generate a response that is one of the following labels:\n-

In [20]:

# Apply feature importance to the numeric fields of a loan description
def apply_feature_importance(loan_data, feature_importance):
    """Scale the numeric fields of a loan description by per-feature weights.

    Args:
        loan_data: Loan description of the form
            "Name: value, Name: value, ...".
        feature_importance: Mapping from field name to a numeric weight.

    Returns:
        The description rebuilt with ", " between fields. A field whose
        value parses as a float is written as "Name: <float>", multiplied by
        its weight when the name is in `feature_importance`. Every other
        chunk is copied through unchanged.
    """
    weighted_features = []

    for feature in loan_data.split(', '):
        # Split on the first ": " only, so values that themselves contain
        # ": " stay in one piece instead of breaking the unpacking.
        feature_name, separator, feature_value = feature.partition(': ')
        if not separator:
            # Not a "Name: value" chunk (e.g. the tail of free text that
            # contained ", "); keep it verbatim.
            weighted_features.append(feature)
            continue

        try:
            numeric_value = float(feature_value)
        except ValueError:
            # Textual fields such as job titles or "2 years" cannot be
            # scaled; keep them verbatim rather than aborting the whole row.
            weighted_features.append(feature)
            continue

        if feature_name in feature_importance:
            weighted_value = numeric_value * feature_importance[feature_name]
        else:
            weighted_value = numeric_value  # No weight for this feature
        weighted_features.append(f"{feature_name}: {weighted_value}")

    return ', '.join(weighted_features)

# Apply the feature importance to the entire dataset.
# Indexing a `datasets.Dataset` returns a fresh dict for the row, so assigning
# into `dataset[i]['loan_data']` is silently discarded; `map` is what actually
# rewrites the column.
# NOTE(review): the "text" column and the trainer were built in earlier cells
# from the unweighted `loan_data`, so this step does not change what the
# trainer below trains on. Move it before prompt formatting if the weighted
# values are meant to reach the model.
dataset = dataset.map(
    lambda row: {
        "loan_data": apply_feature_importance(row["loan_data"], feature_importance)
    }
)

# Train. The result is stored under its own name so that `trainer` stays bound
# to the trainer object and this cell can be run again.
train_result = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,920 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 240
 "-____-"     Number of trainable parameters = 41,943,040
[34m[1mwandb[0m: Currently logged in as: [33mjunjie_chiang[0m ([33mjjchiang[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
1,1.9397
2,1.9522
3,1.8637
4,1.4981
5,1.1731
6,1.0801
7,0.5716
8,0.4632
9,0.3256
10,0.2413


KeyboardInterrupt: 