# Financial Reconciliation Data Processing & Fine-Tuning
Complete pipeline: Data loading → Matching → Training data creation → Model fine-tuning → Inference

In [2]:
# ==================== STEP 1: Import Libraries ====================
import pandas as pd
import torch
import json
import os
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline
)
from peft import LoraConfig, get_peft_model, AutoPeftModelForCausalLM

# Report whether CUDA is usable and, if it is, the name of device 0.
cuda_available = torch.cuda.is_available()
print(f"GPU Available: {cuda_available}")
if cuda_available:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"GPU: {gpu_name}")

  from .autonotebook import tqdm as notebook_tqdm


GPU Available: False


In [3]:
# ==================== STEP 2: Load and Preprocess Data ====================
print("Loading data...")
ledger = pd.read_csv(r"C:\Users\HarithaNagamalla\Downloads\SRC_LDGR_Bloomberg_251125 1.csv")
statement = pd.read_csv(r"C:\Users\HarithaNagamalla\Downloads\SRC_STMT_Calypso_251125 1.csv")

# Report the size and schema of both inputs (shapes first, then column lists).
labelled_frames = (("Ledger", ledger), ("Statement", statement))
for label, frame in labelled_frames:
    print(f"{label} shape: {frame.shape}")
for lead, (label, frame) in zip(("\n", ""), labelled_frames):
    print(f"{lead}{label} columns: {frame.columns.tolist()}")

# Parse Trade_Date on both sides (mixed formats, day-first) and keep only the
# calendar date.
for frame in (ledger, statement):
    parsed_dates = pd.to_datetime(frame["Trade_Date"], format="mixed", dayfirst=True)
    frame["Trade_Date"] = parsed_dates.dt.date

# Normalize signage
def normalize_signage(x):
    """Normalise a raw debit/credit indicator to a trimmed, upper-case code.

    Args:
        x: Raw ``Signage`` cell value. Missing values (``None``/``NaN``) are
            accepted; non-string values are converted with ``str`` first so a
            stray numeric cell cannot raise ``AttributeError``.

    Returns:
        str: The upper-cased, whitespace-stripped code, or ``"C"`` when the
        value is missing.
    """
    if pd.isna(x):
        # Missing signage falls back to "C", as before.
        return "C"
    return str(x).strip().upper()

# Apply the same signage normalisation to both sources.
for frame in (ledger, statement):
    frame["Signage"] = frame["Signage"].apply(normalize_signage)

# Create signed amounts
def signed_amount(row):
    """Return ``Amount1`` with a sign derived from the row's ``Signage`` code.

    ``"DR"``/``"D"`` give the negated amount, ``"CR"``/``"C"`` give the amount
    unchanged, and any other code gives ``0``.
    """
    if row["Signage"] in ("DR", "D"):
        return -row["Amount1"]
    if row["Signage"] in ("CR", "C"):
        return row["Amount1"]
    # Unrecognised signage code: contributes nothing.
    return 0

# Derive the signed amount and its magnitude on both sources.
for frame in (ledger, statement):
    frame["signed_amount"] = frame.apply(signed_amount, axis=1)
    frame["abs_amount"] = frame["signed_amount"].abs()

# Fill missing values in reference columns (only where the column exists)
for frame in (ledger, statement):
    for ref_col in ("Ref1", "Ref2"):
        if ref_col in frame.columns:
            frame[ref_col] = frame[ref_col].fillna("UNKNOWN").astype(str)

print("\nData preprocessing complete!")

Loading data...
Ledger shape: (1003, 12)
Statement shape: (1000, 12)

Ledger columns: ['Source', 'ISIN_CUSIP', 'Trade_Date', 'Currency', 'Tran_code', 'Quantity', 'Amount1', 'Amount2', 'Signage', 'Ref1', 'Ref2', 'Trade_status']
Statement columns: ['Source', 'ISIN_CUSIP', 'Trade_Date', 'Currency', 'Tran_code', 'Quantity', 'Amount1', 'Amount2', 'Signage', 'Ref1', 'Ref2', 'Trade_status']

Data preprocessing complete!


In [4]:
# ==================== STEP 3: Match Transactions ====================
print("Matching transactions...")

# Columns that always take part in the join
base_keys = ["Trade_Date", "Currency", "Tran_code", "Quantity", "abs_amount"]

match_keys = list(base_keys)

# Reference columns join only when both sources carry them
for optional_key in ("Ref1", "Ref2"):
    if optional_key in ledger.columns and optional_key in statement.columns:
        match_keys.append(optional_key)
        print(f"✓ Including {optional_key} in match keys")

print(f"\nUsing match keys: {match_keys}")

# Inner join: keep only ledger/statement rows that agree on every match key.
merged = pd.merge(
    ledger,
    statement,
    how="inner",
    on=match_keys,
    suffixes=("_ledger", "_statement"),
)

matched_count = len(merged)
print(f"\nMatched rows: {matched_count}")
print(f"Match rate: {100 * matched_count / len(ledger):.1f}%")
print(f"Merged dataframe shape: {merged.shape}")
print(f"Columns in merged: {list(merged.columns)[:15]}...")

Matching transactions...
✓ Including Ref1 in match keys
✓ Including Ref2 in match keys

Using match keys: ['Trade_Date', 'Currency', 'Tran_code', 'Quantity', 'abs_amount', 'Ref1', 'Ref2']

Matched rows: 903
Match rate: 90.0%
Merged dataframe shape: (903, 21)
Columns in merged: ['Source_ledger', 'ISIN_CUSIP_ledger', 'Trade_Date', 'Currency', 'Tran_code', 'Quantity', 'Amount1_ledger', 'Amount2_ledger', 'Signage_ledger', 'Ref1', 'Ref2', 'Trade_status_ledger', 'signed_amount_ledger', 'abs_amount', 'Source_statement']...


In [5]:
# ==================== STEP 4: Calculate Confidence & Create Training Data ====================
def calculate_confidence(ledger_row, statement_row):
    """Calculate confidence score (0.0-1.0) based on match quality.

    Starts from 1.0 and subtracts a fixed penalty for every field that
    differs between the two records:

    ISIN_CUSIP 0.15, Trade_Date 0.10 (compared as strings), Currency 0.20,
    Quantity 0.15, signed_amount 0.20 (difference above 0.01), Ref1 0.05,
    Ref2 0.05.

    An amount that is NaN or cannot be converted to ``float`` is treated as
    an amount mismatch; the remaining checks still run.

    Args:
        ledger_row: Record exposing ``.get(key, default)`` (dict or Series).
        statement_row: Record exposing ``.get(key, default)``.

    Returns:
        float: The score, clamped to the range [0.0, 1.0].
    """
    confidence = 1.0

    if ledger_row.get("ISIN_CUSIP") != statement_row.get("ISIN_CUSIP"):
        confidence -= 0.15
    if str(ledger_row.get("Trade_Date")) != str(statement_row.get("Trade_Date")):
        confidence -= 0.10
    if ledger_row.get("Currency") != statement_row.get("Currency"):
        confidence -= 0.20
    if ledger_row.get("Quantity") != statement_row.get("Quantity"):
        confidence -= 0.15

    try:
        amount_gap = abs(
            float(ledger_row.get("signed_amount", 0))
            - float(statement_row.get("signed_amount", 0))
        )
    except (TypeError, ValueError):
        # Unparseable amount: cannot be confirmed equal, so count it as a gap.
        amount_gap = float("nan")
    # Written as "not <=" so that a NaN gap is penalised too (NaN > 0.01 is False).
    if not amount_gap <= 0.01:
        confidence -= 0.20

    if ledger_row.get("Ref1", "N/A") != statement_row.get("Ref1", "N/A"):
        confidence -= 0.05
    if ledger_row.get("Ref2", "N/A") != statement_row.get("Ref2", "N/A"):
        confidence -= 0.05

    return max(0.0, min(1.0, confidence))

def create_finetune_sample(ledger_row, statement_row):
    """Build one ``[INST] ... [/INST]`` training example for a record pair.

    The prompt lists Source, Date, Currency, Amount, Quantity and Reference
    (``Ref1``) for each side. The answer text is fixed apart from the
    confidence score: it always states that the pair reconciles.

    Args:
        ledger_row: Record exposing ``.get(key, default)``.
        statement_row: Record exposing ``.get(key, default)``.

    Returns:
        dict: ``{"text": <prompt and answer as a single string>}``.
    """
    confidence = calculate_confidence(ledger_row, statement_row)

    def _render_section(heading, record):
        # One transaction block: the heading line followed by "Label: value" lines.
        labelled_values = [
            ("Source", record.get('Source', 'Unknown')),
            ("Date", record.get('Trade_Date', 'Unknown')),
            ("Currency", record.get('Currency', 'Unknown')),
            ("Amount", record.get('signed_amount', 0)),
            ("Quantity", record.get('Quantity', 0)),
            ("Reference", record.get('Ref1', 'N/A')),
        ]
        detail_lines = [f"{label}: {value}" for label, value in labelled_values]
        return "\n".join([heading] + detail_lines)

    # Sections are separated by a single blank line.
    user_prompt = "\n\n".join([
        "Analyze these transactions and determine if they reconcile:",
        _render_section("Ledger Transaction:", ledger_row),
        _render_section("Statement Transaction:", statement_row),
        "Question: Do these transactions match?",
    ])

    assistant_response = "\n".join([
        f"Yes, these transactions reconcile with confidence score {confidence:.2f}.",
        "Match Details:",
        "- Date: Match",
        "- Currency: Match",
        "- Amount: Match (accounting for signage differences)",
        "- Quantity: Match",
        "- References: Match",
        "Recommendation: RECONCILED",
    ])

    return {"text": "[INST] " + user_prompt + " [/INST] " + assistant_response}

def _side_record(row, suffix):
    """Extract one side ("ledger" or "statement") of a merged row as a dict.

    Join-key columns (Trade_Date, Currency, Quantity, Ref1) are shared by both
    sides; Source, signed_amount and ISIN_CUSIP come from the suffixed columns.
    """
    return {
        "Source": str(row.get(f"Source_{suffix}", "Unknown")),
        "Trade_Date": str(row.get("Trade_Date", "Unknown")),
        "Currency": str(row.get("Currency", "Unknown")),
        "signed_amount": float(row.get(f"signed_amount_{suffix}", 0)),
        "Quantity": int(row.get("Quantity", 0)),
        "Ref1": str(row.get("Ref1", "N/A")),
        "ISIN_CUSIP": str(row.get(f"ISIN_CUSIP_{suffix}", "Unknown")),
    }


print("Creating training data...")
training_data = []
total_pairs = len(merged)
for done, (_, row) in enumerate(merged.iterrows(), start=1):
    ledger_side = pd.Series(_side_record(row, "ledger"))
    statement_side = pd.Series(_side_record(row, "statement"))
    training_data.append(create_finetune_sample(ledger_side, statement_side))

    # Progress line every 100 pairs
    if done % 100 == 0:
        print(f"  Processed {done}/{total_pairs} samples...")

print(f"\n✓ Created {len(training_data)} training samples")

Creating training data...
  Processed 100/903 samples...
  Processed 200/903 samples...
  Processed 300/903 samples...
  Processed 400/903 samples...
  Processed 500/903 samples...
  Processed 600/903 samples...
  Processed 700/903 samples...
  Processed 800/903 samples...
  Processed 900/903 samples...

✓ Created 903 training samples


In [6]:
# ==================== STEP 5: Save Training Data ====================
print("Saving training data...")

output_path = r"C:\Users\HarithaNagamalla\Downloads\reconciliation_training_data.jsonl"
target_dir = os.path.dirname(output_path)
os.makedirs(target_dir, exist_ok=True)

# JSON Lines: one serialized sample per line.
with open(output_path, 'w') as handle:
    handle.writelines(json.dumps(record) + "\n" for record in training_data)

print(f"✓ Dataset saved to {output_path}")
preview = training_data[0]['text'][:200]
print(f"\nSample training data (first 200 chars):\n{preview}...")

Saving training data...
✓ Dataset saved to C:\Users\HarithaNagamalla\Downloads\reconciliation_training_data.jsonl

Sample training data (first 200 chars):
[INST] Analyze these transactions and determine if they reconcile:

Ledger Transaction:
Source: Bloomberg
Date: 2022-05-07
Currency: CAD
Amount: -25633.43
Quantity: 111
Reference: 7GVUPC

Statement Tr...


In [7]:
# ==================== STEP 6: Create Dataset ====================
print("Creating Hugging Face dataset...")

# Single-column dataset holding the formatted prompt/answer strings.
sample_texts = [sample['text'] for sample in training_data]
train_dataset = Dataset.from_dict({'text': sample_texts})

print(f"✓ Dataset size: {len(train_dataset)}")
first_text = train_dataset[0]['text']
print(f"✓ Sample text length: {len(first_text)} characters")

Creating Hugging Face dataset...
✓ Dataset size: 903
✓ Sample text length: 579 characters


In [8]:
# ==================== STEP 7: Load Model & Tokenizer ====================
print("Loading model and tokenizer...")

MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit NF4 quantization config; fp32 CPU offload is enabled so that layers
# which do not fit on the GPU can be placed on the CPU instead.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True  # Enable CPU offloading to prevent OOM
)

# Load tokenizer; EOS is reused as the padding token and padding goes on the right.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
print(f"✓ Tokenizer loaded")

# Load model with a tiered fallback strategy:
#   1. quantized + Flash Attention 2
#   2. quantized + default attention
#   3. unquantized, CPU only
try:
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2"
    )
    print("✓ Using Flash Attention 2")
except Exception as e:
    print(f"Flash attention not available, trying default attention: {e}")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True
        )
        print("✓ Using default attention")
    except Exception as e2:
        print(f"GPU loading failed, using CPU-only mode: {e2}")
        # The quantized load is what just failed, so the last resort loads the
        # model without quantization_config and pins it to the CPU.
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            device_map="cpu",
            trust_remote_code=True
        )
        print("✓ Using CPU-only mode (slower)")

print(f"✓ Model loaded: {MODEL_NAME}")

# Training-time settings, applied whichever load path succeeded.
model.config.use_cache = False  # the generation KV cache is not used while training
model.gradient_checkpointing_enable()
# Make the embedding outputs require grad so gradients can flow through the
# checkpointed blocks even though the base weights themselves are frozen.
model.enable_input_require_grads()

Loading model and tokenizer...


Flash attention not available, trying default attention: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.


Loading checkpoint shards: 100%|██████████| 3/3 [10:18<00:00, 206.29s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.


✓ Using default attention


In [None]:
# ==================== STEP 8: Configure LoRA ====================
print("Configuring LoRA...")

# Rank-16 adapters on the attention query/value projections.
lora_settings = dict(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
lora_config = LoraConfig(**lora_settings)

model = get_peft_model(model, lora_config)

# Count all parameters and the subset that will receive gradients.
trainable_params = 0
total_params = 0
for param in model.parameters():
    param_count = param.numel()
    total_params += param_count
    if param.requires_grad:
        trainable_params += param_count

print(f"✓ Trainable params: {trainable_params:,} / {total_params:,}")
print(f"✓ Trainable percentage: {100 * trainable_params / total_params:.2f}%")

Configuring LoRA...


In [None]:
# ==================== STEP 9: Tokenize Dataset ====================
print("Tokenizing dataset...")

def tokenize_function(examples):
    """Tokenize a batch of samples, truncating and padding each to 512 tokens."""
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Batched map; the raw "text" column is dropped once it has been tokenized.
tokenized_dataset = train_dataset.map(
    tokenize_function,
    remove_columns=["text"],
    batched=True,
)

print(f"✓ Tokenized dataset size: {len(tokenized_dataset)}")

In [None]:
# ==================== STEP 10: Setup Training ====================
print("Configuring training...")

# fp16 mixed precision and the paged (bitsandbytes) AdamW optimizer both need a
# CUDA device, so they are only requested when one is present; on CPU the run
# uses full precision with the plain PyTorch AdamW.
use_cuda = torch.cuda.is_available()

training_args = TrainingArguments(
    output_dir=r"C:\Users\HarithaNagamalla\Downloads\reconciliation_model",
    num_train_epochs=2,  # Reduced from 3 to shorten the run
    per_device_train_batch_size=1,  # Reduced from 4 to minimize memory usage
    gradient_accumulation_steps=4,  # Effective batch size = 1 * 4 = 4
    learning_rate=2e-4,
    warmup_steps=50,  # Reduced from 100
    weight_decay=0.01,
    logging_steps=10,
    save_steps=30,  # Reduced from 50
    save_total_limit=1,  # Reduced from 2 to save disk space
    max_grad_norm=1.0,
    fp16=use_cuda,
    optim="paged_adamw_32bit" if use_cuda else "adamw_torch",
    seed=42,
    report_to=[]
)

if use_cuda:
    print("✓ Training config ready (optimized for GPU memory: batch_size=1, gradient_accumulation=4)")
else:
    print("✓ Training config ready (CPU mode: fp16 disabled, optim=adamw_torch, batch_size=1, gradient_accumulation=4)")

In [None]:
# ==================== STEP 11: Train Model ====================
print("Initializing trainer...")

# Collator configured for causal language modelling (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_options)

banner = "=" * 80
print("\n" + banner)
print("STARTING FINE-TUNING...")
print(banner + "\n")

trainer.train()

In [None]:
# ==================== STEP 12: Save Fine-tuned Model ====================
print("\nSaving fine-tuned model...")

model_save_path = r"C:\Users\HarithaNagamalla\Downloads\reconciliation_model_finetuned"
os.makedirs(model_save_path, exist_ok=True)

# Model first, then tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"✓ Model saved to {model_save_path}")

In [None]:
# ==================== STEP 13: Test Inference ====================
print("Loading model for inference...")

model_inference = AutoPeftModelForCausalLM.from_pretrained(
    model_save_path,
    device_map="auto",
    torch_dtype=torch.float16,
)

tokenizer_inference = AutoTokenizer.from_pretrained(model_save_path)

# Test prompt. It follows the training template exactly, including the
# "Reference:" line on both sides, so the model sees the format it was tuned on.
test_prompt = """[INST] Analyze these transactions and determine if they reconcile:

Ledger Transaction:
Source: Bloomberg
Date: 2022-05-07
Currency: CAD
Amount: -25633.43
Quantity: 111
Reference: 7GVUPC

Statement Transaction:
Source: Calypso
Date: 2022-05-07
Currency: CAD
Amount: 25633.43
Quantity: 111
Reference: 7GVUPC

Question: Do these transactions match? [/INST]"""

print("\nGenerating prediction...")
# Send the inputs to wherever the model was actually placed by device_map="auto",
# rather than guessing from torch.cuda.is_available().
inputs = tokenizer_inference(test_prompt, return_tensors="pt").to(model_inference.device)

outputs = model_inference.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    top_p=0.95,
    temperature=0.7
)

# The decoded text contains the prompt followed by the generated answer.
response = tokenizer_inference.decode(outputs[0], skip_special_tokens=True)
print("\n" + "="*80)
print("MODEL PREDICTION:")
print("="*80)
print(response)

In [None]:
# ==================== STEP 14: Summary ====================
banner = "=" * 80

print("\n" + banner)
print("RECONCILIATION FINE-TUNING PIPELINE COMPLETE!")
print(banner)

# Counts gathered from the earlier pipeline stages
print("\nResults:")
print(f"✓ Processed {len(ledger)} ledger transactions")
print(f"✓ Processed {len(statement)} statement transactions")
print(f"✓ Matched {len(merged)} transaction pairs")
print(f"✓ Generated {len(training_data)} training samples")
print(f"✓ Fine-tuned model saved to: {model_save_path}")

print("\nModel Information:")
print(f"✓ Base Model: {MODEL_NAME}")
print(f"✓ Trainable Parameters: {trainable_params:,}")
print("✓ Training Time: ~30-60 minutes on GPU")

print("\nNext Steps:")
print("1. Use the model for inference on new reconciliation tasks")
print("2. Evaluate accuracy on a test dataset")
print("3. Deploy as an API endpoint for production use")
print(banner)