In [1]:
import pandas as pd

# Read the two extracts to reconcile: the ledger file and the statement file.
ledger = pd.read_csv(r"C:\Users\HarithaNagamalla\Downloads\SRC_LDGR_Bloomberg_251125 1.csv")
statement = pd.read_csv(r"C:\Users\HarithaNagamalla\Downloads\SRC_STMT_Calypso_251125 1.csv")

# Both frames get exactly the same preparation, so do it once per frame.
for frame, origin in ((ledger, "ledger"), (statement, "statement")):
    # format="mixed" parses each value individually; dayfirst=True resolves
    # ambiguous day/month order in favour of day-first.
    parsed_dates = pd.to_datetime(frame["Trade_Date"], format="mixed", dayfirst=True)
    # Keep only the calendar date so the time component cannot break matching.
    frame["Trade_Date"] = parsed_dates.dt.date
    # Tag every row with the frame it came from.
    frame["source_type"] = origin

# Normalize signage
def normalize_signage(x):
    """
    Normalise a signage code by trimming whitespace and upper-casing it.

    Args:
        x: A raw signage cell. Normally a string such as " dr " or "CR".

    Returns:
        The stripped, upper-cased string. Non-string values (for example the
        NaN that pandas uses for an empty CSV cell, or None) are returned
        unchanged instead of raising AttributeError, so a single blank cell
        does not abort the whole column normalisation.
    """
    if not isinstance(x, str):
        # Missing/blank cells are not strings; leave them as-is.
        return x
    return x.strip().upper()

# Clean the Signage column of both frames with the same normaliser.
for frame in (ledger, statement):
    frame["Signage"] = frame["Signage"].apply(normalize_signage)

# Create signed amount
def signed_amount(row):
    """
    Return Amount1 with a sign derived from the row's Signage code.

    "DR"/"D" yields the negated amount, "CR"/"C" yields the amount as-is,
    and any other code yields 0.
    """
    code = row["Signage"]
    if code in ("DR", "D"):
        return -row["Amount1"]
    if code in ("CR", "C"):
        return row["Amount1"]
    # Unrecognised signage contributes nothing.
    return 0

for frame in (ledger, statement):
    # Row-wise signed amount driven by the Signage code.
    frame["signed_amount"] = frame.apply(signed_amount, axis=1)
    # For matching, use absolute amounts (signage differences are normal between systems)
    frame["abs_amount"] = frame["signed_amount"].abs()


In [2]:
import os
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, HfArgumentParser, pipeline, logging

from peft import LoraConfig, PeftModel
from trl import SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import json

def calculate_confidence(ledger_row, statement_row):
    """
    Calculate a confidence score for a ledger/statement pair.

    The score starts at 1.0 and a fixed penalty is subtracted for each field
    that disagrees between the two rows:

        ISIN_CUSIP 0.15, Trade_Date 0.10, Currency 0.20, Quantity 0.15,
        amount 0.20, Ref1 0.05, Ref2 0.05

    Amounts are compared by magnitude. The pairs are matched on the absolute
    amount because the two systems may record opposite signage, so a DR/CR
    sign flip on its own must not be counted as an amount mismatch.

    Args:
        ledger_row: Mapping or Series with the keys listed above plus
            "signed_amount".
        statement_row: Same layout as ``ledger_row``.

    Returns:
        A float clamped to the range [0.0, 1.0].
    """
    confidence = 1.0

    # Check for exact matches on key fields
    if ledger_row["ISIN_CUSIP"] != statement_row["ISIN_CUSIP"]:
        confidence -= 0.15

    if ledger_row["Trade_Date"] != statement_row["Trade_Date"]:
        confidence -= 0.10

    if ledger_row["Currency"] != statement_row["Currency"]:
        confidence -= 0.20

    if ledger_row["Quantity"] != statement_row["Quantity"]:
        confidence -= 0.15

    # Compare magnitudes only; a 0.01 tolerance absorbs rounding noise.
    ledger_magnitude = abs(ledger_row["signed_amount"])
    statement_magnitude = abs(statement_row["signed_amount"])
    if abs(ledger_magnitude - statement_magnitude) > 0.01:
        confidence -= 0.20

    # Check references
    if ledger_row["Ref1"] != statement_row["Ref1"]:
        confidence -= 0.05

    if ledger_row["Ref2"] != statement_row["Ref2"]:
        confidence -= 0.05

    # Ensure confidence is between 0 and 1
    return max(0.0, min(1.0, confidence))

def create_finetune_sample(ledger_row, statement_row):
    """
    Build one chat-style fine-tuning record for a ledger/statement pair.

    The user turn shows both transactions as dicts and asks whether they
    reconcile; the assistant turn is a JSON string that always reports
    matched=True together with both sources, the rounded confidence from
    calculate_confidence, and a fixed reason text.

    Args:
        ledger_row: Ledger side of the pair (must provide to_dict() and "Source").
        statement_row: Statement side of the pair (same requirements).

    Returns:
        A dict with a "messages" list holding system, user and assistant turns.
    """
    score = calculate_confidence(ledger_row, statement_row)
    ledger_fields = ledger_row.to_dict()
    statement_fields = statement_row.to_dict()

    user_prompt = f"""
Ledger Transaction:
{ledger_fields}

Statement Transaction:
{statement_fields}

Task:
Check if these transactions reconcile.
Return:
- matched (true/false)
- reason
- confidence (0-1)
"""

    reason_text = (
        "Amounts, currency, trade date, transaction code, quantity and references match across ledger and statement. "
        "Signage difference handled via accounting rules."
    )

    # Key order is preserved by json.dumps, so keep it stable.
    answer = {
        "matched": True,
        "ledger_source": ledger_row["Source"],
        "statement_source": statement_row["Source"],
        "confidence": round(score, 2),
        "reason": reason_text,
    }

    system_turn = {"role": "system", "content": "You are a financial reconciliation expert."}
    user_turn = {"role": "user", "content": user_prompt}
    assistant_turn = {"role": "assistant", "content": json.dumps(answer)}

    return {"messages": [system_turn, user_turn, assistant_turn]}


In [4]:
# Columns that must be equal on both sides for a ledger row and a statement
# row to be treated as the same transaction. The amount is compared via its
# absolute value (abs_amount).
match_keys = ["Trade_Date", "Currency", "Tran_code", "Quantity", "Ref1", "Ref2", "abs_amount"]

# Inner join: only pairs present in both frames survive. Columns that exist in
# both frames but are not join keys receive the _ledger / _statement suffixes.
merged = pd.merge(
    ledger,
    statement,
    how="inner",
    on=match_keys,
    suffixes=("_ledger", "_statement"),
)

print("Matched rows:", len(merged))


Matched rows: 903


In [10]:
# Step 1: Prepare data for fine-tuning
# Convert transaction data into instruction-response format

def format_data_for_finetuning(df):
    """
    Turn every row of ``df`` into an instruction/response training record.

    Args:
        df: DataFrame whose rows are individual transactions.

    Returns:
        A list with one dict per row, each holding "instruction", "output"
        and a combined "text" field in "### Instruction / ### Response" layout.
    """
    def _to_record(row):
        # One row -> one record; the row is rendered as a dict in the
        # instruction and as "key: value" pairs in the response.
        fields = row.to_dict()
        instruction = f"Analyze the following transaction: {fields}"
        details = ", ".join(f"{key}: {value}" for key, value in fields.items())
        response = f"This is a financial transaction with details: {details}"
        return {
            "instruction": instruction,
            "output": response,
            "text": f"### Instruction:\n{instruction}\n### Response:\n{response}",
        }

    return [_to_record(row) for _, row in df.iterrows()]

# Prepare the dataset
# NOTE(review): `df` is not assigned in any cell visible in this notebook, so
# this line depends on leftover kernel state and will fail on a fresh
# Restart & Run All — confirm which frame is meant here.
fine_tune_data = format_data_for_finetuning(df)
print(f"Prepared {len(fine_tune_data)} training examples")
# Show the first formatted record as a sanity check.
print("Sample:", fine_tune_data[0])

Prepared 2003 training examples
Sample: {'instruction': "Analyze the following transaction: {'Source': 'Bloomberg', 'ISIN_CUSIP': 'R2YEXP0DR', 'Trade_Date': '07-05-2022 00:00:00', 'Currency': 'CAD', 'Tran_code': 'DIV', 'Quantity': 111, 'Amount1': 25633.43, 'Amount2': 35268.18, 'Signage': 'DR', 'Ref1': '7GVUPC', 'Ref2': 'A2WAHE', 'Trade_status': 'Confirmed'}", 'output': 'This is a financial transaction with details: Source: Bloomberg, ISIN_CUSIP: R2YEXP0DR, Trade_Date: 07-05-2022 00:00:00, Currency: CAD, Tran_code: DIV, Quantity: 111, Amount1: 25633.43, Amount2: 35268.18, Signage: DR, Ref1: 7GVUPC, Ref2: A2WAHE, Trade_status: Confirmed', 'text': "### Instruction:\nAnalyze the following transaction: {'Source': 'Bloomberg', 'ISIN_CUSIP': 'R2YEXP0DR', 'Trade_Date': '07-05-2022 00:00:00', 'Currency': 'CAD', 'Tran_code': 'DIV', 'Quantity': 111, 'Amount1': 25633.43, 'Amount2': 35268.18, 'Signage': 'DR', 'Ref1': '7GVUPC', 'Ref2': 'A2WAHE', 'Trade_status': 'Confirmed'}\n### Response:\nThis is a

In [5]:
training_data = []

for _, pair in merged.iterrows():
    # Rebuild one Series per side from the merged row. Join-key columns are
    # shared by both sides; the remaining columns carry a _ledger/_statement
    # suffix. Key order is kept fixed because it is rendered into the prompt.
    sides = {}
    for side in ("ledger", "statement"):
        sides[side] = pd.Series({
            "Source": pair[f"Source_{side}"],
            "ISIN_CUSIP": pair[f"ISIN_CUSIP_{side}"],
            "Trade_Date": str(pair["Trade_Date"]),
            "Currency": pair["Currency"],
            "Tran_code": pair["Tran_code"],
            "Quantity": pair["Quantity"],
            "signed_amount": pair[f"signed_amount_{side}"],
            "Ref1": pair["Ref1"],
            "Ref2": pair["Ref2"],
            "Trade_status": pair[f"Trade_status_{side}"],
        })

    sample = create_finetune_sample(sides["ledger"], sides["statement"])
    training_data.append(sample)


In [6]:
# Step 2: Save formatted data to JSON for training
import json

output_path = r"C:\Users\HarithaNagamalla\Downloads\fine_tune_dataset.json"

# Each training record is serialised as one JSON document per line.
with open(output_path, 'w') as f:
    f.writelines(json.dumps(row) + "\n" for row in training_data)

print(f"Dataset saved to {output_path}")

Dataset saved to C:\Users\HarithaNagamalla\Downloads\fine_tune_dataset.json


In [7]:
# Step 3: Load dataset and tokenizer with proper configuration
from datasets import load_dataset, Dataset
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Create dataset from dict
# NOTE(review): fine_tune_data is only assigned in the cell that calls
# format_data_for_finetuning(df); on a kernel where that cell has not run,
# this statement raises NameError. The reconciliation samples collected in
# `training_data` are not used here — confirm which dataset is meant to be
# trained on.
train_dataset = Dataset.from_dict({
    'text': [item['text'] for item in fine_tune_data]
})

print(f"Train dataset size: {len(train_dataset)}")

# Base checkpoint to fine-tune.
model_name = "mistralai/Mistral-7B-Instruct-v0.2"

# 4-bit loading options: NF4 quantization type with double quantization
# enabled, and float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load tokenizer with legacy=False to avoid PyPreTokenizerTypeWrapper error.
# The first attempt must actually request legacy=False; previously both
# attempts passed legacy=True, so the fallback could never behave differently.
try:
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        legacy=False
    )
except Exception as e:
    # Best-effort fallback: report why the preferred mode failed, then retry
    # with the legacy behaviour.
    print(f"Legacy=False failed, trying with legacy=True: {e}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True,
        legacy=True
    )

# Reuse the EOS token for padding and pad on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print("Tokenizer loaded successfully")
print(f"Tokenizer type: {type(tokenizer)}")

NameError: name 'fine_tune_data' is not defined

In [14]:
# Step 4: Load model with quantization
# Arguments shared by both load attempts.
load_kwargs = dict(
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)

try:
    # First choice: FlashAttention-2 (optional, for better performance).
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        attn_implementation="flash_attention_2",
        **load_kwargs
    )
except Exception as e:
    # Fall back to the default attention implementation and say why.
    print(f"Flash attention failed, loading with default attention: {e}")
    model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

# Memory savers for training: gradient checkpointing on, KV cache off.
model.gradient_checkpointing_enable()
model.config.use_cache = False

print("Model loaded with 4-bit quantization")
print(f"Model dtype: {model.dtype}")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 3 files: 100%|██████████| 3/3 [40:40<00:00, 813.50s/it]   


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 3 files: 100%|██████████| 3/3 [40:40<00:00, 813.50s/it]   


Flash attention failed, loading with default attention: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 3 files: 100%|██████████| 3/3 [40:40<00:00, 813.50s/it]   


Flash attention failed, loading with default attention: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.


Loading checkpoint shards: 100%|██████████| 3/3 [41:18<00:00, 826.17s/it] 


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Fetching 3 files: 100%|██████████| 3/3 [40:40<00:00, 813.50s/it]   


Flash attention failed, loading with default attention: FlashAttention2 has been toggled on, but it cannot be used due to the following error: the package flash_attn seems to be not installed. Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2.


Loading checkpoint shards: 100%|██████████| 3/3 [41:18<00:00, 826.17s/it] 


Model loaded with 4-bit quantization
Model dtype: torch.float16


In [8]:
# Step 5: Configure LoRA for parameter-efficient fine-tuning
from peft import LoraConfig, get_peft_model

# Local import so this cell brings its own dependency into scope.
from peft import prepare_model_for_kbit_training

# LoRA configuration for efficient fine-tuning
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,  # LoRA alpha for scaling
    target_modules=["q_proj", "v_proj"],  # Target attention modules
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# The base model is loaded in 4-bit with gradient checkpointing enabled.
# PEFT's quantization guide calls for prepare_model_for_kbit_training before
# adding adapters to such a model, so that gradients can flow to the LoRA
# weights during training.
model = prepare_model_for_kbit_training(model)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

# Show trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable_params:,} / {total_params:,}")
print(f"Trainable percentage: {100 * trainable_params / total_params:.2f}%")

NameError: name 'model' is not defined

In [16]:
# Step 6: Tokenize dataset
def tokenize_function(examples):
    """
    Tokenize the "text" field of a batch with the notebook-level tokenizer.

    Every example is truncated and padded to exactly 512 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "max_length": 512,
        "padding": "max_length",
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize dataset in batches; the raw "text" column is removed from the result.
tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
print(f"Tokenized dataset: {tokenized_dataset}")
# Label corrected: the value printed is the first 20 token ids of the first
# example, not a shape.
print(f"Sample token ids (first 20): {tokenized_dataset[0]['input_ids'][:20]}")

Map: 100%|██████████| 2003/2003 [00:02<00:00, 735.69 examples/s]

Map: 100%|██████████| 2003/2003 [00:02<00:00, 735.69 examples/s]

Tokenized dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2003
})
Sample token ids shape: [1, 774, 3133, 3112, 28747, 13, 27554, 1374, 272, 2296, 8966, 28747, 12012, 4220, 1869, 464, 28107, 300, 4146, 647]


Map: 100%|██████████| 2003/2003 [00:02<00:00, 735.69 examples/s]

Tokenized dataset: Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 2003
})
Sample token ids shape: [1, 774, 3133, 3112, 28747, 13, 27554, 1374, 272, 2296, 8966, 28747, 12012, 4220, 1869, 464, 28107, 300, 4146, 647]





In [9]:
# Step 7: Configure training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints are written, and how many are kept.
    output_dir=r"C:\Users\HarithaNagamalla\Downloads\fine_tuned_model",
    save_steps=50,
    save_total_limit=2,
    # Schedule and batch sizing.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    warmup_steps=100,
    # Optimiser settings.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Mixed precision training.
    fp16=True,
    # Logging cadence and reproducibility.
    logging_steps=10,
    seed=42,
)

print("Training arguments configured")
print(f"Output directory: {training_args.output_dir}")

Training arguments configured
Output directory: C:\Users\HarithaNagamalla\Downloads\fine_tuned_model


In [None]:
# Step 8: Initialize trainer and fine-tune
from transformers import Trainer, DataCollatorForLanguageModeling

# Collator configured for causal language modeling (mlm=False, i.e. not masked).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Wire model, arguments, data and collator together.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

print("Trainer initialized. Starting fine-tuning...")
# Runs the fine-tuning loop (this will take some time).
trainer.train()

Trainer initialized. Starting fine-tuning...


Trainer initialized. Starting fine-tuning...




In [None]:
# Step 9: Save fine-tuned model
# After training completes, save the model and adapter
def save_fine_tuned_model(model, tokenizer, save_path):
    """
    Write the model and then the tokenizer to ``save_path``.

    Both objects are saved through their ``save_pretrained`` method, and a
    confirmation line is printed afterwards.
    """
    for artifact in (model, tokenizer):
        artifact.save_pretrained(save_path)
    print(f"Model saved to {save_path}")

# Save the model and tokenizer to disk; run this cell only after training has completed.
save_fine_tuned_model(model, tokenizer, r"C:\Users\HarithaNagamalla\Downloads\transaction_finetuned_model")

In [None]:
# Step 10: Test the fine-tuned model
from transformers import pipeline

def test_fine_tuned_model(model, tokenizer, prompt):
    """
    Run ``prompt`` through a text-generation pipeline built from the given
    model and tokenizer (up to 100 new tokens) and return the generated text
    of the first result.
    """
    generator = pipeline(
        "text-generation",
        tokenizer=tokenizer,
        model=model,
        max_new_tokens=100
    )

    first_generation = generator(prompt)[0]
    return first_generation['generated_text']

# Test prompt
# NOTE(review): this prompt is plain text and does not use the
# "### Instruction:/### Response:" layout that format_data_for_finetuning
# writes into the training "text" field — confirm whether it should.
test_prompt = "Analyze the following financial transaction: Amount: 1000, Date: 2025-01-25, Type: Transfer"

# Generate a completion for the test prompt and print it; run after training.
output = test_fine_tuned_model(model, tokenizer, test_prompt)
print("Generated output:")
print(output)