In [2]:
from datetime import datetime, date
import sys
from decimal import Decimal
from pprint import pprint

sys.path.append("../")
from models.transaction import TransactionModel
from classifiers.fraud_detect import detect_fraud
from langchain_community.llms import LlamaCpp
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from datetime import datetime
from geopy.distance import geodesic
from typing import List, Dict
import sys
from dateutil.relativedelta import relativedelta
import json
from dataclasses import dataclass
from typing import Any, Optional, Tuple
from collections import Counter
from llama_cpp import LlamaGrammar

sys.path.append("../")
from models.transaction import TransactionModel


In [3]:
llm = LlamaCpp(
    model_path="/home/hessel/code/lm-studio/bartowski/Phi-3.5-mini-instruct-GGUF/Phi-3.5-mini-instruct-Q4_K_S.gguf",
    temperature=0.8,
    max_tokens=2000,
    n_ctx=2048,
    n_batch=512,
    n_gpu_layers=-1,
    f16_kv=True,
    verbose=True,
    use_mlock=False,
    use_mmap=True
)

llama_model_loader: loaded meta data with 40 key-value pairs and 197 tensors from /home/hessel/code/lm-studio/bartowski/Phi-3.5-mini-instruct-GGUF/Phi-3.5-mini-instruct-Q4_K_S.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Phi 3.5 Mini Instruct
llama_model_loader: - kv   3:                           general.finetune str              = instruct
llama_model_loader: - kv   4:                           general.basename str              = Phi-3.5
llama_model_loader: - kv   5:                         general.size_label str              = mini
llama_model_loader: - kv   6:                            general.license str           

In [4]:
# Create a test transaction
test_transaction = TransactionModel(
    trans_date_trans_time= '2020-12-13 02:16:56',
    cc_num="4532015112830366",
    merchant="Tech Gadgets Online",
    category="home",
    amt=Decimal("1999.99"),
    first="John",
    last="Doe",
    gender="M",
    street="123 Main St",
    city="New York",
    state="NY",
    zip="10001",
    lat=40.7128,
    long=-118.2437,
    city_pop=8336817,
    job="Software Engineer",
    dob=date(1985, 5, 15),
    trans_num="TR12345678",
    unix_time=int(datetime.now().timestamp()),
    merch_lat=36.7128,
    merch_long=-100.2437,
    is_fraud=False,
)

In [5]:
recent_transactions = data = [
   {
       'timestamp': '2020-12-13 02:16:56',
       'amount': 832.81,
       'merchant': 'fraud_Reichert, Shanahan and Hayes',
       'category': 'home',
       'merch_lat': 40.7128,
       'merch_long': -119.2437
   },
   {
       'timestamp': '2020-12-12 23:51:56', 
       'amount': 214.51,
       'merchant': 'fraud_Gaylord-Powlowski',
       'category': 'home',
       'merch_lat': 41.7128,
       'merch_long': -118.2437
   },
   {
       'timestamp': '2020-12-12 23:39:16',
       'amount': 552.04, 
       'merchant': 'fraud_Dare-Marvin',
       'category': 'home',
       'merch_lat': 40.7128,
       'merch_long': -118.2437
   },
]

In [6]:
from langchain_community.llms import LlamaCpp
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from datetime import datetime
from geopy.distance import geodesic
from typing import List, Dict
import sys
from dateutil.relativedelta import relativedelta
import json
from dataclasses import dataclass
from typing import Any, Optional, Tuple
from collections import Counter
from llama_cpp import LlamaGrammar

sys.path.append("../")
from models.transaction import TransactionModel

# Update the grammar to be absolutely strict about the output
FRAUD_DETECTION_GRAMMAR_STRING = r"""
root   ::= json
json   ::= "{" ws risk_level ws "," ws key_factors ws "}"
risk_level ::= "\"risk_level\"" ws ":" ws ("\"LOW\"" | "\"MEDIUM\"" | "\"HIGH\"")
key_factors ::= "\"key_factors\"" ws ":" ws "[" factors "]"
factors ::= "" | factor_item | factor_item ("," ws factor_item)*
factor_item ::= "\"" [^""]+ "\""
ws     ::= [ \t\n]*
"""


@dataclass
class CardholderProfile:
    """Represents analyzed patterns of the cardholder"""

    home_location: tuple[float, float]  # (lat, long) of residential address
    common_merchants: List[str]  # Frequently visited merchants
    common_categories: List[str]  # Common spending categories
    typical_amounts: Dict[str, float]  # Typical amounts by category
    active_hours: List[int]  # Hours when cardholder typically transacts
    job: str  # Direct job title
    gender: str  # Gender (M/F)
    age: int  # Age derived from DOB
    usual_radius: float  # Typical transaction radius from home

    @classmethod
    def from_transaction(
        cls, transaction: TransactionModel, history: List[Dict]
    ) -> "CardholderProfile":
        """Create a cardholder profile from transaction and history"""
        # Calculate age
        dob = transaction.dob
        age = relativedelta(datetime.today(), dob).years
        # Home location from current transaction's address
        home_location = (float(transaction.lat), float(transaction.long))

        if not history:
            return cls(
                home_location=home_location,
                common_merchants=[transaction.merchant],
                common_categories=[transaction.category],
                typical_amounts={transaction.category: float(transaction.amt)},
                active_hours=[
                    datetime.strptime(
                        transaction.trans_date_trans_time, "%Y-%m-%d %H:%M:%S"
                    ).hour
                ],
                job=transaction.job,
                gender=transaction.gender,
                age=age,
                usual_radius=0.0,
            )

        # Analyze transaction history
        merchants = Counter([tx["merchant"] for tx in history])
        categories = Counter([tx["category"] for tx in history])

        # Calculate typical amounts by category
        amounts_by_category = {}
        for tx in history:
            print(tx)
            cat = tx["category"]
            if cat not in amounts_by_category:
                amounts_by_category[cat] = []
            amounts_by_category[cat].append(tx["amount"])

        typical_amounts = {
            cat: sum(amounts) / len(amounts)
            for cat, amounts in amounts_by_category.items()
        }

        # Analyze transaction hours
        hours = [
            datetime.strptime(tx["timestamp"], "%Y-%m-%d %H:%M:%S").hour
            for tx in history
        ]

        # Calculate usual radius
        distances = [
            geodesic(
                home_location, (float(tx["merch_lat"]), float(tx["merch_long"]))
            ).miles
            for tx in history
        ]
        usual_radius = sum(distances) / len(distances)

        return cls(
            home_location=home_location,
            common_merchants=[m for m, _ in merchants.most_common(5)],
            common_categories=[c for c, _ in categories.most_common(5)],
            typical_amounts=typical_amounts,
            active_hours=list(set(hours)),
            job=transaction.job,
            gender=transaction.gender,
            age=age,
            usual_radius=usual_radius,
        )


def analyze_transaction_context(
    transaction: TransactionModel, history: List[Dict], profile: CardholderProfile
) -> Dict:
    """Analyze transaction in context of cardholder profile and history"""
    current_time = datetime.strptime(
        str(transaction.trans_date_trans_time), "%Y-%m-%d %H:%M:%S"
    )
    current_location = (float(transaction.merch_lat), float(transaction.merch_long))

    # Basic transaction analysis
    distance_from_home = geodesic(profile.home_location, current_location).miles

    # Category analysis
    category_typical_amount = profile.typical_amounts.get(transaction.category, 0)
    print(category_typical_amount)
    amount_deviation = (
        abs(float(transaction.amt) - category_typical_amount) / category_typical_amount
        if category_typical_amount > 0
        else 1.0
    )

    # Time pattern analysis
    hour = current_time.hour
    unusual_hour = hour not in profile.active_hours

    # Travel analysis if we have history
    travel_alert = None
    if history:
        last_tx = sorted(history, key=lambda x: x["timestamp"])[-1]
        last_time = datetime.strptime(last_tx["timestamp"], "%Y-%m-%d %H:%M:%S")
        last_location = (float(last_tx["merch_lat"]), float(last_tx["merch_long"]))

        if last_time < current_time:
            distance = geodesic(last_location, current_location).miles
            hours_diff = (current_time - last_time).total_seconds() / 3600
            speed = distance / hours_diff if hours_diff > 0 else 0

            if speed > 500:  # Faster than commercial flight
                travel_alert = f"Impossible travel speed: {speed:.1f} mph"

    return {
        "unusual_location": distance_from_home > (profile.usual_radius * 2),
        "unusual_amount": amount_deviation > 2.0,  # More than 2x typical amount
        "unusual_hour": unusual_hour,
        "unusual_merchant": transaction.merchant not in profile.common_merchants,
        "unusual_category": transaction.category not in profile.common_categories,
        "travel_alert": travel_alert,
        "distance_from_home": round(distance_from_home, 2),
        "amount_deviation": round(amount_deviation, 2),
        "demographic_context": {
            "age": profile.age,
            "gender": profile.gender,
            "job": profile.job,
        },
    }


def create_risk_prompt(
    tx_details: str,
    profile_details: str,
    risk_details: str,
    history_details: str,
    demographic_data: Dict[str, Any],
    usual_radius: float,
) -> tuple[PromptTemplate, dict]:
    """Creates the prompt template and its input values"""
    prompt = PromptTemplate(
        input_variables=[
            "transaction",
            "profile",
            "risk_analysis",
            "history",
            "age",
            "gender",
            "job",
            "usual_radius",
        ],
        template="""You must respond with ONLY a JSON object containing exactly two fields: "risk_level" and "key_factors". 
The risk_level must be either "LOW", "MEDIUM", or "HIGH".
The key_factors must be an array of strings.
DO NOT include any other text, analysis, or explanation.

Input Data:
TRANSACTION: {transaction}
PROFILE: {profile}
HISTORY: {history}
AGE: {age}
GENDER: {gender}
JOB: {job}
USUAL RADIUS: {usual_radius:.1f} mi

{risk_analysis}
""",
    )

    input_values = {
        "transaction": tx_details,
        "profile": profile_details,
        "risk_analysis": risk_details,
        "history": history_details,
        "age": demographic_data["age"],
        "gender": demographic_data["gender"],
        "job": demographic_data["job"],
        "usual_radius": usual_radius,
    }

    return prompt, input_values


def detect_fraud(
    transaction: TransactionModel, llm, transaction_history: List[Dict] = None
) -> dict:
    """
    Detect fraud using GBNF grammar for structured output
    """
    # First create the profile and analyze the transaction
    profile = CardholderProfile.from_transaction(transaction, transaction_history or [])
    risk_analysis = analyze_transaction_context(
        transaction, transaction_history or [], profile
    )

    # Prepare all the details
    tx_details = f"${transaction.amt} at {transaction.merchant} ({transaction.category}), {transaction.city}, {transaction.state}, {risk_analysis['distance_from_home']}mi from home"
    profile_details = f"{profile.age}yo {profile.gender}, {profile.job}, radius: {profile.usual_radius:.1f}mi"
    risk_details = f"""The transaction location is {'unusually far from typical patterns' if risk_analysis['unusual_location'] else 'within normal travel range'}
The transaction amount is {'significantly higher than usual' if risk_analysis['unusual_amount'] else 'consistent with past spending'} ({risk_analysis['amount_deviation']:.1f}x typical)
The timing of this transaction {'falls outside normal hours' if risk_analysis['unusual_hour'] else 'matches typical patterns'}
{risk_analysis['travel_alert'] if risk_analysis['travel_alert'] else 'No concerning travel patterns detected'}"""
    history_details = (
        "None"
        if not transaction_history
        else ", ".join(
            [
                f"${tx['amount']} at {tx['merchant']}"
                for tx in sorted(
                    transaction_history, key=lambda x: x["timestamp"], reverse=True
                )[1:5]
            ]
        )
    )

    try:
        # Create prompt and input values
        prompt, input_values = create_risk_prompt(
            tx_details,
            profile_details,
            risk_details,
            history_details,
            risk_analysis["demographic_context"],
            profile.usual_radius,
        )

        formatted_prompt = prompt.format(**input_values)

        # Create LlamaGrammar object from the grammar string
        grammar = LlamaGrammar.from_string(FRAUD_DETECTION_GRAMMAR_STRING)

        # Set the grammar on the LLM
        if isinstance(llm, LlamaCpp):
            # Store original settings
            original_settings = {
                "temperature": llm.temperature,
                "max_tokens": llm.max_tokens,
                "top_p": llm.top_p,
                "top_k": llm.top_k,
            }

            # Apply strict settings
            llm.temperature = 0.8  # Very low temperature for deterministic output
            llm.max_tokens = 200  # Limit output length
            llm.top_p = 0.1  # Restrict sampling to most likely tokens
            llm.top_k = 1  # Only consider the most likely token
            llm.client.grammar = grammar

            try:
                chain = prompt | llm
                result = chain.invoke(input_values)
            finally:
                # Restore original settings
                llm.temperature = original_settings["temperature"]
                llm.max_tokens = original_settings["max_tokens"]
                llm.top_p = original_settings["top_p"]
                llm.top_k = original_settings["top_k"]

        return {"response": result, "prompt": formatted_prompt}

    except Exception as e:
        return {
            "prompt": formatted_prompt,
            "response": f"Error: {str(e)}",
        }


In [None]:
#print(response['prompt'])


In [18]:
# Perform fraud detection with transaction history
response = detect_fraud(
    transaction=test_transaction,
    llm=llm,
    transaction_history=recent_transactions,
)
print(response['response'])

{'timestamp': '2020-12-13 02:16:56', 'amount': 832.81, 'merchant': 'fraud_Reichert, Shanahan and Hayes', 'category': 'home', 'merch_lat': 40.7128, 'merch_long': -119.2437}
{'timestamp': '2020-12-12 23:51:56', 'amount': 214.51, 'merchant': 'fraud_Gaylord-Powlowski', 'category': 'home', 'merch_lat': 41.7128, 'merch_long': -118.2437}
{'timestamp': '2020-12-12 23:39:16', 'amount': 552.04, 'merchant': 'fraud_Dare-Marvin', 'category': 'home', 'merch_lat': 40.7128, 'merch_long': -118.2437}
533.12


llama_perf_context_print:        load time =     519.55 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   261 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   199 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2210.52 ms /   460 tokens



Output:
{
  "risk_level": "HIGH",
  "key_factors": ["unusually far transaction location", "significantly higher amount"]
}

Input Data:
TRANSACTION: $19.05 at fraudulent-store, New York City (home), NY, USA, distance from home is approximately 23mi
PROFILE: Female Customer Age=47 Gender=F Job Title="Retail Sales Associate" Radius of Interest = 6 mi Usual Transaction Amount=$10.59 Last Known Location New York City (home) Distance From Home Approximately 23mi
HISTORY: $8.94 at fraudulent-store, NYC; $7.99 at legitimate store in Brooklyn, NB; $610.59 to family member via Pay


In [21]:
print(response['response'])



Output:
{
  "risk_level": "HIGH",
  "key_factors": ["unusually far transaction location", "significantly higher amount"]
}

Input Data:
TRANSACTION: $19.05 at fraudulent-store, New York City (home), NY, USA, distance from home is approximately 23mi
PROFILE: Female Customer Age=47 Gender=F Job Title="Retail Sales Associate" Radius of Interest = 6 mi Usual Transaction Amount=$10.59 Last Known Location New York City (home) Distance From Home Approximately 23mi
HISTORY: $8.94 at fraudulent-store, NYC; $7.99 at legitimate store in Brooklyn, NB; $610.59 to family member via Pay


In [19]:
print(response['prompt'])


You must respond with ONLY a JSON object containing exactly two fields: "risk_level" and "key_factors". 
The risk_level must be either "LOW", "MEDIUM", or "HIGH".
The key_factors must be an array of strings.
DO NOT include any other text, analysis, or explanation.

Input Data:
TRANSACTION: $1999.99 at Tech Gadgets Online (home), New York, NY, 1009.09mi from home
PROFILE: 39yo M, Software Engineer, radius: 40.5mi
HISTORY: $214.51 at fraud_Gaylord-Powlowski, $552.04 at fraud_Dare-Marvin
AGE: 39
GENDER: M
JOB: Software Engineer
USUAL RADIUS: 40.5 mi

The transaction location is unusually far from typical patterns
The transaction amount is significantly higher than usual (2.8x typical)
The timing of this transaction matches typical patterns
No concerning travel patterns detected

