In [21]:
!pip install requests beautifulsoup4 faiss-cpu sentence-transformers transformers torch
!pip install accelerate>=0.26.0
import os
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"




In [22]:
from huggingface_hub import login

login(token="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")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Model name
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Use float16 for faster inference (optional)
    device_map="auto",  # Automatically use available GPU(s)
)

# Chat history (you can add to this to continue the conversation)
chat_history = []

In [None]:
# Input prompt
user_prompt = "What is the capital of France?"
chat_history.append({"role": "user", "content": user_prompt})

# Tokenize the input using the tokenizer.apply_chat_template method
# See https://huggingface.co/docs/transformers/main/en/chat_templating for more details
inputs = tokenizer.apply_chat_template(
    chat_history,
    add_generation_prompt=True,  # Add a prompt at the end to trigger the model to generate
    return_tensors="pt"  # Return PyTorch tensors
).to(model.device)

# Generate text
generate_ids = model.generate(
    inputs,
    max_length=4096,  # Increased max_length (Llama 2 has a context window of 4096)
    do_sample=True,
    top_p=0.9,  # Use nucleus sampling
    temperature=0.2,  # Lower temperature for more focused responses
    pad_token_id=tokenizer.eos_token_id,  # Set pad token to avoid warnings
)

# Decode the generated text
decoded_output = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

# Isolate the assistant's response from the decoded output
assistant_response = decoded_output.split("[/INST]")[-1].strip()

# Add the assistant's response to the chat history
chat_history.append({"role": "assistant", "content": assistant_response})

# Print the response
print(f"Assistant: {assistant_response}")

Assistant: The capital of France is Paris.


In [15]:
# If using Colab's local storage (files will be deleted when runtime ends):
FAISS_INDEX_FILE = "/content/regulatory_index.faiss"
CUSTOMER_INDEX_FILE = "/content/customer_index.faiss"
last_update_file = "/content/last_update.txt"
last_reindex_file = "/content/last_reindex.txt"
regulatory_metadata_file = "/content/regulatory_metadata.json"
customer_metadata_file = "/content/customer_metadata.json"

GOVERNMENT_LINKS = [
    "https://www.fincen.gov/resources/statutes-regulations/guidance",
    "https://ofac.treasury.gov/sanctions-programs-and-country-information",
]
UPDATE_INTERVAL_DAYS = 7
RE_INDEX_INTERVAL_DAYS = 30
CHUNK_SIZE = 512
EMBEDDING_MODEL_NAME = "all-mpnet-base-v2"
LLM_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
K_RETRIEVAL = 5

In [16]:
import requests
from bs4 import BeautifulSoup
import time
import datetime
import faiss
import numpy as np
import re
import json

from sentence_transformers import SentenceTransformer

# --- Data Structures ---
class Document:
    def __init__(self, url, title, content, last_updated):
        self.url = url
        self.title = title
        self.content = content
        self.last_updated = last_updated

# --- Data Loading and Preprocessing ---

def preprocess_transaction(transaction):
    """Converts a transaction dictionary to a descriptive text string."""
    return (
        f"Customer {transaction['customer_id']} {transaction['transaction_type']}ed "
        f"${transaction['amount']:.2f} {transaction['currency']} in {transaction['location']} on {transaction['date']}."
    )

def aggregate_transactions(transactions):
    """Groups transactions by customer_id and creates a chronological narrative."""
    customer_narratives = {}
    for transaction in transactions:
        customer_id = transaction["customer_id"]
        narrative = preprocess_transaction(transaction)
        if customer_id not in customer_narratives:
            customer_narratives[customer_id] = []
        customer_narratives[customer_id].append(narrative)

    for customer_id, narratives in customer_narratives.items():
        customer_narratives[customer_id] = " ".join(narratives)

    return customer_narratives

def scrape_website(url):
    """Scrapes a website for relevant content."""
    try:
        response = requests.get(url)
        response.raise_for_status()

        soup = BeautifulSoup(response.content, "html.parser")
        main_content = soup.find("main") or soup.find("body")
        if not main_content:
            return None

        title = soup.title.string.strip() if soup.title else url

        for script in main_content(["script", "style"]):
            script.decompose()

        text = main_content.get_text(separator=" ", strip=True)

        return Document(url, title, text, datetime.date.today())

    except requests.exceptions.RequestException as e:
        print(f"Error scraping {url}: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred while scraping {url}: {e}")
        return None


def chunk_document(document):
    """Splits a document into smaller chunks."""
    chunks = []
    start = 0
    doc_text = document.content
    while start < len(doc_text):
        end = min(start + CHUNK_SIZE, len(doc_text))
        # Try to find a sentence boundary near the chunk end
        boundary = doc_text.rfind(". ", start, end)
        if boundary == -1:
            boundary = end
        else:
            boundary += 2  # Include the period and space

        chunks.append(
            {
                "text": doc_text[start:boundary],
                "metadata": {
                    "source": document.url,
                    "title": document.title,
                    "last_updated": document.last_updated.isoformat(),
                    "doc_id": document.url
                },
            }
        )
        start = boundary
    return chunks

In [17]:
# --- Retrieval System ---

def create_embeddings(texts, embedding_model):
    """Generates embeddings for a list of texts."""
    return embedding_model.encode(texts)

def initialize_faiss_index(dimension):
    """Creates a FAISS index."""
    return faiss.IndexFlatL2(dimension)

def add_to_index(index, embeddings):
    """Adds embeddings to a FAISS index."""
    embeddings = np.array(embeddings).astype("float32")
    index.add(embeddings)

def create_index_and_save(embeddings, metadata_list, index_path, index_file):
    dimension = len(embeddings[0])
    index = initialize_faiss_index(dimension)
    add_to_index(index, embeddings)
    faiss.write_index(index, index_path)

    # Enhanced saving with validation and text inclusion:
    saved_data = []
    for i, chunk in enumerate(metadata_list):
        if not isinstance(chunk, dict) or "text" not in chunk or "metadata" not in chunk:
            print(f"Error: Invalid chunk format at index {i}: {chunk}")
            continue  # Skip this chunk

        if not isinstance(chunk["metadata"], dict):
            print(f"Error: Invalid metadata format at index {i}: {chunk['metadata']}")
            continue

        saved_data.append({
            "text": chunk["text"],  # Include the text content
            "metadata": chunk["metadata"]
        })

    with open(index_file, "w") as f:
        json.dump(saved_data, f, indent=4)  # Use indent for readability
    return index

def load_index(index_path, index_file):
    try:
        index = faiss.read_index(index_path)
    except Exception as e:
        print(f"Error reading index from {index_path}: {e}")
        return None, []

    try:
        with open(index_file, "r") as f:
            loaded_data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError) as e:
        print(f"Error loading metadata from {index_file}: {e}")
        return index, []

    metadata_list = []
    for i, chunk in enumerate(loaded_data):
        if not isinstance(chunk, dict) or "text" not in chunk or "metadata" not in chunk:
            print(f"Error: Invalid chunk format at index {i} in loaded data: {chunk}")
            continue

        if not isinstance(chunk["metadata"], dict):
            print(f"Error: Invalid metadata format at index {i}: {chunk['metadata']}")
            continue

        metadata_list.append({
            "text": chunk["text"],
            "metadata": chunk["metadata"]
        })

    return index, metadata_list

def search_index(index, query_embedding, metadata_list, k=5):
    """Searches the FAISS index for similar embeddings."""
    D, I = index.search(np.array([query_embedding]).astype("float32"), k)
    results = []
    for i in I[0]:
        if i != -1:
            results.append(metadata_list[i])
    return results

In [18]:
# --- Prompting (Modified for Llama 2) ---
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_NAME)
def create_prompt(customer_narrative, relevant_chunks):
    """Constructs a prompt for the LLM."""
    prompt = f"""Analyze the following customer transaction history for potential money laundering activities, considering the provided regulatory context. Identify any suspicious patterns and explain your reasoning, citing relevant regulations.

Customer Transaction History:
{customer_narrative}

Regulatory Context:
"""
    for chunk in relevant_chunks:
        prompt += f"{chunk['text']} - Source: {chunk['metadata']['source']}\n"

    prompt += "\nAnalysis and Summary:"
    return prompt


# --- Text Generation (Modified for Llama 2) ---
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16
)

def generate_llama_summary(prompt, max_length=1000):
    """Generates a summary using the Llama 2 model."""
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():
            outputs = model.generate(**inputs, max_length=max_length, do_sample=True, top_k=50, top_p=0.95, num_return_sequences=1)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return summary
    except Exception as e:
        print(f"Error during text generation: {e}")
        return ""


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.38it/s]
Some parameters are on the meta device because they were offloaded to the disk and cpu.


In [19]:
# --- Main Application Logic ---

def load_and_preprocess_documents(urls):
    """Loads, scrapes, preprocesses, and chunks regulatory documents."""
    all_chunks = []
    documents = []

    for url in urls:
        doc = scrape_website(url)
        if doc:
            documents.append(doc)

    for doc in documents:
        chunks = chunk_document(doc)
        for chunk in chunks:
            chunk["metadata"]["doc_id"] = doc.url  # Assign the doc_id here
        all_chunks.extend(chunks)

    return all_chunks

def should_update_documents(last_update_file, interval_days):
    """Checks if it's time to update the regulatory documents."""
    if not os.path.exists(last_update_file):
        return True
    with open(last_update_file, "r") as f:
        last_update = datetime.datetime.fromisoformat(f.read())
    return (datetime.datetime.now() - last_update) > datetime.timedelta(days=interval_days)

def should_reindex_documents(last_reindex_file, interval_days):
    """Checks if it's time to re-index the regulatory documents."""
    if not os.path.exists(last_reindex_file):
        return True
    with open(last_reindex_file, "r") as f:
        last_reindex = datetime.datetime.fromisoformat(f.read())
    return (datetime.datetime.now() - last_reindex) > datetime.timedelta(days=interval_days)

def main():
    """Main function to run the RAG-based AML analysis."""

    embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    dimension = embedding_model.get_sentence_embedding_dimension()

    if should_update_documents(last_update_file, UPDATE_INTERVAL_DAYS):
        print("Updating regulatory documents...")
        document_chunks = load_and_preprocess_documents(GOVERNMENT_LINKS)

        if should_reindex_documents(last_reindex_file, RE_INDEX_INTERVAL_DAYS):
            print("Creating embeddings for regulatory documents...")
            document_embeddings = create_embeddings([chunk["text"] for chunk in document_chunks], embedding_model)

            print("Creating index for regulatory documents...")
            metadata_list = document_chunks  # Use the entire document_chunks list
            regulatory_index = create_index_and_save(document_embeddings, metadata_list, FAISS_INDEX_FILE, regulatory_metadata_file)

            with open(last_reindex_file, "w") as f:
                f.write(datetime.datetime.now().isoformat())

        with open(last_update_file, "w") as f:
            f.write(datetime.datetime.now().isoformat())
    else:
        print("Loading existing regulatory index...")
        regulatory_index, metadata_list = load_index(FAISS_INDEX_FILE, regulatory_metadata_file)  # Load the index here



    # Load customer data directly:
    customer_transactions = [
        {"customer_id": "A123", "date": "2023-10-26", "amount": 15000, "currency": "USD", "location": "NY", "transaction_type": "deposit"},
        {"customer_id": "A123", "date": "2023-10-27", "amount": 14500, "currency": "USD", "location": "CA", "transaction_type": "transfer"},
        {"customer_id": "B456", "date": "2023-10-28", "amount": 500, "currency": "USD", "location": "NY", "transaction_type": "deposit"},
    ]
    customer_narratives = aggregate_transactions(customer_transactions)

    # Analyze each customer
    for customer_id, narrative in customer_narratives.items():
        print(f"\nAnalyzing customer: {customer_id}")
        customer_embedding = create_embeddings([narrative], embedding_model)[0]

        # Retrieve relevant regulatory context
        relevant_chunks = search_index(regulatory_index, customer_embedding, metadata_list, k=K_RETRIEVAL)

        # Create prompt and get LLM response using gemini
        prompt = create_prompt(narrative, relevant_chunks)
        summary = generate_llama_summary(prompt)  # Make sure this function is defined

        print("Analysis:")
        print(summary)

In [None]:
if __name__ == "__main__":
    main()

Updating regulatory documents...
Creating embeddings for regulatory documents...
Creating index for regulatory documents...


RuntimeError: Error in __cdecl faiss::FileIOWriter::FileIOWriter(const char *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\impl\io.cpp:102: Error: 'f' failed: could not open /content/regulatory_index.faiss for writing: No such file or directory