In [4]:
import spacy
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler

# Load model
nlp = spacy.load("en_core_web_sm")


insider_trading_terms = [
    "buy before announcement",
    "sell before news",
    "confidential merger information",
    "material nonpublic information",
    "MNPI",
    "tip from insider",
    "quiet period violation",
    "trading window closed",
    "blackout period"
]

market_manipulation_terms = [
    "pump and dump",
    "coordinated buying",
    "artificial price inflation",
    "wash trading",
    "spoofing",
    "layering orders",
    "painting the tape",
    "front running"
]

money_laundering_terms = [
    "structuring deposits",
    "smurfing",
    "shell company transfer",
    "offshore account",
    "cash intensive business",
    "high risk jurisdiction",
    "beneficial owner concealment",
    "unusual wire transfer pattern"
]

gdpr_violation_terms = [
    "personal data sharing",
    "customer information disclosed",
    "privacy breach",
    "consent not obtained",
    "data protection violation",
    "right to be forgotten ignored",
    "cross-border data transfer"
]

# -----------------------------
# 2. Create PhraseMatcher
# -----------------------------
compliance_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

compliance_matcher.add(
    "INSIDER_TRADING",
    [nlp.make_doc(t) for t in insider_trading_terms]
)
compliance_matcher.add(
    "MARKET_MANIPULATION",
    [nlp.make_doc(t) for t in market_manipulation_terms]
)
compliance_matcher.add(
    "MONEY_LAUNDERING",
    [nlp.make_doc(t) for t in money_laundering_terms]
)
compliance_matcher.add(
    "GDPR_VIOLATION",
    [nlp.make_doc(t) for t in gdpr_violation_terms]
)

# -----------------------------
# 3. Add EntityRuler for patterns
# -----------------------------
ruler = nlp.add_pipe("entity_ruler", before="ner")

patterns = [
    # Stock ticker (simple pattern like AAPL, TSLA, etc.)
    {"label": "STOCK_TICKER", "pattern": [{"IS_UPPER": True, "LENGTH": {"<=": 5}}]},

    # Transaction IDs
    {"label": "TRANSACTION_ID", "pattern": [{"TEXT": {"REGEX": "^TXN-[A-Z0-9]+$"}}]},

    # Account numbers
    {"label": "ACCOUNT_NUMBER", "pattern": [{"TEXT": {"REGEX": "^ACC-[0-9]+$"}}]},

    # Large amounts like $500000
    {"label": "LARGE_AMOUNT", "pattern": [{"TEXT": {"REGEX": "^[\\$][0-9,]+$"}}]},

    # Suspicious actions
    {"label": "SUSPICIOUS_ACTION", "pattern": [{"LOWER": "urgent"}, {"LOWER": "wire"}, {"LOWER": "transfer"}]},
    {"label": "SUSPICIOUS_ACTION", "pattern": [{"LOWER": "delete"}, {"LOWER": "all"}, {"LOWER": "emails"}]},
]

ruler.add_patterns(patterns)

# -----------------------------
# 4. Test communication text
# -----------------------------
communication = """
From: trader@globalfinance.com
To: colleague@globalfinance.com
Subject: Quick heads up

Hey ‚Äî just got confidential merger information from our client.
You might want to buy before announcement. Keep this quiet, we're in the blackout period.

URGENT: Need to execute wire transfer of $500000 to account ACC-98765432.
Use TXN-ABC123XYZ789. Delete all emails after completion.

This involves MNPI and some offshore account movement.
"""

doc = nlp(communication)

# -----------------------------
# 5. Run PhraseMatcher
# -----------------------------
matches = compliance_matcher(doc)

print("üö® COMPLIANCE ALERTS DETECTED")
print("=" * 50)

for match_id, start, end in matches:
    violation_type = nlp.vocab.strings[match_id]
    matched_text = doc[start:end].text
    print(f"\n‚ö†Ô∏è Violation Type: {violation_type}")
    print(f"Matched Text: {matched_text}")
    print("Risk Level: HIGH")

üö® COMPLIANCE ALERTS DETECTED

‚ö†Ô∏è Violation Type: INSIDER_TRADING
Matched Text: confidential merger information
Risk Level: HIGH

‚ö†Ô∏è Violation Type: INSIDER_TRADING
Matched Text: buy before announcement
Risk Level: HIGH

‚ö†Ô∏è Violation Type: INSIDER_TRADING
Matched Text: blackout period
Risk Level: HIGH

‚ö†Ô∏è Violation Type: INSIDER_TRADING
Matched Text: MNPI
Risk Level: HIGH

‚ö†Ô∏è Violation Type: MONEY_LAUNDERING
Matched Text: offshore account
Risk Level: HIGH


In [5]:
# -----------------------------
# 6. Print extracted entities
# -----------------------------
print("\nüìå FINANCIAL ENTITIES EXTRACTED")
print("=" * 50)

for ent in doc.ents:
    print(f"{ent.label_:20} | {ent.text}")


üìå FINANCIAL ENTITIES EXTRACTED
MONEY                | 500000
ACCOUNT_NUMBER       | ACC-98765432
STOCK_TICKER         | TXN
SUSPICIOUS_ACTION    | Delete all emails
STOCK_TICKER         | MNPI


In [6]:
# -----------------------------
# 7. Simple risk scoring
# -----------------------------
risk_score = 0
risk_factors = []

for ent in doc.ents:
    if ent.label_ == "SUSPICIOUS_ACTION":
        risk_score += 30
        risk_factors.append(f"Suspicious action: {ent.text}")
    elif ent.label_ == "LARGE_AMOUNT":
        risk_score += 20
        risk_factors.append(f"Large amount: {ent.text}")

for match_id, start, end in matches:
    risk_score += 25
    risk_factors.append(f"Compliance phrase: {doc[start:end].text}")

print("\nüßÆ RISK ASSESSMENT")
print("=" * 50)
print(f"Risk Score: {risk_score}/100")
print("Risk Level:", "CRITICAL" if risk_score >= 60 else "HIGH" if risk_score >= 30 else "MEDIUM")

print("\nRisk Factors:")
for rf in risk_factors:
    print("-", rf)


üßÆ RISK ASSESSMENT
Risk Score: 155/100
Risk Level: CRITICAL

Risk Factors:
- Suspicious action: Delete all emails
- Compliance phrase: confidential merger information
- Compliance phrase: buy before announcement
- Compliance phrase: blackout period
- Compliance phrase: MNPI
- Compliance phrase: offshore account
