In [1]:
import re, math, warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler, QuantileTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

from sklearn.isotonic import IsotonicRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

import networkx as nx

from pyod.models.hbos import HBOS
from pyod.models.copod import COPOD

np.random.seed(42)

In [2]:
df = pd.read_excel('data/final_risk_data.xlsx')

# –∏–∑–º–µ–Ω–µ–Ω–∏–µ —Ç–∏–ø–æ–≤ –¥–∞–Ω–Ω—ã—Ö
df['date'] = pd.to_datetime(df['date']).dt.normalize()
for i in ['debit_amount', 'credit_amount']:
    if i in df.columns:
        df[i] = pd.to_numeric(df[i], errors='coerce')
        
# —Å–æ–∑–¥–∞–Ω–∏–µ –∫–æ–ª–æ–Ω–∫–∏ —Å —Å—É–º–º–æ–π –æ–ø–µ—Ä–∞—Ü–∏–∏
df["amount"] = np.where(df["debit_amount"].notna(), df["debit_amount"], df["credit_amount"])
df["debit_amount"] = df["debit_amount"].fillna(0.0)
df["credit_amount"] = df["credit_amount"].fillna(0.0)

# —Å–æ–∑–∂–∞–Ω–∏–µ –≤—Ä–µ–º–µ–Ω–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'].isin([5,6]).astype(int)
df['is_month_end'] = df['date'].dt.is_month_end.astype(int)
df['month'] = df['date'].dt.month
df["weekofyear"]   = df["date"].dt.isocalendar().week.astype(int)

# –°—É—Ç–æ—á–Ω—ã–µ —Ñ–∏—á–∏
df['daily_total_debit'] = df.groupby(['debit_inn', 'date'])['debit_amount'].transform('sum')
df['daily_total_credit'] = df.groupby(['credit_inn', 'date'])['credit_amount'].transform('sum')
# –ø–æ—Å—á–µ—Ç —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π –∑–∞ —Å—É—Ç–∫–∏
df["daily_debit_transaction_count"]    = df.groupby(["debit_inn","date"])["debit_amount"].transform(lambda s: (s > 0).sum())
df["daily_credit_transaction_count"]    = df.groupby(["credit_inn","date"])["credit_amount"].transform(lambda s: (s > 0).sum())
# —É–Ω–∏–∫–∞–ª—å–Ω—ã–µ –æ—Ç–ø—Ä–∞–≤–∏—Ç–µ–ª–∏ –∏ –ø–æ–ª—É—á–∞—Ç–µ–ª–∏ –≤ —Å—É—Ç–∫–∏
df["unique_recipients_per_day"] = df.groupby(["debit_inn","date"])["credit_inn"].transform("nunique")
df["unique_receivers_per_day"] = df.groupby(["credit_inn","date"])["debit_inn"].transform("nunique")

# –î–æ–ª—è —Ç–µ–∫—É—â–µ–π –æ–ø–µ—Ä–∞—Ü–∏–∏ –≤ —Å—É—Ç–æ—á–Ω–æ–º –æ–±—ä—ë–º–µ
df["daily_debit_percent"] = (df["debit_amount"] / df["daily_total_debit"].replace(0, np.nan)).fillna(0.0)
df["daily_credit_percent"] = (df["credit_amount"] / df["daily_total_credit"].replace(0, np.nan)).fillna(0.0)

# –ò–Ω—Ç–µ—Ä–≤–∞–ª—ã –º–µ–∂–¥—É –æ–ø–µ—Ä–∞—Ü–∏—è–º–∏ (–≤ –¥–Ω—è—Ö)
df["days_since_last_txn_debit"]  = df.groupby("debit_inn")["date"].diff().dt.days.fillna(9999)
df["days_since_last_txn_credit"] = df.groupby("credit_inn")["date"].diff().dt.days.fillna(9999)

def add_rolling_side(df: pd.DataFrame,
                     side: str,
                     amt_col: str,
                     windows=(7, 14, 30, 90)) -> pd.DataFrame:
    """
    –°–æ–∑–¥–∞–µ—Ç —Ä–æ–ª–ª–∏–Ω–≥–∏ –ø–æ —Å—É–º–º–∞–º/–∫–æ–ª-–≤—É —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π –∑–∞ –æ–∫–Ω–∞ windows –¥–ª—è –∑–∞–¥–∞–Ω–Ω–æ–π —Å—Ç–æ—Ä–æ–Ω—ã.
    side: 'debit' –∏–ª–∏ 'credit'
    amt_col: –∏–º—è —Å—Ç–æ–ª–±—Ü–∞ —Å —Å—É–º–º–æ–π –¥–ª—è —ç—Ç–æ–π —Å—Ç–æ—Ä–æ–Ω—ã (–Ω–∞–ø—Ä–∏–º–µ—Ä, 'debit_amount' / 'credit_amount')
    """
    inn_col = f"{side}_inn"
    out_parts = []

    # –ë–µ—Ä–µ–º —Ç–æ–ª—å–∫–æ –Ω—É–∂–Ω—ã–µ –∫–æ–ª–æ–Ω–∫–∏ (—É—Å–∫–æ—Ä—è–µ—Ç groupby)
    need_cols = [inn_col, "date", amt_col]
    sub_all = df[need_cols].copy()
    sub_all[amt_col] = sub_all[amt_col].fillna(0.0)

    for inn, sub in sub_all.groupby(inn_col, sort=False):
        # —Å—É—Ç–æ—á–Ω–∞—è –∞–≥—Ä–µ–≥–∞—Ü–∏—è –ø–æ –∞–∫–∫–∞—É–Ω—Ç—É
        daily = (sub.groupby("date", as_index=True)[amt_col]
                   .sum()
                   .to_frame("amt_day")
                   .sort_index())

        # –Ω–µ–ø—Ä–µ—Ä—ã–≤–Ω—ã–π –∫–∞–ª–µ–Ω–¥–∞—Ä–Ω—ã–π –∏–Ω–¥–µ–∫—Å (—á—Ç–æ–±—ã –æ–∫–Ω–∞ –Ω–µ "—Å–∂–∏–º–∞–ª–∏—Å—å")
        idx = pd.date_range(daily.index.min(), daily.index.max(), freq="D")
        daily = daily.reindex(idx, fill_value=0.0)
        daily.index.name = "date"

        # –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä "–±—ã–ª–∞ –ª–∏ –æ–ø–µ—Ä–∞—Ü–∏—è –≤ –¥–µ–Ω—å" (–¥–ª—è —Ä–æ–ª–ª–∏–Ω–≥–∞ –∫–æ–ª–∏—á–µ—Å—Ç–≤–∞)
        daily["tx_day"] = (daily["amt_day"] > 0).astype(int)

        # —Ä–æ–ª–ª–∏–Ω–≥–∏
        for W in windows:
            daily[f"{side}_roll_sum_{W}d"]  = daily["amt_day"].rolling(W, min_periods=1).sum()
            daily[f"{side}_roll_cnt_{W}d"]  = daily["tx_day"].rolling(W, min_periods=1).sum()
            daily[f"{side}_roll_mean_{W}d"] = daily["amt_day"].rolling(W, min_periods=1).mean()
            daily[f"{side}_roll_std_{W}d"]  = daily["amt_day"].rolling(W, min_periods=1).std().fillna(0.0)
            daily[f"{side}_roll_p95_{W}d"]  = daily["amt_day"].rolling(W, min_periods=1).quantile(0.95)

        daily = daily.reset_index()
        daily[inn_col] = inn
        out_parts.append(daily[["date", inn_col] + [c for c in daily.columns if c.startswith(f"{side}_roll_")]])

    rolls = pd.concat(out_parts, ignore_index=True)
    return df.merge(rolls, on=["date", inn_col], how="left")



# –ù–æ–≤—ã–µ —Ä–æ–ª–ª–∏–Ω–≥–∏ 7/14/30/90 –¥–ª—è –¥–µ–±–µ—Ç–∞ –∏ –∫—Ä–µ–¥–∏—Ç–∞
WINDOWS = (7, 14, 30, 90)
df = add_rolling_side(df, side="debit",  amt_col="debit_amount",  windows=WINDOWS)
df = add_rolling_side(df, side="credit", amt_col="credit_amount", windows=WINDOWS)

# ==== 1. –í—Å–ø–ª–µ—Å–∫–∏ —Å—É–º–º (amount_spike_ratio_7d) ====
df["debit_amount_spike_ratio_7d"]  = df["debit_roll_sum_7d"]  / (df["debit_roll_sum_30d"]/4 + 1e-6)
df["credit_amount_spike_ratio_7d"] = df["credit_roll_sum_7d"] / (df["credit_roll_sum_30d"]/4 + 1e-6)

# ==== 2. –í—Å–ø–ª–µ—Å–∫–∏ –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏ (tx_rate_spike_7d) ====
df["debit_tx_rate_spike_7d"]  = df["debit_roll_cnt_7d"]  / (df["debit_roll_cnt_30d"]/4 + 1e-6)
df["credit_tx_rate_spike_7d"] = df["credit_roll_cnt_7d"] / (df["credit_roll_cnt_30d"]/4 + 1e-6)

# ==== 3. Fan-out / Fan-in (—É–Ω–∏–∫–∞–ª—å–Ω—ã–µ –∫–æ–Ω—Ç—Ä–∞–≥–µ–Ω—Ç—ã) ====
# –î–ª—è –¥–µ–±–µ—Ç–∞ ‚Äî fan-out (—Å–∫–æ–ª—å–∫–æ —Ä–∞–∑–Ω—ã—Ö –ø–æ–ª—É—á–∞—Ç–µ–ª–µ–π –Ω–∞ –æ–ø–µ—Ä–∞—Ü–∏—é)
df["debit_fan_out_ratio"] = df["unique_recipients_per_day"] / (df["daily_debit_transaction_count"] + 1e-6)

# –î–ª—è –∫—Ä–µ–¥–∏—Ç–∞ ‚Äî fan-in (—Å–∫–æ–ª—å–∫–æ —Ä–∞–∑–Ω—ã—Ö –æ—Ç–ø—Ä–∞–≤–∏—Ç–µ–ª–µ–π –Ω–∞ –æ–ø–µ—Ä–∞—Ü–∏—é)
df["credit_fan_in_ratio"] = (
    df.groupby(["credit_inn", "date"])["debit_inn"].transform("nunique") /
    (df["daily_credit_transaction_count"] + 1e-6)
)

# ==== 4. –î–∏—Å–±–∞–ª–∞–Ω—Å –ø–æ—Ç–æ–∫–æ–≤ (in_out_ratio_30d) ====
df["in_out_ratio_30d"] = (df["credit_roll_sum_30d"] + 1e-6) / (df["debit_roll_sum_30d"] + 1e-6)

# ==== 5. –í–æ–ª–∞—Ç–∏–ª—å–Ω–æ—Å—Ç—å –æ–±—ä—ë–º–æ–≤ (amount_volatility_30d) ====
df["debit_amount_volatility_30d"]  = df["debit_roll_std_30d"]  / (df["debit_roll_mean_30d"]  + 1e-6)
df["credit_amount_volatility_30d"] = df["credit_roll_std_30d"] / (df["credit_roll_mean_30d"] + 1e-6)


# –ö—Ä—É–≥–ª—ã–µ —Å—É–º–º—ã
df["round_10k"]  = ((df["amount"] % 10000)  == 0).astype(int)
df["round_100k"] = ((df["amount"] % 100000) == 0).astype(int)
df["round_large_amount"] = ((df["round_10k"]==1) | (df["round_100k"]==1)).astype(int)

df.head()

Unnamed: 0,date,debit_account,debit_name,debit_inn,credit_account,credit_name,credit_inn,debit_amount,credit_amount,purpose,...,debit_tx_rate_spike_7d,credit_tx_rate_spike_7d,debit_fan_out_ratio,credit_fan_in_ratio,in_out_ratio_30d,debit_amount_volatility_30d,credit_amount_volatility_30d,round_10k,round_100k,round_large_amount
0,2019-01-09,ded93f97f389bf2c,b736fcdbf591b1c2,d877722ca4e40f98,0c00203649ed677a,e19cc80da09f445f,8d9e0be733f77f1c,10.0,0.0,–∫–æ–º–∏—Å—Å–∏—è –≤–Ω—É—Ç—Ä–∏ —Å–±–µ—Ä–±–∞–Ω–∫–∞ –∑–∞ –ø–ø/–ø—Ç —á–µ—Ä–µ–∑ –¥–±–æ —Å...,...,3.999984,0.0,1.0,1000000.0,3.333322e-13,0.0,0.0,0,0,0
1,2019-01-09,ded93f97f389bf2c,b736fcdbf591b1c2,d877722ca4e40f98,7682fd5c32e028f5,796a88c244ffcc0a,6931e23e98703aa9,3000000.0,0.0,–æ–ø–ª–∞—Ç–∞ –ø–æ —Å—á–µ—Ç—É ‚Ññ 28 –æ—Ç date_9f241b636025 –ø–æ –¥...,...,3.999984,0.0,1.0,1000000.0,3.333322e-13,0.0,0.0,1,1,1
2,2019-01-10,ded93f97f389bf2c,b736fcdbf591b1c2,d877722ca4e40f98,0c00203649ed677a,e19cc80da09f445f,8d9e0be733f77f1c,10.0,0.0,–∫–æ–º–∏—Å—Å–∏—è –≤–Ω—É—Ç—Ä–∏ —Å–±–µ—Ä–±–∞–Ω–∫–∞ –∑–∞ –ø–ø/–ø—Ç —á–µ—Ä–µ–∑ –¥–±–æ —Å...,...,3.999992,0.0,1.0,1000000.0,3.32856e-13,1.410173,0.0,0,0,0
3,2019-01-10,ded93f97f389bf2c,b736fcdbf591b1c2,d877722ca4e40f98,e63cf19b76230a3d,21a73421ca00ae87,eed8b1f54ed4366c,4282.0,0.0,–æ–ø–ª–∞—Ç–∞ –ø–æ –¥–æ–≥–æ–≤–æ—Ä—É —ç–ª–µ–∫—Ç—Ä–æ—ç–Ω–µ—Ä–≥–∏—è –ø–æ –¥–æ–≥.‚Ññ1124...,...,3.999992,0.0,1.0,1000000.0,3.32856e-13,1.410173,0.0,0,0,0
4,2019-01-10,d658b4e51c5a5df5,f7185cc66db08da5,ad57c94e5b8df8f6,ded93f97f389bf2c,3e368a63959d285e,d877722ca4e40f98,0.0,1000000.0,–ø–µ—Ä–µ–≤–æ–¥ –¥–µ–Ω–µ–∂–Ω—ã—Ö —Å—Ä–µ–¥—Å—Ç–≤ –ø–æ –¥–æ–≥–æ–≤–æ—Ä—É –∑–∞–π–º–∞ ‚Ññ —á...,...,0.0,3.999984,1000000.0,1.0,3000000000000.0,0.0,0.0,1,1,1


In [3]:
# ============================================================
#  –ê–Ω–∞–ª–∏–∑ "–ù–∞–∑–Ω–∞—á–µ–Ω–∏—è –ø–ª–∞—Ç–µ–∂–∞": TF-IDF (char 3‚Äì5) + —Å—Ç–æ–ø-—Å–ª–æ–≤–∞ —Ä–∏—Å–∫–∞
# ============================================================

import re
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# ------------------------------------------------------------
# 1) –†–µ–≥—É–ª—è—Ä–Ω—ã–µ –ø–∞—Ç—Ç–µ—Ä–Ω—ã –≤—ã—Å–æ–∫–æ–≥–æ —Ä–∏—Å–∫–∞
# ------------------------------------------------------------
STOP_HIGH_PATTERNS = [
    r"\b–∑–∞–π–º\w*\b", r"\b–≤–æ–∑–≤—Ä–∞—Ç\W*(–∑–∞–π–º|–¥–æ–ª–≥)\w*\b", r"\b–ø–æ–≥–∞—à–µ–Ω\w*\W*(–∑–∞–π–º|–¥–æ–ª–≥)\w*\b",
    r"\b–¥–æ–ª–≥\w*\b", r"\b–æ–ø–ª–∞—Ç\w*\W*(–∑–∞–π–º|–¥–æ–ª–≥)\w*\b",
    r"\b–∫—Ä–∏–ø—Ç\w*\b", r"\b–±–∏—Ç–∫–æ–∏–Ω\w*\b", r"\busdt\b", r"\bbtc\b", r"\bcoin\b",
    r"\b–±–∏—Ä–∂\w*\b", r"\b–æ–±–º–µ–Ω\w*\b", r"\b–≤–∞–ª—é—Ç\w*\b",
    r"\b–Ω–∞–ª–∏—á\w*\b", r"\b–æ–±–Ω–∞–ª\w*\b", r"\b–≤—ã–¥–∞—á\w*\W*–Ω–∞–ª–∏—á\w*\b",
    r"\b–ø–µ—Ä–µ–≤–æ–¥\w*\W*(–Ω–∞|—Å)\W*–∫–∞—Ä—Ç\w*\b", r"\b–ø–µ—Ä–µ–≤–æ–¥\w*\W*—Ñ–∏–∑\w*\b",
    r"\b–ø–µ—Ä–µ–≤–æ–¥\w*\W*—Ä–æ–¥—Å—Ç–≤–µ–Ω\w*\–±", r"\b–ø–µ—Ä–µ–≤–æ–¥\w*\W*—Å—Ä–µ–¥—Å—Ç–≤\b",
    r"\b–±–µ–∑\W*–¥–æ–≥–æ–≤–æ—Ä\w*\b",
    r"\b–ø–æ–∂–µ—Ä—Ç–≤\w*\b", r"\b–±–ª–∞–≥–æ—Ç–≤–æ—Ä\w*\–±", r"\b–¥–∞—Ä–µ–Ω\w*\b",
    r"\b–∞–≥–µ–Ω—Ç—Å–∫\w*\W*–≤–æ–∑–Ω–∞–≥—Ä–∞–∂–¥\w*\–±", r"\b–∫–æ–º–∏—Å—Å–∏\w*\W*–≤–æ–∑–Ω–∞–≥—Ä–∞–∂–¥\w*\b",
    r"\b–≤–æ–∑–Ω–∞–≥—Ä–∞–∂–¥\w*\b",
    r"\b—Ü–µ—Å—Å\w*\–±", r"\b–ø–æ—Ä—É—á–∏—Ç–µ–ª\w*\b", r"\b–∑–∞–ª–æ–≥\w*\b",
    r"\b–æ—Ñ—à–æ—Ä\w*\–±", r"\b–∏–Ω–æ—Å—Ç—Ä–∞–Ω\w*\W*–ø–µ—Ä–µ–≤–æ–¥\w*\b", r"\bswift\b",
    r"\b–ª–∏—á–Ω\w*\W*–Ω—É–∂–¥\w*\–±", r"\b–ø–µ—Ä–µ–¥–∞—á\w*\W*–∞–∫—Ç–∏–≤\w*\–±",
    r"\b–≤–∫–ª–∞–¥\w*\–±", r"\b–¥–µ–ø–æ–∑–∏—Ç\w*\–±",
]
STOP_HIGH_RE = re.compile("|".join(STOP_HIGH_PATTERNS), flags=re.IGNORECASE)

# ------------------------------------------------------------
# 2) –û—á–∏—Å—Ç–∫–∞ —Ç–µ–∫—Å—Ç–∞
# ------------------------------------------------------------
def clean_text(s: str) -> str:
    s = str(s).lower()
    s = re.sub(r"[^a-z–∞-—è0-9\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

df["purpose_clean"] = df["purpose"].apply(clean_text)

# ------------------------------------------------------------
# 3) –ú–µ—Ç–∫–∞ –Ω–∞–ª–∏—á–∏—è —Å—Ç–æ–ø-—Å–ª–æ–≤ —Ä–∏—Å–∫–∞
# ------------------------------------------------------------
df["purpose_stopword_high"] = df["purpose_clean"].str.contains(STOP_HIGH_RE, na=False)

# ------------------------------------------------------------
# 4) TF-IDF (char-level 3‚Äì5-–≥—Ä–∞–º–º—ã) + SVD
# ------------------------------------------------------------
texts = df["purpose_clean"].astype(str).tolist()

tfidf = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3, 5),
    min_df=1
)
X_tfidf = tfidf.fit_transform(texts)
print(f"TF-IDF –º–∞—Ç—Ä–∏—Ü–∞: {X_tfidf.shape[0]} √ó {X_tfidf.shape[1]}")

# SVD –¥–ª—è —Å–∂–∞—Ç–∏—è (–Ω–∞–ø—Ä–∏–º–µ—Ä, –¥–æ 50 –∫–æ–º–ø–æ–Ω–µ–Ω—Ç)
svd_k_target = 50
max_svd = max(1, min(X_tfidf.shape[0] - 1, X_tfidf.shape[1] - 1))
n_svd = min(svd_k_target, max_svd)

svd = TruncatedSVD(n_components=n_svd, random_state=42)
X_svd = svd.fit_transform(X_tfidf)

for i in range(X_svd.shape[1]):
    df[f"purpose_svd_{i+1}"] = X_svd[:, i]

expl_var = svd.explained_variance_ratio_.sum()
print(f"SVD explained variance (k={X_svd.shape[1]}): {expl_var:.3f}")

# ------------------------------------------------------------
# 5) –†–µ–∑—É–ª—å—Ç–∞—Ç
# ------------------------------------------------------------
svd_cols = [c for c in df.columns if c.startswith("purpose_svd_")]
cols_show = ["purpose", "purpose_stopword_high"] + svd_cols

print(f"\n‚úÖ –î–æ–±–∞–≤–ª–µ–Ω—ã –ø—Ä–∏–∑–Ω–∞–∫–∏: {len(svd_cols)} SVD-–∫–æ–º–ø–æ–Ω–µ–Ω—Ç + –º–µ—Ç–∫–∞ —Å—Ç–æ–ø-—Å–ª–æ–≤")
print(df[cols_show].head())


TF-IDF –º–∞—Ç—Ä–∏—Ü–∞: 2688 √ó 45426
SVD explained variance (k=50): 0.545

‚úÖ –î–æ–±–∞–≤–ª–µ–Ω—ã –ø—Ä–∏–∑–Ω–∞–∫–∏: 50 SVD-–∫–æ–º–ø–æ–Ω–µ–Ω—Ç + –º–µ—Ç–∫–∞ —Å—Ç–æ–ø-—Å–ª–æ–≤
                                             purpose  purpose_stopword_high  \
0  –∫–æ–º–∏—Å—Å–∏—è –≤–Ω—É—Ç—Ä–∏ —Å–±–µ—Ä–±–∞–Ω–∫–∞ –∑–∞ –ø–ø/–ø—Ç —á–µ—Ä–µ–∑ –¥–±–æ —Å...                  False   
1  –æ–ø–ª–∞—Ç–∞ –ø–æ —Å—á–µ—Ç—É ‚Ññ 28 –æ—Ç date_9f241b636025 –ø–æ –¥...                  False   
2  –∫–æ–º–∏—Å—Å–∏—è –≤–Ω—É—Ç—Ä–∏ —Å–±–µ—Ä–±–∞–Ω–∫–∞ –∑–∞ –ø–ø/–ø—Ç —á–µ—Ä–µ–∑ –¥–±–æ —Å...                  False   
3  –æ–ø–ª–∞—Ç–∞ –ø–æ –¥–æ–≥–æ–≤–æ—Ä—É —ç–ª–µ–∫—Ç—Ä–æ—ç–Ω–µ—Ä–≥–∏—è –ø–æ –¥–æ–≥.‚Ññ1124...                  False   
4  –ø–µ—Ä–µ–≤–æ–¥ –¥–µ–Ω–µ–∂–Ω—ã—Ö —Å—Ä–µ–¥—Å—Ç–≤ –ø–æ –¥–æ–≥–æ–≤–æ—Ä—É –∑–∞–π–º–∞ ‚Ññ —á...                   True   

   purpose_svd_1  purpose_svd_2  purpose_svd_3  purpose_svd_4  purpose_svd_5  \
0       0.708416      -0.202923      -0.110215      -0.029243       0.142140   
1       0.122677   

In [4]:
df.columns

Index(['date', 'debit_account', 'debit_name', 'debit_inn', 'credit_account',
       'credit_name', 'credit_inn', 'debit_amount', 'credit_amount', 'purpose',
       ...
       'purpose_svd_41', 'purpose_svd_42', 'purpose_svd_43', 'purpose_svd_44',
       'purpose_svd_45', 'purpose_svd_46', 'purpose_svd_47', 'purpose_svd_48',
       'purpose_svd_49', 'purpose_svd_50'],
      dtype='object', length=130)

In [32]:
# ============================================================
#  –û–±–Ω–∞—Ä—É–∂–µ–Ω–∏–µ –∞–Ω–æ–º–∞–ª–∏–π –¥–ª—è –ò–ù–ù (debit+credit)
#  –ù–µ—Å–∫–æ–ª—å–∫–æ –º–æ–¥–µ–ª–µ–π sklearn: IsolationForest, LOF, OneClassSVM, EllipticEnvelope
#  + —Ñ–∏–ª—å—Ç—Ä: –∞–Ω–∞–ª–∏–∑ —Ç–æ–ª—å–∫–æ –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏
# ============================================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
import warnings
warnings.filterwarnings("ignore")

print(f"–í—Å–µ–≥–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: {len(df):,}")

# ---------- 1) –ê–≥—Ä–µ–≥–∞—Ü–∏—è –ø–æ –ò–ù–ù ----------
def agg_side(df, inn_col, prefix):
    aggs = df.groupby(inn_col).agg(
        tx_count=('date', 'size'),
        amount_sum=('amount', 'sum'),
        amount_mean=('amount', 'mean'),
        amount_std=('amount', 'std'),
        roll_cnt_30d_mean=(f'{prefix}_roll_cnt_30d','mean'),
        roll_mean_30d_mean=(f'{prefix}_roll_mean_30d','mean'),
        roll_std_30d_mean=(f'{prefix}_roll_std_30d','mean'),
        spike_ratio_7d_mean=(f'{prefix}_amount_spike_ratio_7d','mean'),
        tx_rate_spike_7d_mean=(f'{prefix}_tx_rate_spike_7d','mean'),
        vol_30d_mean=(f'{prefix}_amount_volatility_30d','mean'),
    )

    # –¥–æ–±–∞–≤–∏–º –ø—Ä–µ—Ñ–∏–∫—Å –∫–æ –≤—Å–µ–º –∫–æ–ª–æ–Ω–∫–∞–º, —á—Ç–æ–±—ã –∏–∑–±–µ–∂–∞—Ç—å –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è
    aggs = aggs.add_prefix(f"{prefix}_")
    return aggs

agg_deb = agg_side(df, 'debit_inn', 'debit')
agg_cred = agg_side(df, 'credit_inn', 'credit')
agg_all = agg_deb.join(agg_cred, how='outer').fillna(0.0)

# —Å–∞–º–æ–ø–µ—Ä–µ–≤–æ–¥—ã –∫–æ–Ω–∫—Ä–µ—Ç–Ω–æ –∏ —Ç–æ–ª—å–∫–æ –¥–ª—è –ø–µ—Ä–µ—Å–µ—á–µ–Ω–∏—è
self_tx = (df.loc[df['debit_inn'] == df['credit_inn'], 'debit_inn']
             .value_counts()
             .rename('self_tx_count'))
agg_all = agg_all.join(self_tx, how='left').fillna({'self_tx_count': 0})

# –∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–π total_tx_count –±–µ–∑ –¥–≤–æ–π–Ω–æ–≥–æ —É—á—ë—Ç–∞
agg_all['total_tx_count'] = (
    agg_all.get('debit_tx_count', 0).astype(float)
  + agg_all.get('credit_tx_count', 0).astype(float)
  - agg_all['self_tx_count'].astype(float)
)

# ---------- 1.1) –°–µ—Ç–µ–≤—ã–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ ----------
extra = df.groupby('debit_inn').agg(out_unique=('credit_inn','nunique'))
extra2 = df.groupby('credit_inn').agg(in_unique=('debit_inn','nunique'))
agg_all = agg_all.join(extra, how='left').join(extra2, how='left').fillna(0)

agg_all['fan_out_ratio'] = agg_all['out_unique'] / (agg_all['debit_tx_count']+1e-6)
agg_all['fan_in_ratio']  = agg_all['in_unique']  / (agg_all['credit_tx_count']+1e-6)
agg_all['in_out_ratio']  = (agg_all['credit_amount_sum']+1e-6)/(agg_all['debit_amount_sum']+1e-6)

# ---------- 1.2) –§–∏–ª—å—Ç—Ä: —Ç–æ–ª—å–∫–æ –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ ----------

agg_all_filtered = agg_all[agg_all["total_tx_count"] >= 20].copy()
print(f"–û—Ç–æ–±—Ä–∞–Ω–æ {len(agg_all_filtered)} –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ –∏–∑ {len(agg_all)}")

if len(agg_all_filtered) == 0:
    raise RuntimeError("‚ùå –ù–µ—Ç –ò–ù–ù —Å –¥–æ—Å—Ç–∞—Ç–æ—á–Ω—ã–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞.")

# ---------- 1.3) –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ ----------
X = agg_all_filtered.fillna(0.0).replace([np.inf,-np.inf],0.0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print(f"–ê–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–æ {len(agg_all_filtered):,} –ò–ù–ù, {X_scaled.shape[1]} —Ñ–∏—á–µ–π")

# ---------- 2) –ú–æ–¥–µ–ª–∏ ----------
models = {
    "IsolationForest": IsolationForest(n_estimators=400, contamination=0.03, random_state=42, n_jobs=-1),
    "LocalOutlierFactor": LocalOutlierFactor(n_neighbors=20, contamination=0.03, novelty=True),
    "OneClassSVM": OneClassSVM(kernel="rbf", gamma=0.05, nu=0.03),
    "EllipticEnvelope": EllipticEnvelope(contamination=0.03, random_state=42)
}

results = {}
scores_all = pd.DataFrame(index=agg_all_filtered.index)

for name, model in models.items():
    print(f"‚è≥ –û–±—É—á–∞–µ–º {name} ...")
    model.fit(X_scaled)
    if hasattr(model, "score_samples"):
        raw = -model.score_samples(X_scaled)
    else:
        raw = -model.decision_function(X_scaled)
    s = (raw - raw.min()) / (raw.max() - raw.min() + 1e-6)
    scores_all[name+"_score"] = s
    results[name] = {
        "mean": float(s.mean()),
        "std": float(s.std()),
        "top5_mean": float(np.mean(np.sort(s)[-5:])),
        "threshold_97": float(np.quantile(s,0.95))
    }
    print(f"  min={s.min():.3f} max={s.max():.3f} mean={s.mean():.3f}")

# ---------- 3) –ö–æ–º–ø–æ–∑–∏—Ç–Ω—ã–π —Å–∫–æ—Ä ----------
scores_all["graph_anomaly_score"] = scores_all.mean(axis=1)
thr = np.quantile(scores_all["graph_anomaly_score"],0.97)
scores_all["anomaly_flag"] = (scores_all["graph_anomaly_score"]>=thr).astype(int)

agg_all_filtered = pd.concat([agg_all_filtered, scores_all], axis=1)

print("\n=== –ò—Ç–æ–≥–æ–≤—ã–µ –ø–æ—Ä–æ–≥–∏ –∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ ===")
for k,v in results.items():
    print(f"{k:20s}  mean={v['mean']:.3f}  thr97={v['threshold_97']:.3f}")

print(f"\n–°–≤–æ–¥–Ω—ã–π –ø–æ—Ä–æ–≥: {thr:.3f} | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö –ò–ù–ù: {agg_all_filtered['anomaly_flag'].sum()} –∏–∑ {len(agg_all_filtered)}")

# ---------- 4) –¢–æ–ø –∞–Ω–æ–º–∞–ª—å–Ω—ã—Ö –ò–ù–ù ----------
top = agg_all_filtered.sort_values("graph_anomaly_score", ascending=False).head(20)
display(top[["graph_anomaly_score","anomaly_flag"] + [c for c in top.columns if "ratio" in c or "sum" in c or "count" in c]].head(20))

# ---------- 5) –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ ----------
agg_all_filtered.to_csv("inn_anomaly_scores_filtered.csv")
print("\n‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: inn_anomaly_scores_filtered.csv (–ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏, —Å —Ñ–ª–∞–≥–∞–º–∏ –∏ score)")


–í—Å–µ–≥–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: 2,688
–û—Ç–æ–±—Ä–∞–Ω–æ 20 –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ –∏–∑ 111
–ê–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–æ 20 –ò–ù–ù, 27 —Ñ–∏—á–µ–π
‚è≥ –û–±—É—á–∞–µ–º IsolationForest ...
  min=0.000 max=1.000 mean=0.198
‚è≥ –û–±—É—á–∞–µ–º LocalOutlierFactor ...
  min=0.000 max=0.000 mean=0.000
‚è≥ –û–±—É—á–∞–µ–º OneClassSVM ...
  min=0.000 max=1.000 mean=0.791
‚è≥ –û–±—É—á–∞–µ–º EllipticEnvelope ...
  min=0.000 max=1.000 mean=0.707

=== –ò—Ç–æ–≥–æ–≤—ã–µ –ø–æ—Ä–æ–≥–∏ –∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ ===
IsolationForest       mean=0.198  thr97=0.594
LocalOutlierFactor    mean=0.000  thr97=0.000
OneClassSVM           mean=0.791  thr97=0.997
EllipticEnvelope      mean=0.707  thr97=1.000

–°–≤–æ–¥–Ω—ã–π –ø–æ—Ä–æ–≥: 0.686 | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö –ò–ù–ù: 1 –∏–∑ 20


Unnamed: 0,graph_anomaly_score,anomaly_flag,debit_tx_count,debit_amount_sum,debit_spike_ratio_7d_mean,credit_tx_count,credit_amount_sum,credit_spike_ratio_7d_mean,self_tx_count,total_tx_count,fan_out_ratio,fan_in_ratio,in_out_ratio
d877722ca4e40f98,0.744333,1,2152.0,677167800.0,1.271407,560.0,680330700.0,1.292062,24.0,2688.0,0.045074,0.067857,1.004671
8d9e0be733f77f1c,0.642451,0,223.0,353829000.0,0.0,983.0,113293200.0,0.0,0.0,1206.0,0.004484,0.001017,0.3201919
6931e23e98703aa9,0.623952,0,0.0,0.0,0.0,256.0,393427300.0,0.0,0.0,256.0,0.0,0.003906,393427300000000.0
9d8f54ae3b88911b,0.585435,0,32.0,49848840.0,0.0,7.0,22802790.0,0.0,0.0,39.0,0.03125,0.142857,0.4574388
80799e1a6646b6dc,0.573548,0,9.0,76988370.0,0.0,172.0,3239357.0,0.0,0.0,181.0,0.111111,0.005814,0.04207593
6ef5667d7851dcbc,0.559121,0,20.0,23200000.0,0.0,1.0,3000000.0,0.0,0.0,21.0,0.05,0.999999,0.1293103
1b21a7645f64d170,0.547137,0,1.0,2357825.0,0.0,39.0,5317691.0,0.0,0.0,40.0,0.999999,0.025641,2.255337
659207c64fee1c66,0.543846,0,60.0,18200000.0,0.0,1.0,100000.0,0.0,0.0,61.0,0.016667,0.999999,0.005494505
5ae9eeb0d29bc5c0,0.478128,0,43.0,11100000.0,0.0,0.0,0.0,0.0,0.0,43.0,0.023256,0.0,9.009009e-14
eed8b1f54ed4366c,0.473275,0,0.0,0.0,0.0,82.0,1228693.0,0.0,0.0,82.0,0.0,0.012195,1228693000000.0



‚úÖ –†–µ–∑—É–ª—å—Ç–∞—Ç—ã —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã: inn_anomaly_scores_filtered.csv (–ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏, —Å —Ñ–ª–∞–≥–∞–º–∏ –∏ score)


In [45]:
# ============================================================
#  –û–±–Ω–∞—Ä—É–∂–µ–Ω–∏–µ –∞–Ω–æ–º–∞–ª–∏–π –¥–ª—è –ò–ù–ù (debit+credit)
#  –ú–æ–¥–µ–ª–∏: IsolationForest, LOF, OneClassSVM, EllipticEnvelope
#  + —Ñ–∏–ª—å—Ç—Ä: –∞–Ω–∞–ª–∏–∑ —Ç–æ–ª—å–∫–æ –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏
#  + —Ç–æ–ø-100 –ø–æ–¥–æ–∑—Ä–∏—Ç–µ–ª—å–Ω—ã—Ö –¢–†–ê–ù–ó–ê–ö–¶–ò–ô –ø–æ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏
# ============================================================

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.covariance import EllipticEnvelope
import warnings
warnings.filterwarnings("ignore")

print(f"–í—Å–µ–≥–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: {len(df):,}")

# ---------- 1) –ê–≥—Ä–µ–≥–∞—Ü–∏—è –ø–æ –ò–ù–ù ----------
def agg_side(df, inn_col, prefix):
    aggs = df.groupby(inn_col).agg(
        tx_count=('date', 'size'),
        amount_sum=('amount', 'sum'),
        amount_mean=('amount', 'mean'),
        amount_std=('amount', 'std'),
        roll_cnt_30d_mean=(f'{prefix}_roll_cnt_30d','mean'),
        roll_mean_30d_mean=(f'{prefix}_roll_mean_30d','mean'),
        roll_std_30d_mean=(f'{prefix}_roll_std_30d','mean'),
        spike_ratio_7d_mean=(f'{prefix}_amount_spike_ratio_7d','mean'),
        tx_rate_spike_7d_mean=(f'{prefix}_tx_rate_spike_7d','mean'),
        vol_30d_mean=(f'{prefix}_amount_volatility_30d','mean'),
    )
    aggs = aggs.add_prefix(f"{prefix}_")
    return aggs

agg_deb = agg_side(df, 'debit_inn', 'debit')
agg_cred = agg_side(df, 'credit_inn', 'credit')
agg_all = agg_deb.join(agg_cred, how='outer').fillna(0.0)

# —Å–∞–º–æ–ø–µ—Ä–µ–≤–æ–¥—ã
self_tx = (df.loc[df['debit_inn'] == df['credit_inn'], 'debit_inn']
             .value_counts()
             .rename('self_tx_count'))
agg_all = agg_all.join(self_tx, how='left').fillna({'self_tx_count': 0})

# –∫–æ—Ä—Ä–µ–∫—Ç–Ω—ã–π total_tx_count –±–µ–∑ –¥–≤–æ–π–Ω–æ–≥–æ —É—á—ë—Ç–∞
agg_all['total_tx_count'] = (
    agg_all.get('debit_tx_count', 0).astype(float)
  + agg_all.get('credit_tx_count', 0).astype(float)
  - agg_all['self_tx_count'].astype(float)
)

# ---------- 1.1) –°–µ—Ç–µ–≤—ã–µ –ø–æ–∫–∞–∑–∞—Ç–µ–ª–∏ ----------
extra  = df.groupby('debit_inn').agg(out_unique=('credit_inn','nunique'))
extra2 = df.groupby('credit_inn').agg(in_unique=('debit_inn','nunique'))
agg_all = agg_all.join(extra, how='left').join(extra2, how='left').fillna(0)

agg_all['fan_out_ratio'] = agg_all['out_unique'] / (agg_all['debit_tx_count']+1e-6)
agg_all['fan_in_ratio']  = agg_all['in_unique']  / (agg_all['credit_tx_count']+1e-6)
agg_all['in_out_ratio']  = (agg_all['credit_amount_sum']+1e-6)/(agg_all['debit_amount_sum']+1e-6)

# ---------- 1.2) –§–∏–ª—å—Ç—Ä: —Ç–æ–ª—å–∫–æ –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ ----------
agg_all_filtered = agg_all[agg_all["total_tx_count"] >= 20].copy()
print(f"–û—Ç–æ–±—Ä–∞–Ω–æ {len(agg_all_filtered)} –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ –∏–∑ {len(agg_all)}")
if len(agg_all_filtered) == 0:
    raise RuntimeError("‚ùå –ù–µ—Ç –ò–ù–ù —Å –¥–æ—Å—Ç–∞—Ç–æ—á–Ω—ã–º –∫–æ–ª–∏—á–µ—Å—Ç–≤–æ–º —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞.")

# ---------- 1.3) –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ ----------
X = agg_all_filtered.replace([np.inf,-np.inf],0.0).fillna(0.0)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"–ê–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–æ {len(agg_all_filtered):,} –ò–ù–ù, {X_scaled.shape[1]} —Ñ–∏—á–µ–π")

# ---------- 2) –ú–æ–¥–µ–ª–∏ ----------
models = {
    "IsolationForest": IsolationForest(n_estimators=400, contamination=0.03, random_state=42, n_jobs=-1),
    "LocalOutlierFactor": LocalOutlierFactor(n_neighbors=20, contamination=0.03, novelty=True),
    "OneClassSVM": OneClassSVM(kernel="rbf", gamma=0.05, nu=0.03),
    "EllipticEnvelope": EllipticEnvelope(contamination=0.03, random_state=42)
}

results = {}
scores_all = pd.DataFrame(index=agg_all_filtered.index)

for name, model in models.items():
    print(f"‚è≥ –û–±—É—á–∞–µ–º {name} ...")
    model.fit(X_scaled)
    # –ø—Ä–∏–≤–æ–¥–∏–º –∫ —à–∫–∞–ª–µ: —á–µ–º –±–æ–ª—å—à–µ, —Ç–µ–º –∞–Ω–æ–º–∞–ª—å–Ω–µ–µ
    if hasattr(model, "score_samples"):
        raw = -model.score_samples(X_scaled)
    else:
        raw = -model.decision_function(X_scaled)
    raw = np.asarray(raw, dtype=float)
    s = (raw - raw.min()) / (raw.max() - raw.min() + 1e-6)
    scores_all[name+"_score"] = s
    results[name] = {
        "mean": float(s.mean()),
        "std": float(s.std()),
        "top5_mean": float(np.mean(np.sort(s)[-5:])),
        "threshold_97": float(np.quantile(s,0.97))
    }
    print(f"  min={s.min():.3f} max={s.max():.3f} mean={s.mean():.3f}")

# ---------- 3) –ö–æ–º–ø–æ–∑–∏—Ç–Ω—ã–π —Å–∫–æ—Ä –∏ –≤—ã–±–æ—Ä –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ ----------
scores_all["ensemble_score"] = scores_all.mean(axis=1)
thr = np.quantile(scores_all["ensemble_score"],0.97)
scores_all["anomaly_flag"] = (scores_all["ensemble_score"]>=thr).astype(int)

agg_all_filtered = pd.concat([agg_all_filtered, scores_all], axis=1)

print("\n=== –ò—Ç–æ–≥–æ–≤—ã–µ –ø–æ—Ä–æ–≥–∏ –∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ ===")
for k,v in results.items():
    print(f"{k:20s}  mean={v['mean']:.3f}  thr97={v['threshold_97']:.3f}")
print(f"\n–°–≤–æ–¥–Ω—ã–π –ø–æ—Ä–æ–≥ (–∞–Ω—Å–∞–º–±–ª—å, p97): {thr:.3f} | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö –ò–ù–ù: {agg_all_filtered['anomaly_flag'].sum()} –∏–∑ {len(agg_all_filtered)}")

# –ª—É—á—à–∞—è –º–æ–¥–µ–ª—å = –º–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è –∫–æ—Ä—Ä–µ–ª—è—Ü–∏—è —Å –∞–Ω—Å–∞–º–±–ª–µ–º (—É—Å—Ç–æ–π—á–∏–≤–æ—Å—Ç—å)
corr_with_ens = scores_all.drop(columns=["ensemble_score","anomaly_flag"]).corrwith(scores_all["ensemble_score"])
best_model = corr_with_ens.idxmax() + "_score" if not corr_with_ens.empty else "IsolationForest_score"
if best_model not in scores_all.columns:
    best_model = "IsolationForest_score"
print(f"\nüéØ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –¥–ª—è –º–∞–ø–ø–∏–Ω–≥–∞ –Ω–∞ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏: {best_model}")

# ---------- 4) –§–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–∏–µ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–æ–Ω–Ω–æ–≥–æ —Å–∫–æ—Ä–∞ (–≥–∏–±—Ä–∏–¥) ----------
# –º–∞–ø–ø–∏–Ω–≥ —Å–∫–æ—Ä-–∞ –ª—É—á—à–µ–π –º–æ–¥–µ–ª–∏ –ø–æ —É—á–∞—Å—Ç–Ω–∏–∫–∞–º
df = df.copy()
df["debit_score"]  = df["debit_inn"].map(agg_all_filtered[best_model])
df["credit_score"] = df["credit_inn"].map(agg_all_filtered[best_model])

# –µ—Å–ª–∏ –Ω–µ—Ç —Ç–µ–∫—Å—Ç–æ–≤–æ–π –º–µ—Ç—Ä–∏–∫–∏ ‚Äî —Å–æ–∑–¥–∞–¥–∏–º –∑–∞–≥–ª—É—à–∫—É
if "purpose_risk_score" not in df.columns:
    df["purpose_risk_score"] = 0.0

# –Ω–æ—Ä–º–∏—Ä–æ–≤–∞–Ω–Ω–∞—è —Å—É–º–º–∞ (–ª–æ–≥), —á—Ç–æ–±—ã —Ä–∞–∑–≥–ª–∞–¥–∏—Ç—å –º–∞—Å—à—Ç–∞–±
amount_norm = np.log1p(df["amount"].clip(lower=0)) / (np.log1p(df["amount"].clip(lower=0)).max() + 1e-9)

# –ª–æ–∫–∞–ª—å–Ω—ã–π —Ä–∞–Ω–≥ —Å—É–º–º—ã –≤–Ω—É—Ç—Ä–∏ –æ—Ç–ø—Ä–∞–≤–∏—Ç–µ–ª—è (–∫–æ–Ω—Ç—Ä–∞—Å—Ç –≤–Ω—É—Ç—Ä–∏ –ò–ù–ù)
df["local_amount_rank"] = df.groupby("debit_inn")["amount"].rank(pct=True)

# –≥–∏–±—Ä–∏–¥–Ω—ã–π —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–æ–Ω–Ω—ã–π —Å–∫–æ—Ä:
#   70% ‚Äî —Ä–∏—Å–∫ —É—á–∞—Å—Ç–Ω–∏–∫–∞ (–º–∞–∫—Å –¥–µ–±–µ—Ç/–∫—Ä–µ–¥–∏—Ç),
#   20% ‚Äî —Ä–∏—Å–∫ —Ç–µ–∫—Å—Ç–∞,
#   10% ‚Äî –æ–±—ä—ë–º/–ª–æ–∫–∞–ª—å–Ω—ã–π –∫–æ–Ω—Ç—Ä–∞—Å—Ç.
df["txn_risk_score"] = (
    0.7 * df[["debit_score", "credit_score"]].max(axis=1).fillna(0) +
    0.2 * df["purpose_risk_score"].fillna(0) +
    0.1 * (0.5*amount_norm.fillna(0) + 0.5*df["local_amount_rank"].fillna(0))
)

df["txn_risk_score"] = (df["txn_risk_score"] - df["txn_risk_score"].min()) / (
    df["txn_risk_score"].max() - df["txn_risk_score"].min() + 1e-9
)

# —Ñ–ª–∞–≥ top-3% –ø–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º
thr_txn = np.quantile(df["txn_risk_score"].dropna(), 0.97)
df["txn_anomaly_flag"] = (df["txn_risk_score"] >= thr_txn).astype(int)
print(f"\n–ü–æ—Ä–æ–≥ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π (p97): {thr_txn:.3f} | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: {df['txn_anomaly_flag'].sum()} –∏–∑ {len(df)}")

# ---------- 5) –¢–æ–ø-100 –ø–æ–¥–æ–∑—Ä–∏—Ç–µ–ª—å–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π ----------
cols_show = [
    "date","debit_inn","credit_inn","amount","purpose",
    "purpose_risk_score","debit_score","credit_score",
    "txn_risk_score","txn_anomaly_flag"
]
cols_show = [c for c in cols_show if c in df.columns]

top_n = 2000
top_txn = df.sort_values("txn_risk_score", ascending=False).head(top_n).copy()
print(f"\n=== –¢–æ–ø-{top_n} –ø–æ–¥–æ–∑—Ä–∏—Ç–µ–ª—å–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π ({best_model}) ===")
print(top_txn[cols_show].head(20))  # –≤ –≤—ã–≤–æ–¥ ‚Äî –ø–µ—Ä–≤—ã–µ 20 —Å—Ç—Ä–æ–∫ –¥–ª—è –∫—Ä–∞—Ç–∫–æ—Å—Ç–∏

# —Å–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∞–π–ª–æ–≤
agg_all_filtered.to_csv("inn_anomaly_scores_filtered.csv", index=False, encoding="utf-8-sig")
top_txn[cols_show].to_csv(f"top_{top_n}_anomalous_transactions_{best_model}.csv", index=False, encoding="utf-8-sig")

print("\n‚úÖ –°–æ—Ö—Ä–∞–Ω–µ–Ω–æ:")
print("  ‚Ä¢ inn_anomaly_scores_filtered.csv  ‚Äî –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ (—Å–∫–æ—Ä—ã, —Ñ–ª–∞–≥–∏)")
print(f"  ‚Ä¢ top_{top_n}_anomalous_transactions_{best_model}.csv  ‚Äî —Ç–æ–ø-{top_n} –ø–æ–¥–æ–∑—Ä–∏—Ç–µ–ª—å–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")


–í—Å–µ–≥–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: 2,688
–û—Ç–æ–±—Ä–∞–Ω–æ 20 –ò–ù–ù —Å >=20 —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è–º–∏ –∏–∑ 111
–ê–≥—Ä–µ–≥–∏—Ä–æ–≤–∞–Ω–æ 20 –ò–ù–ù, 27 —Ñ–∏—á–µ–π
‚è≥ –û–±—É—á–∞–µ–º IsolationForest ...
  min=0.000 max=1.000 mean=0.198
‚è≥ –û–±—É—á–∞–µ–º LocalOutlierFactor ...
  min=0.000 max=0.000 mean=0.000
‚è≥ –û–±—É—á–∞–µ–º OneClassSVM ...
  min=0.000 max=1.000 mean=0.791
‚è≥ –û–±—É—á–∞–µ–º EllipticEnvelope ...
  min=0.000 max=1.000 mean=0.707

=== –ò—Ç–æ–≥–æ–≤—ã–µ –ø–æ—Ä–æ–≥–∏ –∏ —Å—Ç–∞—Ç–∏—Å—Ç–∏–∫–∞ ===
IsolationForest       mean=0.198  thr97=0.756
LocalOutlierFactor    mean=0.000  thr97=0.000
OneClassSVM           mean=0.791  thr97=0.998
EllipticEnvelope      mean=0.707  thr97=1.000

–°–≤–æ–¥–Ω—ã–π –ø–æ—Ä–æ–≥ (–∞–Ω—Å–∞–º–±–ª—å, p97): 0.686 | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö –ò–ù–ù: 1 –∏–∑ 20

üéØ –õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –¥–ª—è –º–∞–ø–ø–∏–Ω–≥–∞ –Ω–∞ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏: IsolationForest_score

–ü–æ—Ä–æ–≥ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π (p97): 0.881 | –ê–Ω–æ–º–∞–ª—å–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: 81 –∏–∑ 2688

In [47]:
df['debit_score'].value_counts()

debit_score
0.999998    2152
0.572508     223
0.183957      60
0.136701      43
0.351055      32
0.075095      24
0.143359      23
0.236536      20
0.316077       9
0.045881       8
0.053857       7
0.197020       1
Name: count, dtype: int64