In [None]:
# Create the project skeleton: the source package folder, the two per-client input folders, and the output folder.
%mkdir -p /content/hackathon_case_hybrid/src
%mkdir -p /content/hackathon_case_hybrid/data/transactions_per_client
%mkdir -p /content/hackathon_case_hybrid/data/transfers_per_client
%mkdir -p /content/hackathon_case_hybrid/output

In [None]:
# 1. Устанавливаем библиотеки
!pip -q install lightgbm xgboost scikit-learn pandas numpy

In [None]:
# 3. Make `src` an importable package
!touch /content/hackathon_case_hybrid/src/__init__.py

In [None]:
%%writefile /content/hackathon_case_hybrid/src/make_dataset.py
# -*- coding: utf-8 -*-
import re, glob, os
import pandas as pd

def infer_client_code(path: str) -> int:
    """Extract a numeric client code from the base name of ``path``.

    A ``client_<digits>`` token takes precedence; failing that, the first run
    of digits in the file name is used. Returns -1 when the name has no digits.
    """
    filename = os.path.basename(path)
    for pattern in (r'client_(\d+)', r'(\d+)'):
        found = re.search(pattern, filename)
        if found is not None:
            return int(found.group(1))
    return -1

def concat_folder(folder: str, kind: str) -> pd.DataFrame:
    """Read every ``*.csv`` file in ``folder`` (sorted by path) and stack them.

    Files without a ``client_code`` column get one derived from the file name.
    Returns an empty DataFrame when the folder holds no CSV files. ``kind`` is
    only used to label the warning printed when expected columns are absent.
    """
    csv_paths = sorted(glob.glob(os.path.join(folder, "*.csv")))

    def _load(csv_path):
        frame = pd.read_csv(csv_path)
        if 'client_code' not in frame.columns:
            frame['client_code'] = infer_client_code(csv_path)
        return frame

    frames = [_load(csv_path) for csv_path in csv_paths]
    if len(frames) == 0:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    absent = {'date','amount','currency','client_code'} - set(combined.columns)
    if absent:
        # Informational only: the combined frame is still returned
        print(f"[WARN] {kind}: не найдены колонки {absent} — это ок, если они не требуются")
    return combined

def build_all(data_dir="/content/hackathon_case_hybrid/data"):
    """Merge the per-client CSV folders into ``transactions.csv`` and ``transfers.csv``.

    Both folders under ``data_dir`` are read first, then both merged files are
    written into ``data_dir``, and finally the row counts are printed.
    """
    merged = {
        kind: concat_folder(os.path.join(data_dir, f"{kind}_per_client"), kind)
        for kind in ('transactions', 'transfers')
    }
    for kind, frame in merged.items():
        frame.to_csv(os.path.join(data_dir, f"{kind}.csv"), index=False)
    print(f"transactions.csv: {len(merged['transactions']):,} rows")
    print(f"transfers.csv:    {len(merged['transfers']):,} rows")

if __name__ == "__main__":
    build_all()

Overwriting /content/hackathon_case_hybrid/src/make_dataset.py


In [None]:
# Run the merge step (only if needed)
!python /content/hackathon_case_hybrid/src/make_dataset.py

transactions.csv: 18,000 rows
transfers.csv:    18,000 rows


In [None]:
%%writefile /content/hackathon_case_hybrid/src/config.py
# -*- coding: utf-8 -*-

# ML
TRAIN_ML = True
USE_ML_IN_HYBRID = True
WEIGHT_RULES = 0.7
WEIGHT_ML = 0.3

# Need triggers (for the cash loan — stricter variant)
CREDIT_OUTFLOW_INFLOW_RATIO = 2.5    # outflow >= 2.5 × inflow
CREDIT_NEARZERO_BAL_MAX     = 15_000 # "near-zero" account balance
CREDIT_VERYLOW_BAL_MAX      = 30_000 # very low balance
CREDIT_LOAN_FREQ_MIN        = 6      # ≥ 6 loan payments over 3 months


# Cards
PREMIUM_CB_LIMIT = 100_000.0
PREMIUM_TIER_THRESHOLDS = [1_000_000, 6_000_000]
PREMIUM_TIER_CBs = [0.02, 0.03, 0.04]

TRAVEL_CB = 0.04
PREMIUM_EXTRA_CB_CATS = ['Ювелирные украшения','Косметика и Парфюмерия','Кафе и рестораны']
PREMIUM_EXTRA_CB = 0.04

CC_TOP3_CB = 0.10
CC_ONLINE_CB_CATS = ['Едим дома','Смотрим дома','Играем дома']
CC_ONLINE_CB = 0.10

# Deposits (annual rates)
RATE_SAVINGS = 0.165
RATE_GOAL    = 0.155
RATE_MULTI   = 0.145

# Currency exchange (savings on the spread)
FX_SPREAD_SAVING = 0.004  # 0.4% × (average monthly FX volume)

# Need triggers (credit)
NEED_GAP_RATIO = 1.9
LOW_BALANCE_KZT = 100_000

TRAVEL_SHARE_MIN = 0.15
TRAVEL_TXN_MIN = 6

# Gold — utility thresholds
GOLD_T1_BAL = 100_000_000  # 100M ₸ → 5,000 ₸/month
GOLD_T2_BAL = 200_000_000  # 200M ₸ → 20,000 ₸/month
GOLD_T1_UTILITY = 5_000
GOLD_T2_UTILITY = 20_000

# Push linter
PUSH_MIN_LEN = 180
PUSH_MAX_LEN = 220
CTA_WORDS = ['Открыть','Настроить','Посмотреть','Оформить','Подключить']

Overwriting /content/hackathon_case_hybrid/src/config.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/features.py
# -*- coding: utf-8 -*-
import pandas as pd
from .utils import safe_div

TRAVEL_CATS = ['Путешествия','Такси','Отели','Поезда','Самолёты']
PREMIUM_EXTRA = ['Ювелирные украшения','Косметика и Парфюмерия','Кафе и рестораны']
ONLINE_CATS = ['Едим дома','Смотрим дома','Играем дома']

def series_get(df, col, default_val):
    """Return ``df[col]`` when the column exists, else a constant Series.

    The fallback Series is filled with ``default_val`` and shares ``df``'s index.
    """
    if col in df.columns:
        return df[col]
    return pd.Series(default_val, index=df.index)

def build_features(df_clients, df_txn, df_trf):
    """Aggregate transactions and transfers into one feature row per client.

    Parameters
    ----------
    df_clients : DataFrame with at least ``client_code``, ``name``, ``status``,
        ``age``, ``city`` and ``avg_monthly_balance_KZT``.
    df_txn : DataFrame with ``client_code``, ``category`` and ``amount``.
    df_trf : DataFrame with ``client_code``, ``type`` and ``amount``.

    Returns
    -------
    DataFrame with one row per ``client_code`` that appears in ``df_txn``
    (clients without transactions are not emitted), containing spend,
    transfer and FX aggregates, the client attributes (left-joined), ``*_m``
    columns equal to the ``*_3m`` columns divided by 3, and ``travel_share``.

    NOTE(review): the ``_3m`` suffix suggests the inputs cover three months;
    this function does not filter by date — confirm against the data.
    """
    # Spend matrix: rows = clients, columns = categories, values = summed amount
    spend_by_cat = df_txn.groupby(['client_code','category'])['amount'].sum().unstack(fill_value=0.0)
    spend_total = spend_by_cat.sum(axis=1)

    # Category-group totals; categories absent from the data contribute 0
    travel_spend = spend_by_cat.reindex(columns=TRAVEL_CATS, fill_value=0.0).sum(axis=1)
    premium_extra_spend = spend_by_cat.reindex(columns=PREMIUM_EXTRA, fill_value=0.0).sum(axis=1)
    online_spend = spend_by_cat.reindex(columns=ONLINE_CATS, fill_value=0.0).sum(axis=1)

    # Transfer counts and summed amounts per (client, transfer type)
    freq = df_trf.groupby(['client_code','type']).size().unstack(fill_value=0)
    amount_trf = df_trf.groupby(['client_code','type'])['amount'].sum().unstack(fill_value=0.0)

    # The output row set is defined by the clients present in the transactions
    idx = spend_by_cat.index

    # Sum the listed incoming / outgoing transfer types; missing types count as 0
    inflow = sum(series_get(amount_trf, c, 0.0) for c in ['salary_in','stipend_in','family_in','refund_in','cashback_in','invest_in','deposit_fx_withdraw_in'])
    outflow = sum(series_get(amount_trf, c, 0.0) for c in ['p2p_out','card_out','atm_withdrawal','utilities_out','loan_payment_out','cc_repayment_out','installment_payment_out','invest_out','deposit_topup_out','gold_buy_out'])

    # FX volume uses absolute amounts, so the sign of the stored amount does not matter
    fx_buy_amt = series_get(amount_trf, 'fx_buy', 0.0).abs()
    fx_sell_amt = series_get(amount_trf, 'fx_sell', 0.0).abs()
    fx_volume = fx_buy_amt + fx_sell_amt

    fx_buy_freq = series_get(freq, 'fx_buy', 0).astype(int)
    fx_sell_freq = series_get(freq, 'fx_sell', 0).astype(int)
    fx_freq = fx_buy_freq + fx_sell_freq

    atm_withdrawals = series_get(freq, 'atm_withdrawal', 0).astype(int)
    loan_payments = series_get(freq, 'loan_payment_out', 0).astype(int)
    cc_repayments = series_get(freq, 'cc_repayment_out', 0).astype(int)
    installments = series_get(freq, 'installment_payment_out', 0).astype(int)

    # Names of the three largest spend categories per client (zero-spend categories can appear)
    top3 = spend_by_cat.apply(lambda s: list(s.sort_values(ascending=False).head(3).index), axis=1)

    # Transfer-based series are indexed by the transfer clients, so they are
    # re-aligned to `idx`; clients without transfers get 0
    df_feat = pd.DataFrame({
        'client_code': idx,
        'spend_total_3m': spend_total.values,
        'travel_spend_3m': travel_spend.values,
        'premium_extra_spend_3m': premium_extra_spend.values,
        'online_spend_3m': online_spend.values,
        'fx_volume_3m': fx_volume.reindex(idx, fill_value=0.0).values,
        'fx_freq_3m': fx_freq.reindex(idx, fill_value=0).values,
        'atm_withdrawals_3m': atm_withdrawals.reindex(idx, fill_value=0).values,
        'loan_payments_3m': loan_payments.reindex(idx, fill_value=0).values,
        'cc_repayments_3m': cc_repayments.reindex(idx, fill_value=0).values,
        'installments_3m': installments.reindex(idx, fill_value=0).values,
        'inflow_3m': inflow.reindex(idx, fill_value=0.0).values,
        'outflow_3m': outflow.reindex(idx, fill_value=0.0).values,
        'top3_cats': top3.values,
    }).merge(df_clients[['client_code','name','status','age','city','avg_monthly_balance_KZT']], on='client_code', how='left')

    # Derived "_m" columns: the "_3m" totals divided by 3
    for col in ['spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m','fx_volume_3m','inflow_3m','outflow_3m']:
        df_feat[col.replace('_3m','_m')] = df_feat[col] / 3.0

    # A zero total is replaced by 1 to avoid division by zero (the share is then 0)
    df_feat['travel_share'] = df_feat['travel_spend_3m'] / df_feat['spend_total_3m'].replace(0,1)
    return df_feat

Overwriting /content/hackathon_case_hybrid/src/features.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/utils.py
# -*- coding: utf-8 -*-
import re
from datetime import datetime

def parse_date(s):
    """Parse ``s`` with ``datetime.fromisoformat``; return None on any failure."""
    parsed = None
    try:
        parsed = datetime.fromisoformat(s)
    except Exception:
        pass
    return parsed

def month_name_ru(dt):
    """Return the Russian month name (as used after "в") for ``dt``'s month.

    When ``dt`` is None the current date is used.
    """
    when = datetime.now() if dt is None else dt
    names = (
        'январе', 'феврале', 'марте', 'апреле', 'мае', 'июне',
        'июле', 'августе', 'сентябре', 'октябре', 'ноябре', 'декабре',
    )
    return names[when.month - 1]

def format_currency_kzt(x):
    """Format a number as a tenge amount, e.g. ``1 234 567 ₸`` or ``1 234,50 ₸``.

    Whole values are printed without decimals; other values get two decimals
    with a comma as the decimal mark. Thousands are separated by spaces.

    Returns '' for None and for non-finite values (NaN, ±inf), and ``str(x)``
    when ``x`` cannot be converted to a float.
    """
    import math  # local import: the module header does not import math

    if x is None:
        return ''
    try:
        val = float(x)
    except (TypeError, ValueError, OverflowError):
        return str(x)
    # int(NaN) / int(inf) would raise below, so missing values are treated like None
    if not math.isfinite(val):
        return ''
    if abs(val - int(val)) < 1e-9:
        s = f"{int(val):,}".replace(',', ' ')
        return f"{s} ₸"
    s = f"{val:,.2f}".replace(',', ' ')
    s = s.replace('.', ',')
    return f"{s} ₸"

def clamp(x, lo, hi):
    """Limit ``x`` to the range [``lo``, ``hi``] (the upper bound is applied first, then the lower)."""
    capped = min(hi, x)
    return max(lo, capped)

def safe_div(a, b, default=0.0):
    """Return ``a / b``, or ``default`` when ``b`` is falsy or the division fails.

    Only ordinary exceptions are absorbed; KeyboardInterrupt and SystemExit
    propagate (a bare ``except:`` would have swallowed them too).
    """
    try:
        return a / b if b else default
    except Exception:
        return default

def only_one_exclamation(text):
    """Tone check: at most one '!' and at most one all-caps word.

    A "word" is a run of 3+ word characters; purely numeric tokens are ignored.
    Returns True when both limits are respected.
    """
    if text.count('!') >= 2:
        return False
    shouting = 0
    for token in re.findall(r"\b[\wЁА-Я]{3,}\b", text):
        if token.isdigit():
            continue
        if token == token.upper():
            shouting += 1
    return shouting < 2

Overwriting /content/hackathon_case_hybrid/src/utils.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/rules_scoring.py
# -*- coding: utf-8 -*-
import pandas as pd
from .config import (
    PREMIUM_CB_LIMIT, PREMIUM_TIER_THRESHOLDS, PREMIUM_TIER_CBs,
    TRAVEL_CB, PREMIUM_EXTRA_CB_CATS, PREMIUM_EXTRA_CB,
    CC_TOP3_CB, CC_ONLINE_CB_CATS, CC_ONLINE_CB,
    RATE_SAVINGS, RATE_GOAL, RATE_MULTI, FX_SPREAD_SAVING,
    NEED_GAP_RATIO, LOW_BALANCE_KZT, TRAVEL_SHARE_MIN,
    GOLD_T1_BAL, GOLD_T2_BAL, GOLD_T1_UTILITY, GOLD_T2_UTILITY,
    CREDIT_OUTFLOW_INFLOW_RATIO, CREDIT_NEARZERO_BAL_MAX,
    CREDIT_VERYLOW_BAL_MAX, CREDIT_LOAN_FREQ_MIN )

# Monetary ranking — the cash loan is excluded (it is handled by a separate flag)
PRODUCTS = [
    'Карта для путешествий',
    'Премиальная карта',
    'Кредитная карта',
    'Обмен валют',
    'Депозит Сберегательный',
    'Депозит Накопительный',
    'Депозит Мультивалютный',
    'Инвестиции',
    'Золотые слитки'
]

def tier_cashback(avg_balance):
    """Pick the premium-card cashback rate for an average balance.

    A missing balance, or one below the first threshold, gets the first rate;
    a balance below the second threshold gets the second rate; anything else
    gets the third rate.
    """
    if pd.isna(avg_balance) or avg_balance < PREMIUM_TIER_THRESHOLDS[0]:
        tier = 0
    elif avg_balance < PREMIUM_TIER_THRESHOLDS[1]:
        tier = 1
    else:
        tier = 2
    return PREMIUM_TIER_CBs[tier]

def benefit_travel(r):
    """Monthly travel-card cashback estimate: ``TRAVEL_CB`` × (``travel_spend_3m`` / 3)."""
    monthly_travel_spend = r.get('travel_spend_3m', 0.0) / 3.0
    # No cap is applied here; a dedicated limit could be introduced if needed
    cashback = TRAVEL_CB * monthly_travel_spend
    return cashback

def benefit_premium(r):
    """Monthly premium-card benefit estimate.

    Sum of: tiered cashback on total spend (capped by ``PREMIUM_CB_LIMIT``),
    ``PREMIUM_EXTRA_CB`` on the extra-category spend, and an estimate of saved
    ATM fees (1% of withdrawals assumed at 30 000 each, capped at 3 000 000).
    """
    def per_month(key, fallback):
        # The stored values are three-period totals; scale them to one month
        return r.get(key, fallback) / 3.0

    rate = tier_cashback(r.get('avg_monthly_balance_KZT', 0.0))
    capped_base = min(PREMIUM_CB_LIMIT, rate * per_month('spend_total_3m', 0.0))
    category_bonus = PREMIUM_EXTRA_CB * per_month('premium_extra_spend_3m', 0.0)
    # Illustrative estimate of the fee saving
    atm_fee_saving = 0.01 * min(3_000_000.0, per_month('atm_withdrawals_3m', 0) * 30_000.0)
    return capped_base + category_bonus + atm_fee_saving

def benefit_cc(r):
    """Monthly credit-card cashback estimate.

    The top-3 categories are approximated as 60% of the monthly total spend;
    online-category spend earns ``CC_ONLINE_CB`` on top.
    """
    monthly_total = r.get('spend_total_3m', 0.0) / 3.0
    monthly_online = r.get('online_spend_3m', 0.0) / 3.0
    top3_part = CC_TOP3_CB * (0.6 * monthly_total)
    online_part = CC_ONLINE_CB * monthly_online
    return top3_part + online_part

def benefit_fx(r):
    """Monthly FX benefit estimate: ``FX_SPREAD_SAVING`` × (``fx_volume_3m`` / 3)."""
    monthly_volume = r.get('fx_volume_3m', 0.0) / 3.0
    saving = FX_SPREAD_SAVING * monthly_volume
    return saving

# ---------- CASH LOAN: a separate need flag, NOT a monetary benefit ----------
def utility_credit_cash(r):
    """Return 1.0 when the client row shows a strong cash-loan need, else 0.0.

    The flag requires outflow >= ``CREDIT_OUTFLOW_INFLOW_RATIO`` × inflow
    (inflow floored at 1.0) AND either a near-zero balance, or a very low
    balance combined with at least ``CREDIT_LOAN_FREQ_MIN`` loan payments.
    A NaN balance fails both balance comparisons, so the flag stays 0.0.
    """
    infl = float(r.get('inflow_3m', 0.0))
    out  = float(r.get('outflow_3m', 0.0))
    avgb = float(r.get('avg_monthly_balance_KZT', 0.0))
    loan_freq = int(r.get('loan_payments_3m', 0))

    big_gap = out >= CREDIT_OUTFLOW_INFLOW_RATIO * max(infl, 1.0)
    near_zero_bal = avgb <= CREDIT_NEARZERO_BAL_MAX
    very_low_bal  = avgb <= CREDIT_VERYLOW_BAL_MAX

    return 1.0 if (big_gap and (near_zero_bal or (very_low_bal and loan_freq >= CREDIT_LOAN_FREQ_MIN))) else 0.0


def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Compute the cash-loan need flag for every client row.

    Returns a DataFrame with columns ``client_code`` and ``need_credit``
    (bool), one row per row of ``df_feat``. An empty input yields an empty
    frame with the same two columns.
    """
    flags = [bool(utility_credit_cash(row)) for _, row in df_feat.iterrows()]
    return pd.DataFrame({
        'client_code': df_feat['client_code'].values,
        'need_credit': flags,
    })
# ---------------------------------------------------------------

def benefit_deposits(r):
    """Monthly interest estimates for the three deposits.

    Returns ``(savings, goal, multi)``, each being annual rate / 12 × average
    balance; the multi-currency figure is additionally scaled by 0.4 (only a
    notional part of the balance is assumed to go there).
    """
    balance = r.get('avg_monthly_balance_KZT', 0.0)

    def monthly_interest(annual_rate):
        return (annual_rate / 12.0) * balance

    return (
        monthly_interest(RATE_SAVINGS),
        monthly_interest(RATE_GOAL),
        monthly_interest(RATE_MULTI) * 0.4,
    )

# Investments — 0 (per the spec, no return is promised)
def benefit_investments(r):
    """Return a constant 0.0 benefit for the investments product; ``r`` is not used."""
    return 0.0

# Gold — utility driven by balance thresholds
def benefit_gold(r):
    """Return the gold utility for the client's average balance.

    The higher threshold is checked first; a balance below both thresholds
    yields 0.0.
    """
    balance = r.get('avg_monthly_balance_KZT', 0.0)
    tiers = (
        (GOLD_T2_BAL, GOLD_T2_UTILITY),
        (GOLD_T1_BAL, GOLD_T1_UTILITY),
    )
    for threshold, utility in tiers:
        if balance >= threshold:
            return float(utility)
    return 0.0

def score_all_products(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Score every product for every client row.

    Returns a long-format DataFrame with columns ``client_code``, ``product``
    and ``rules_score`` (nine rows per client, in a fixed product order). The
    travel-card score is boosted by 10% when ``travel_share`` reaches
    ``TRAVEL_SHARE_MIN``.
    """
    records = []
    for _, client in df_feat.iterrows():
        travel = benefit_travel(client)
        premium = benefit_premium(client)
        credit_card = benefit_cc(client)
        fx = benefit_fx(client)
        dep_savings, dep_goal, dep_multi = benefit_deposits(client)
        investments = benefit_investments(client)
        gold = benefit_gold(client)

        if client.get('travel_share', 0.0) >= TRAVEL_SHARE_MIN:
            travel *= 1.1

        per_product = {
            'Карта для путешествий': travel,
            'Премиальная карта': premium,
            'Кредитная карта': credit_card,
            'Обмен валют': fx,
            'Депозит Сберегательный': dep_savings,
            'Депозит Накопительный': dep_goal,
            'Депозит Мультивалютный': dep_multi,
            'Инвестиции': investments,
            'Золотые слитки': gold,
        }
        code = client['client_code']
        records.extend(
            {'client_code': code, 'product': name, 'rules_score': float(value)}
            for name, value in per_product.items()
        )
    return pd.DataFrame(records)

Overwriting /content/hackathon_case_hybrid/src/rules_scoring.py


In [None]:
import os
from getpass import getpass

# SECURITY: an API key must never be stored in the notebook source. The key that
# used to be hard-coded in this cell has been exposed and should be revoked and
# rotated. The key is now taken from the environment, with an interactive,
# non-echoing prompt as a fallback.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

In [None]:
%%writefile /content/hackathon_case_hybrid/src/ml_classifier.py
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import Dict, List

FEATURE_COLS = [
    'spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m',
    'fx_volume_3m','fx_freq_3m','atm_withdrawals_3m','loan_payments_3m',
    'cc_repayments_3m','installments_3m','inflow_3m','outflow_3m',
    'avg_monthly_balance_KZT','travel_share'
]

@dataclass
class MLModel:
    """Bundle of a trained LightGBM booster and the label mapping it was trained with."""
    # label name -> integer class id used as the training target
    label_encoder: Dict[str, int]
    # trained LightGBM booster
    booster: lgb.Booster
    # label names; position in the list equals the class id
    classes_: List[str]

def synth_labels_from_rules(df_scores: pd.DataFrame) -> pd.DataFrame:
    """Derive one training label per client: the product with the highest ``rules_score``.

    Returns a DataFrame with columns ``client_code`` and ``label``.
    """
    ranked = df_scores.sort_values(by=['client_code','rules_score'], ascending=[True, False])
    best_per_client = ranked.groupby('client_code').head(1)
    labels = best_per_client[['client_code','product']]
    return labels.rename(columns={'product':'label'})

def train_ml_lightgbm(df_feat: pd.DataFrame, df_scores: pd.DataFrame) -> MLModel:
    """Train a multiclass LightGBM model on labels synthesized from the rule scores.

    Parameters
    ----------
    df_feat : per-client feature frame containing ``client_code`` and ``FEATURE_COLS``.
    df_scores : long-format rule scores (``client_code``, ``product``, ``rules_score``).

    Returns
    -------
    MLModel holding the booster, the label -> id mapping and the sorted class names.

    Raises
    ------
    RuntimeError
        When fewer than two classes remain after dropping labels seen only once.
    """
    labels = synth_labels_from_rules(df_scores)
    df = df_feat.merge(labels, on='client_code', how='inner')

    # 1) drop rare classes (frequency < 2), otherwise the stratified split fails
    vc = df['label'].value_counts()
    keep_labels = vc[vc >= 2].index
    dropped = set(vc.index) - set(keep_labels)
    if dropped:
        print("[INFO] dropped rare classes from ML training:", {k:int(vc[k]) for k in dropped})
    df = df[df['label'].isin(keep_labels)].copy()

    # 2) check that at least 2 classes remain
    classes_left = sorted(df['label'].unique())
    if len(classes_left) < 2:
        raise RuntimeError(f"Too few classes after filtering: {classes_left}")

    # Missing feature values are replaced by 0.0 before training
    X = df[FEATURE_COLS].fillna(0.0).astype(float)
    y = df['label'].astype(str)

    # Class ids follow the alphabetical order of the remaining labels
    enc = {c:i for i,c in enumerate(classes_left)}
    y_enc = y.map(enc)

    # NOTE(review): this local import repeats the module-level one
    from sklearn.model_selection import train_test_split
    Xtr, Xva, ytr, yva = train_test_split(
        X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
    )

    # NOTE(review): this local import repeats the module-level one
    import lightgbm as lgb
    train_set = lgb.Dataset(Xtr, label=ytr)
    valid_set = lgb.Dataset(Xva, label=yva)

    params = dict(
        objective='multiclass',
        num_class=len(classes_left),
        metric='multi_logloss',
        learning_rate=0.05,
        num_leaves=31,
        seed=42
    )
    # Up to 300 rounds; stop after 30 rounds without validation improvement, log every 50
    booster = lgb.train(
        params, train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)]
    )
    return MLModel(label_encoder=enc, booster=booster, classes_=classes_left)

def predict_ml_proba(model: MLModel, df_feat: pd.DataFrame, products: List[str]) -> pd.DataFrame:
    """Predict per-product probabilities for every client row.

    Returns a DataFrame with ``client_code`` followed by one column per entry
    of ``products``; products the model was not trained on get 0.0.
    """
    features = df_feat[FEATURE_COLS].fillna(0.0).astype(float)
    booster = model.booster
    raw = booster.predict(features, num_iteration=booster.best_iteration)

    proba_frame = pd.DataFrame(raw, columns=model.classes_)
    proba_frame.insert(0, 'client_code', df_feat['client_code'].values)

    # Products absent from the trained classes are filled with zero probability
    untrained = [name for name in products if name not in proba_frame.columns]
    for name in untrained:
        proba_frame[name] = 0.0
    return proba_frame[['client_code'] + products]

Overwriting /content/hackathon_case_hybrid/src/ml_classifier.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/hybrid_selector.py
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
from .ml_classifier import train_ml_lightgbm, predict_ml_proba
from .config import TRAIN_ML, USE_ML_IN_HYBRID, WEIGHT_RULES, WEIGHT_ML

TIE_DELTA = 0.05

def softmax_log1p(x: np.ndarray) -> np.ndarray:
    """Softmax over ``log1p`` of the scores, with negative scores treated as 0.

    The maximum is subtracted before exponentiation for numerical stability.
    """
    damped = np.log1p(np.maximum(x, 0.0))
    weights = np.exp(damped - damped.max())
    return weights / weights.sum()

def rank_and_select(df_feat: pd.DataFrame):
    """Score all products per client and pick a winner plus a top-4 list.

    Returns
    -------
    best : DataFrame with one row per client (``client_code``, ``product``,
        ``hybrid_score``) left-joined with the credit-need flags.
    top4 : DataFrame with the four highest ``hybrid_score`` rows per client.
    df : the full long-format frame with rule scores, ML probabilities and
        hybrid scores.
    """
    df_rules = score_all_products(df_feat)
    df_need = need_credit_flags(df_feat)

    # ML probabilities are optional: on any training/prediction failure the
    # pipeline continues with all-zero probabilities and use_ml stays False
    use_ml = False
    if TRAIN_ML:
        try:
            model = train_ml_lightgbm(df_feat, df_rules)
            df_proba = predict_ml_proba(model, df_feat, PRODUCTS)
            use_ml = True
        except Exception as e:
            print("[WARN] ML disabled:", e)
            df_proba = df_feat[['client_code']].copy()
            for p in PRODUCTS: df_proba[p] = 0.0
    else:
        df_proba = df_feat[['client_code']].copy()
        for p in PRODUCTS: df_proba[p] = 0.0

    # Attach the wide probability columns, then pick the one matching each row's product
    df = df_rules.merge(df_proba, on='client_code', how='left')
    df['ml_proba'] = df.apply(lambda r: r.get(r['product'], 0.0), axis=1)

    # The ML probability is scaled by the client's best rule score so both terms share a scale
    max_rules = df.groupby('client_code')['rules_score'].transform('max')
    if USE_ML_IN_HYBRID and use_ml:
        df['hybrid_score'] = WEIGHT_RULES * df['rules_score'] + WEIGHT_ML * (df['ml_proba'] * max_rules)
    else:
        df['hybrid_score'] = df['rules_score']

    winners, top4_rows = [], []
    for cid, grp in df.groupby('client_code', sort=False):
        grp = grp.sort_values('rules_score', ascending=False)
        top1, top2 = grp.iloc[0], (grp.iloc[1] if len(grp) > 1 else None)
        max_r = top1['rules_score']
        # Tie: the two best rule scores differ by less than TIE_DELTA of the best one, or are equal
        tie = top2 is not None and (abs(top1['rules_score'] - top2['rules_score']) < (TIE_DELTA*max_r) or top1['rules_score']==top2['rules_score'])
        if tie and use_ml:
            # Tie-break: blend softmax-normalized rule scores with the ML probabilities.
            # NOTE(review): in this branch hybrid_score becomes a blended probability,
            # i.e. it is no longer on the scale of rules_score.
            r_scores = grp['rules_score'].values
            p_rules = softmax_log1p(r_scores)
            ml_vec = grp['ml_proba'].values
            p_hybrid = WEIGHT_RULES*p_rules + WEIGHT_ML*ml_vec
            k = int(np.argmax(p_hybrid))
            winner_row = grp.iloc[k].copy()
            winner_row['hybrid_score'] = p_hybrid[k]
            grp['hybrid_score'] = p_hybrid
        else:
            # No tie (or no ML): the winner is the row with the highest hybrid score
            k = grp['hybrid_score'].values.argmax()
            winner_row = grp.iloc[k]
        winners.append({'client_code': cid,'product':winner_row['product'],'hybrid_score':winner_row['hybrid_score']})
        top4_rows.extend(grp.sort_values('hybrid_score',ascending=False).head(4).to_dict('records'))

    best = pd.DataFrame(winners).merge(df_need, on='client_code', how='left')
    top4 = pd.DataFrame(top4_rows)
    return best, top4, df

Overwriting /content/hackathon_case_hybrid/src/hybrid_selector.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_nlp.py
# -*- coding: utf-8 -*-
from datetime import datetime
from .utils import month_name_ru, format_currency_kzt, only_one_exclamation
from .config import CTA_WORDS, PUSH_MIN_LEN, PUSH_MAX_LEN

def template_travel(name, month, taxi_cnt, travel_spend_m, benefit):
    """Push text for the travel card: taxi trip count, travel spend and the estimated cashback."""
    spend_txt = format_currency_kzt(travel_spend_m)
    benefit_txt = format_currency_kzt(benefit)
    return (
        f"{name}, в {month} у вас {taxi_cnt} поездок на такси на {spend_txt}. "
        f"С картой для путешествий вернули бы ≈{benefit_txt} кешбэком. Откройте карту."
    )

def template_premium(name, has_restos, benefit):
    """Push text for the premium card; mentions restaurant spend only when ``has_restos`` is truthy."""
    restos_part = ', траты в ресторанах' if has_restos else ''
    benefit_txt = format_currency_kzt(benefit)
    return (
        f"{name}, у вас стабильный крупный остаток{restos_part}. "
        f"Премиальная карта даст кешбэк и бесплатные снятия. "
        f"Выгода до {benefit_txt}. Оформите карту."
    )

def template_cc(name, cat1, cat2, cat3, benefit):
    """Push text for the credit card, listing the three top categories and the estimated cashback."""
    top_cats = f"{cat1}, {cat2}, {cat3}"
    benefit_txt = format_currency_kzt(benefit)
    return (
        f"{name}, ваши топ-категории — {top_cats}. "
        f"Кредитная карта даёт до 10% кешбэка и онлайн-бонусы. "
        f"Вернули бы {benefit_txt}. Оформите карту."
    )

def template_fx(name, curr):
    """Push text for currency exchange, naming the currency ``curr``."""
    parts = (
        f"{name}, вы часто пользуетесь валютой.",
        f"Обмен {curr} по выгодному курсу без комиссии, моментально.",
        "Подключите обмен валют.",
    )
    return " ".join(parts)

def template_deposit(name, benefit):
    """Push text for a deposit, quoting the estimated monthly income."""
    income_txt = format_currency_kzt(benefit)
    return (
        f"{name}, на депозите доход {income_txt} в месяц. "
        "Сохраните и приумножьте средства. Откройте депозит."
    )

def template_invest(name):
    """Push text for investments (contains no figures)."""
    body = ", у вас есть запас на инвестиции. Диверсифицируйте портфель с фондами и акциями. Подключите инвестиции."
    return f"{name}" + body

def template_cash_loan(name):
    """Banner text for the cash loan offer."""
    body = ", у вас расходы выше доходов. Наличный кредит поможет закрыть разрыв и планировать бюджет. Оформите кредит."
    return f"{name}" + body

def lint_push(text: str) -> dict:
    """Check a push text against the length, tone and call-to-action rules.

    Returns ``{"ok": bool, "len": int}``. The text passes when its length lies
    within [``PUSH_MIN_LEN``, ``PUSH_MAX_LEN``], the tone check succeeds, and
    the stripped text literally ends with one of ``CTA_WORDS`` (trailing
    punctuation is not ignored).
    """
    length_ok = PUSH_MIN_LEN <= len(text) <= PUSH_MAX_LEN
    tone_ok = only_one_exclamation(text)
    cta_ok = any(text.strip().endswith(word) for word in CTA_WORDS)
    return {"ok": length_ok and tone_ok and cta_ok, "len": len(text)}

def generate_push_text(product: str, facts: dict) -> str:
    """Build the push text for ``product`` from ``facts`` and nudge it toward the length limits.

    Parameters
    ----------
    product : product name; unknown names fall back to the investments text.
    facts : optional keys ``name``, ``taxi_cnt``, ``travel_spend_m``,
        ``benefit_kzt``, ``has_restos``, ``top3`` and ``fx_curr``; each has a
        default when absent.

    Returns
    -------
    The push text. When the lint check fails, a too-short card push gets one
    extra card call-to-action sentence, and any text longer than
    ``PUSH_MAX_LEN`` is cut and terminated with "…".
    """
    name = facts.get("name","Клиент")
    if product=="Карта для путешествий":
        text = template_travel(name, month_name_ru(datetime.now()), facts.get("taxi_cnt",5), facts.get("travel_spend_m",30000), facts.get("benefit_kzt",1200))
    elif product=="Премиальная карта":
        text = template_premium(name, facts.get("has_restos",True), facts.get("benefit_kzt",2000))
    elif product=="Кредитная карта":
        # list(...) accepts tuples/arrays as well; the blanks guarantee three slots
        cats = list(facts.get("top3",["рестораны","такси","продукты"])) + ["","",""]
        text = template_cc(name, cats[0], cats[1], cats[2], facts.get("benefit_kzt",2500))
    elif product=="Обмен валют":
        text = template_fx(name, facts.get("fx_curr","USD"))
    elif product.startswith("Депозит"):
        text = template_deposit(name, facts.get("benefit_kzt",1800))
    elif product=="Инвестиции":
        text = template_invest(name)  # no figures, per the spec
    elif product=="Золотые слитки":
        text = "Долгосрочная защита и диверсификация капитала за счёт золотых слитков. Узнайте детали и подверстайте стратегию. Подключите золото."
    elif product=="Кредит наличными":
        # The cash loan has its own template; it must not fall through to the investments text
        text = template_cash_loan(name)
    else:
        text = template_invest(name)
    check = lint_push(text)
    if not check["ok"]:
        # The padding sentence is a card call-to-action, so it is only added to card products
        if len(text)<PUSH_MIN_LEN and "карт" in product.lower(): text=text+" Оформите карту."
        if len(text)>PUSH_MAX_LEN: text=text[:PUSH_MAX_LEN-1]+"…"
    return text

Overwriting /content/hackathon_case_hybrid/src/push_nlp.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/pipeline.py
# -*- coding: utf-8 -*-
import pandas as pd
from datetime import datetime
from .features import build_features
from .hybrid_selector import rank_and_select
from .push_nlp import generate_push_text, template_cash_loan

def _finite_round(value, default=0):
    """Round ``value`` to an int; return ``default`` for None, NaN, ±inf or non-numeric input."""
    import math  # local import: this module's header does not import math
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return round(number) if math.isfinite(number) else default


def run_pipeline():
    """Run the full flow: load CSVs, build features, select products, write recommendations.

    Reads ``clients.csv``, ``transactions.csv`` and ``transfers.csv`` from the
    project data folder and writes ``output/recommendations.csv`` with the
    columns ``client_code``, ``product``, ``push_notification`` and
    ``credit_banner``.
    """
    df_clients = pd.read_csv("/content/hackathon_case_hybrid/data/clients.csv")
    df_txn = pd.read_csv("/content/hackathon_case_hybrid/data/transactions.csv")
    df_trf = pd.read_csv("/content/hackathon_case_hybrid/data/transfers.csv")

    df_feat = build_features(df_clients, df_txn, df_trf)
    best, top4, df_all = rank_and_select(df_feat)

    facts_all = df_feat.set_index('client_code').to_dict(orient='index')
    rows = []
    for _, r in best.iterrows():
        cid, product = r['client_code'], r['product']
        f = facts_all.get(cid,{})

        # The client attributes are left-joined, so the name can be NaN
        name = f.get("name")
        if not isinstance(name, str) or not name.strip():
            name = "Клиент"

        f_enrich = {
            "name": name,
            "month_ru": datetime.now().month,
            "travel_spend_m": _finite_round(f.get("travel_spend_3m",0)/3.0),
            # The score can be non-finite (e.g. inf for a forced product);
            # round() would raise on it, so such values are shown as 0
            "benefit_kzt": _finite_round(r.get("hybrid_score",0)),
            "top3": f.get("top3_cats", []),
            "has_restos": (f.get("premium_extra_spend_3m",0)>0),
            "fx_curr": "USD",
            "taxi_cnt": 5
        }
        text_main = generate_push_text(product, f_enrich)

        # A missing flag (NaN after the left join) must not count as a credit need
        need_flag = r.get("need_credit", False)
        needs_credit = bool(need_flag) if pd.notna(need_flag) else False
        credit_banner = template_cash_loan(f_enrich["name"]) if needs_credit else ""

        rows.append({
            "client_code": cid,
            "product": product,
            "push_notification": text_main,
            "credit_banner": credit_banner
        })

    df_out = pd.DataFrame(rows)
    out_path = "/content/hackathon_case_hybrid/output/recommendations.csv"
    df_out.to_csv(out_path,index=False)
    print("Сохранено:", out_path)

if __name__=="__main__":
    run_pipeline()

Overwriting /content/hackathon_case_hybrid/src/pipeline.py


In [None]:
# Run the pipeline as a module from the project root so the package-relative imports inside src/ resolve.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
%%writefile /content/hackathon_case_hybrid/src/ml_classifier.py
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import Dict, List

FEATURE_COLS = [
    'spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m',
    'fx_volume_3m','fx_freq_3m','atm_withdrawals_3m','loan_payments_3m',
    'cc_repayments_3m','installments_3m','inflow_3m','outflow_3m',
    'avg_monthly_balance_KZT','travel_share'
]

EXCLUDE_TRAIN_LABELS = {'Кредит наличными', 'Кредитная карта'}  # not used as training targets

@dataclass
class MLModel:
    """Bundle of a trained LightGBM booster and the label mapping it was trained with."""
    # label name -> integer class id used as the training target
    label_encoder: Dict[str, int]
    # trained LightGBM booster
    booster: lgb.Booster
    # label names; position in the list equals the class id
    classes_: List[str]

def synth_labels_from_rules(df_scores: pd.DataFrame) -> pd.DataFrame:
    """Derive one training label per client from the rule scores.

    For each client the three best products by ``rules_score`` are considered
    and the first one not in ``EXCLUDE_TRAIN_LABELS`` becomes the label. When
    all three are excluded, the best one is used anyway so the client is not
    lost.

    Returns a DataFrame with columns ``client_code`` and ``label``, ordered by
    ``client_code``; an empty input yields an empty frame with both columns.
    """
    ranked = df_scores.sort_values(['client_code','rules_score'], ascending=[True, False])
    top3 = ranked.groupby('client_code').head(3)

    # Iterate over the groups directly rather than via groupby.apply: apply's
    # access to the grouping column is deprecated in recent pandas versions.
    records = []
    for cid, grp in top3.groupby('client_code', sort=True):
        candidates = grp['product'].tolist()
        label = next((p for p in candidates if p not in EXCLUDE_TRAIN_LABELS), candidates[0])
        records.append({'client_code': cid, 'label': label})
    return pd.DataFrame(records, columns=['client_code','label'])

def train_ml_lightgbm(df_feat: pd.DataFrame, df_scores: pd.DataFrame) -> MLModel:
    """Train a multiclass LightGBM model on labels synthesized from the rule scores.

    Labels seen only once are dropped so the stratified 80/20 split can work.
    Raises RuntimeError when fewer than two classes remain afterwards.
    """
    train_df = df_feat.merge(synth_labels_from_rules(df_scores), on='client_code', how='inner')

    # Stratified splitting needs at least two samples per class
    label_counts = train_df['label'].value_counts()
    frequent = label_counts[label_counts >= 2].index
    rare = set(label_counts.index) - set(frequent)
    if rare:
        print("[INFO] dropped rare classes from ML training:", {k:int(label_counts[k]) for k in rare})
    train_df = train_df[train_df['label'].isin(frequent)].copy()

    class_names = sorted(train_df['label'].unique())
    if len(class_names) < 2:
        raise RuntimeError(f"Too few classes after filtering: {class_names}")

    # Class ids follow the alphabetical order of the remaining labels
    class_to_id = {name: pos for pos, name in enumerate(class_names)}
    features = train_df[FEATURE_COLS].fillna(0.0).astype(float)
    targets = train_df['label'].astype(str).map(class_to_id)

    X_fit, X_val, y_fit, y_val = train_test_split(
        features, targets, test_size=0.2, random_state=42, stratify=targets
    )

    lgb_params = {
        'objective': 'multiclass',
        'num_class': len(class_names),
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'seed': 42,
    }
    # Up to 300 rounds; stop after 30 rounds without validation improvement, log every 50
    booster = lgb.train(
        lgb_params,
        lgb.Dataset(X_fit, label=y_fit),
        num_boost_round=300,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)],
    )
    return MLModel(label_encoder=class_to_id, booster=booster, classes_=class_names)

def predict_ml_proba(model: MLModel, df_feat: pd.DataFrame, products: List[str]) -> pd.DataFrame:
    """Predict per-product probabilities for every client row.

    Returns a DataFrame with ``client_code`` followed by one column per entry
    of ``products``; products the model was not trained on get 0.0.
    """
    features = df_feat[FEATURE_COLS].fillna(0.0).astype(float)
    booster = model.booster
    raw = booster.predict(features, num_iteration=booster.best_iteration)

    proba_frame = pd.DataFrame(raw, columns=model.classes_)
    proba_frame.insert(0, 'client_code', df_feat['client_code'].values)

    # Products absent from the trained classes are filled with zero probability
    untrained = [name for name in products if name not in proba_frame.columns]
    for name in untrained:
        proba_frame[name] = 0.0
    return proba_frame[['client_code'] + products]

Overwriting /content/hackathon_case_hybrid/src/ml_classifier.py


In [None]:
# Run the pipeline as a module from the project root so the package-relative imports inside src/ resolve.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
# Switch to the project root and expose it on PYTHONPATH
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid


In [None]:
# Run the pipeline module (expects the working directory to be the project root)
!python -m src.pipeline

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
# Download the recommendations file through the Colab `files` helper
from google.colab import files
files.download("/content/hackathon_case_hybrid/output/recommendations.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

path = "/content/hackathon_case_hybrid/output/recommendations.csv"
df = pd.read_csv(path)  # read the file as-is (it is UTF-8)
df.to_csv(path, index=False, encoding="utf-8-sig")  # re-save in place with the utf-8-sig encoding (UTF-8 with BOM)

In [None]:
%%writefile /content/hackathon_case_hybrid/src/hybrid_selector.py
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
from .ml_classifier import train_ml_lightgbm, predict_ml_proba
from .config import TRAIN_ML, USE_ML_IN_HYBRID, WEIGHT_RULES, WEIGHT_ML

TIE_DELTA = 0.05

def softmax_log1p(x: np.ndarray) -> np.ndarray:
    """Softmax over ``log1p`` of the scores, with negative scores treated as 0.

    The maximum is subtracted before exponentiation for numerical stability.
    """
    damped = np.log1p(np.maximum(x, 0.0))
    weights = np.exp(damped - damped.max())
    return weights / weights.sum()

def rank_and_select(df_feat: pd.DataFrame):
    """Pick the winning product and a top-4 list for every client.

    Steps:
      1. Score every product with the rule engine (``score_all_products``) and
         compute the per-client credit-need flag (``need_credit_flags``).
      2. If ``TRAIN_ML`` is set, train the LightGBM model and predict
         per-product probabilities; on any failure (or when ML is off) the
         probabilities are filled with zeros.
      3. Build ``hybrid_score`` (rules only, or a rules/ML blend when
         ``USE_ML_IN_HYBRID`` is set and ML succeeded).
      4. Per client: force 'Кредит наличными' when the need flag is set,
         otherwise take the best score, using ML as a tie-breaker.

    Returns:
        (best, top4, df): ``best`` has one row per client with
        ``client_code``, ``product``, ``hybrid_score``; ``top4`` holds up to
        four highest-``hybrid_score`` rows per client; ``df`` is the full
        client x product table with all scores.

    NOTE(review): in the tie branch ``hybrid_score`` is overwritten with a
    blended probability, while pipeline.py rounds ``hybrid_score`` into
    ``benefit_kzt`` — the two scales are not comparable.
    """
    df_rules = score_all_products(df_feat)
    df_need = need_credit_flags(df_feat)  # per-client credit-need flag
    need_map = dict(zip(df_need['client_code'], df_need['need_credit']))

    # ML probabilities (zeros when ML is disabled or training fails)
    use_ml = False
    if TRAIN_ML:
        try:
            model = train_ml_lightgbm(df_feat, df_rules)
            df_proba = predict_ml_proba(model, df_feat, PRODUCTS)
            use_ml = True
        except Exception as e:
            print("[WARN] ML disabled:", e)
            df_proba = df_feat[['client_code']].copy()
            for p in PRODUCTS: df_proba[p] = 0.0
    else:
        df_proba = df_feat[['client_code']].copy()
        for p in PRODUCTS: df_proba[p] = 0.0

    df = df_rules.merge(df_proba, on='client_code', how='left')
    # For each (client, product) row pick the probability column named after the product.
    df['ml_proba'] = df.apply(lambda r: r.get(r['product'], 0.0), axis=1)

    # The ML probability is scaled by the client's best rules score before blending.
    max_rules = df.groupby('client_code')['rules_score'].transform('max')
    if USE_ML_IN_HYBRID and use_ml:
        df['hybrid_score'] = WEIGHT_RULES * df['rules_score'] + WEIGHT_ML * (df['ml_proba'] * max_rules)
    else:
        df['hybrid_score'] = df['rules_score']

    winners, top4_rows = [], []
    for cid, grp in df.groupby('client_code', sort=False):
        # 1) strong credit signals: force 'Кредит наличными' into first place
        if need_map.get(cid, False):
            winners.append({'client_code': cid, 'product': 'Кредит наличными', 'hybrid_score': float('inf')})
            # top-4 of the remaining products by hybrid score (the cash loan is not in PRODUCTS and is not scored)
            top4_rows.extend(grp.sort_values('hybrid_score', ascending=False).head(4).to_dict('records'))
            continue

        # 2) regular mode: max rules score, with an ML tie-break
        grp = grp.sort_values('rules_score', ascending=False)
        top1, top2 = grp.iloc[0], (grp.iloc[1] if len(grp) > 1 else None)
        max_r = top1['rules_score']
        # Tie: the two best rules scores differ by less than TIE_DELTA of the top score (or are equal).
        tie = top2 is not None and (abs(top1['rules_score'] - top2['rules_score']) < (TIE_DELTA*max_r) or top1['rules_score']==top2['rules_score'])

        if tie and use_ml:
            r_scores = grp['rules_score'].values
            p_rules = softmax_log1p(r_scores)
            ml_vec = grp['ml_proba'].values
            p_hybrid = WEIGHT_RULES*p_rules + WEIGHT_ML*ml_vec
            k = int(np.argmax(p_hybrid))
            winner_row = grp.iloc[k].copy()
            winner_row['hybrid_score'] = p_hybrid[k]
            # NOTE(review): from here on hybrid_score for this client is a probability blend, not a score in rules units.
            grp['hybrid_score'] = p_hybrid
        else:
            k = grp['hybrid_score'].values.argmax()
            winner_row = grp.iloc[k]

        winners.append({'client_code': cid, 'product': winner_row['product'], 'hybrid_score': winner_row['hybrid_score']})
        top4_rows.extend(grp.sort_values('hybrid_score', ascending=False).head(4).to_dict('records'))

    best = pd.DataFrame(winners)
    top4 = pd.DataFrame(top4_rows)
    return best, top4, df

Overwriting /content/hackathon_case_hybrid/src/hybrid_selector.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_nlp.py
# -*- coding: utf-8 -*-
from datetime import datetime
from .utils import month_name_ru, format_currency_kzt, only_one_exclamation
from .config import CTA_WORDS, PUSH_MIN_LEN, PUSH_MAX_LEN

def template_travel(name, month, taxi_cnt, travel_spend_m, benefit):
    """Push text for the travel card: taxi rides, monthly spend and estimated cashback."""
    spent = format_currency_kzt(travel_spend_m)
    cashback = format_currency_kzt(benefit)
    return " ".join((
        f"{name}, в {month} у вас {taxi_cnt} поездок на такси на {spent}.",
        f"С картой для путешествий вернули бы ≈{cashback} кешбэком.",
        "Откройте карту.",
    ))

def template_premium(name, has_restos, benefit):
    """Push text for the premium card; mentions restaurants only when ``has_restos`` is truthy."""
    restos = ', траты в ресторанах' if has_restos else ''
    upside = format_currency_kzt(benefit)
    return " ".join((
        f"{name}, у вас стабильный крупный остаток{restos}.",
        "Премиальная карта даст кешбэк и бесплатные снятия.",
        f"Выгода до {upside}.",
        "Оформите карту.",
    ))

def template_cc(name, cat1, cat2, cat3, benefit):
    """Push text for the credit card built around the client's three top categories."""
    cashback = format_currency_kzt(benefit)
    return " ".join((
        f"{name}, ваши топ-категории — {cat1}, {cat2}, {cat3}.",
        "Кредитная карта даёт до 10% кешбэка и онлайн-бонусы.",
        f"Вернули бы {cashback}.",
        "Оформите карту.",
    ))

def template_fx(name, curr):
    """Push text for currency exchange in currency ``curr``."""
    return " ".join((
        f"{name}, вы часто пользуетесь валютой.",
        f"Обмен {curr} по выгодному курсу без комиссии, моментально.",
        "Подключите обмен валют.",
    ))

def template_deposit(name, benefit):
    """Push text for a deposit with the estimated monthly income."""
    income = format_currency_kzt(benefit)
    return " ".join((
        f"{name}, на депозите доход {income} в месяц.",
        "Сохраните и приумножьте средства.",
        "Откройте депозит.",
    ))

def template_invest(name):
    """Push text for investments (contains no figures)."""
    return " ".join((
        f"{name}, у вас есть запас на инвестиции.",
        "Диверсифицируйте портфель с фондами и акциями.",
        "Подключите инвестиции.",
    ))

def template_cash_loan(name):
    """Push text for the cash loan."""
    return " ".join((
        f"{name}, у вас расходы выше доходов.",
        "Наличный кредит поможет закрыть разрыв и планировать бюджет.",
        "Оформите кредит.",
    ))

def lint_push(text: str) -> dict:
    """Check a push text against length, exclamation-mark and CTA-ending rules.

    Returns ``{"ok": bool, "len": int}``; ``ok`` is True only when all three
    checks pass.
    """
    length_ok = PUSH_MIN_LEN <= len(text) <= PUSH_MAX_LEN
    exclamation_ok = only_one_exclamation(text)
    trimmed = text.strip()
    cta_ok = any(trimmed.endswith(cta) for cta in CTA_WORDS)
    passed = bool(length_ok and exclamation_ok and cta_ok)
    return {"ok": passed, "len": len(text)}

def generate_push_text(product: str, facts: dict) -> str:
    """Build the push text for ``product`` from the matching template.

    ``facts`` supplies the template values; every lookup has a hard-coded
    default, so a missing key silently yields an example value.
    Unknown products fall back to the investments template. After building
    the text it is linted; a too-short text gets a CTA appended and a
    too-long text is cut to ``PUSH_MAX_LEN`` characters.

    NOTE(review): the short-text fallback always appends the *card* CTA,
    whatever the product is; and the truncation ends the text with "…",
    which presumably no longer matches any entry of CTA_WORDS — confirm.
    """
    name = facts.get("name","Клиент")
    if product=="Карта для путешествий":
        text = template_travel(name, month_name_ru(datetime.now()), facts.get("taxi_cnt",5), facts.get("travel_spend_m",30000), facts.get("benefit_kzt",1200))
    elif product=="Премиальная карта":
        text = template_premium(name, facts.get("has_restos",True), facts.get("benefit_kzt",2000))
    elif product=="Кредитная карта":
        # Pad with empty strings so indexing cats[0..2] never fails.
        cats = facts.get("top3",["рестораны","такси","продукты"]) + ["","",""]
        text = template_cc(name, cats[0], cats[1], cats[2], facts.get("benefit_kzt",2500))
    elif product=="Обмен валют":
        text = template_fx(name, facts.get("fx_curr","USD"))
    elif product.startswith("Депозит"):
        text = template_deposit(name, facts.get("benefit_kzt",1800))
    elif product=="Инвестиции":
        text = template_invest(name)  # no figures, per the task spec
    elif product=="Золотые слитки":
        # NOTE(review): the only branch that does not address the client by name.
        text = "Долгосрочная защита и диверсификация капитала за счёт золотых слитков. Узнайте детали и подверстайте стратегию. Подключите золото."
    elif product=="Кредит наличными":
        text = template_cash_loan(name)  # the cash loan is now handled as a regular #1 product
    else:
        text = template_invest(name)

    check = lint_push(text)
    if not check["ok"]:
        if len(text)<PUSH_MIN_LEN: text=text+" Оформите карту."
        if len(text)>PUSH_MAX_LEN: text=text[:PUSH_MAX_LEN-1]+"…"
    return text

Overwriting /content/hackathon_case_hybrid/src/push_nlp.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/pipeline.py
# -*- coding: utf-8 -*-
import pandas as pd
from datetime import datetime
from .features import build_features
from .hybrid_selector import rank_and_select
from .push_nlp import generate_push_text

def run_pipeline():
    """Run the whole flow: load CSVs, build features, pick products, write pushes.

    Reads ``clients.csv``, ``transactions.csv`` and ``transfers.csv`` from the
    project data folder, selects one product per client via
    ``rank_and_select`` and writes ``output/recommendations.csv``
    (columns: client_code, product, push_notification) as UTF-8 with BOM.

    The benefit shown in the push is taken from the rule-based score of the
    winning product (``rules_score`` in ``df_all``), not from ``hybrid_score``:
    the latter is ``inf`` for forced credit and a 0..1 probability blend in
    the ML tie-break branch, so rounding it produced "0 ₸" benefits.
    """
    import math  # local import keeps the block self-contained

    df_clients = pd.read_csv("/content/hackathon_case_hybrid/data/clients.csv")
    df_txn = pd.read_csv("/content/hackathon_case_hybrid/data/transactions.csv")
    df_trf = pd.read_csv("/content/hackathon_case_hybrid/data/transfers.csv")

    df_feat = build_features(df_clients, df_txn, df_trf)
    best, top4, df_all = rank_and_select(df_feat)

    facts_all = df_feat.set_index('client_code').to_dict(orient='index')
    # (client_code, product) -> rule-based benefit estimate.
    rules_benefit = df_all.set_index(['client_code', 'product'])['rules_score'].to_dict()

    rows = []
    for _, r in best.iterrows():
        cid, product = r['client_code'], r['product']
        f = facts_all.get(cid,{})

        # Forced 'Кредит наличными' is not a scored product, so it has no entry -> 0.
        benefit = float(rules_benefit.get((cid, product), 0.0))
        # isfinite guards both +/-inf and NaN (round(nan) raises ValueError).
        benefit_kzt = round(benefit) if math.isfinite(benefit) else 0

        f_enrich = {
            "name": f.get("name","Клиент"),
            "month_ru": datetime.now().month,
            "travel_spend_m": round(f.get("travel_spend_3m",0)/3.0),
            "benefit_kzt": benefit_kzt,
            "top3": f.get("top3_cats", []),
            "has_restos": (f.get("premium_extra_spend_3m",0)>0),
            "fx_curr": "USD",
            # NOTE(review): hard-coded ride count — the features do not carry a taxi trip count,
            # so the travel push states a number that is not derived from the client's data.
            "taxi_cnt": 5
        }
        text_main = generate_push_text(product, f_enrich)
        rows.append({"client_code": cid, "product": product, "push_notification": text_main})

    df_out = pd.DataFrame(rows)
    out_path = "/content/hackathon_case_hybrid/output/recommendations.csv"
    df_out.to_csv(out_path, index=False, encoding="utf-8-sig")  # Excel-friendly
    print("Сохранено:", out_path)

if __name__=="__main__":
    run_pipeline()

Overwriting /content/hackathon_case_hybrid/src/pipeline.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
import pandas as pd
# Preview the first rows of the saved recommendations file.
pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv").head()

Unnamed: 0,client_code,name,product,push_notification
0,1,Айгерим,Кредит наличными,"Айгерим, у вас на счету 92 643 ₸! 🤑 Обратите в..."
1,2,Данияр,Кредитная карта,"Привет, Данияр! 👋 Ваш баланс на кредитной карт..."
2,3,Сабина,Кредит наличными,"Сабина, привет! 😊 У вас на балансе 63 116 ₸. О..."
3,4,Тимур,Кредит наличными,"Привет, Тимур! 👋 Ваш баланс 83 351 ₸. Мы замет..."
4,5,Камилла,Кредитная карта,"Камилла, ваш баланс на кредитной карте составл..."


In [None]:
code = r"""
import pandas as pd

def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    flags = []
    for _, r in df_feat.iterrows():
        flags.append({
            'client_code': r['client_code'],
            'need_credit': bool(utility_credit_cash(r))
        })
    return pd.DataFrame(flags)
"""
# Append the helper only once: opening in "a" mode unconditionally would add a
# duplicate definition to rules_scoring.py every time this cell is re-run.
target = "/content/hackathon_case_hybrid/src/rules_scoring.py"
with open(target, "r", encoding="utf-8") as f:
    already_defined = "def need_credit_flags(" in f.read()

if already_defined:
    print("need_credit_flags already present in rules_scoring.py — nothing appended")
else:
    with open(target, "a", encoding="utf-8") as f:
        f.write("\n" + code + "\n")
    print("need_credit_flags добавлена в rules_scoring.py")

need_credit_flags добавлена в rules_scoring.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
Сохранено: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
%%writefile /content/hackathon_case_hybrid/src/rules_scoring.py
# -*- coding: utf-8 -*-
import pandas as pd
from .config import (
    PREMIUM_CB_LIMIT, PREMIUM_TIER_THRESHOLDS, PREMIUM_TIER_CBs,
    TRAVEL_CB, PREMIUM_EXTRA_CB_CATS, PREMIUM_EXTRA_CB,
    CC_TOP3_CB, CC_ONLINE_CB_CATS, CC_ONLINE_CB,
    RATE_SAVINGS, RATE_GOAL, RATE_MULTI, FX_SPREAD_SAVING,
    TRAVEL_SHARE_MIN,
    GOLD_T1_BAL, GOLD_T2_BAL, GOLD_T1_UTILITY, GOLD_T2_UTILITY,
    CREDIT_OUTFLOW_INFLOW_RATIO, CREDIT_NEARZERO_BAL_MAX,
    CREDIT_VERYLOW_BAL_MAX, CREDIT_LOAN_FREQ_MIN
)

# Продукты, участвующие в денежном ранжировании (кредит НЕ здесь)
PRODUCTS = [
    'Карта для путешествий',
    'Премиальная карта',
    'Кредитная карта',
    'Обмен валют',
    'Депозит Сберегательный',
    'Депозит Накопительный',
    'Депозит Мультивалютный',
    'Инвестиции',
    'Золотые слитки'
]

def tier_cashback(avg_balance):
    """Map an average balance to its premium cashback tier rate.

    A missing balance, or one below the first threshold, gets the lowest
    tier; below the second threshold the middle tier; otherwise the top tier.
    """
    if pd.isna(avg_balance) or avg_balance < PREMIUM_TIER_THRESHOLDS[0]:
        return PREMIUM_TIER_CBs[0]
    tier = 1 if avg_balance < PREMIUM_TIER_THRESHOLDS[1] else 2
    return PREMIUM_TIER_CBs[tier]

def benefit_travel(r):
    """Monthly travel-card cashback: TRAVEL_CB times one third of `travel_spend_3m`."""
    monthly_travel = r.get('travel_spend_3m', 0.0) / 3.0
    # A cap could be added here if needed.
    return TRAVEL_CB * monthly_travel

def benefit_premium(r):
    """Estimated monthly benefit of the premium card.

    Sum of three parts: tiered base cashback on monthly spend (capped by
    PREMIUM_CB_LIMIT), extra cashback on the premium categories, and a rough
    estimate of saved ATM fees.
    """
    monthly_spend = r.get('spend_total_3m', 0.0)/3.0
    tier_rate = tier_cashback(r.get('avg_monthly_balance_KZT', 0.0))
    capped_base = min(PREMIUM_CB_LIMIT, tier_rate * monthly_spend)

    extra = PREMIUM_EXTRA_CB * (r.get('premium_extra_spend_3m', 0.0)/3.0)

    # Rough fee-saving estimate: 1% of the withdrawn volume (30 000 per
    # withdrawal assumed), with the volume capped at 3 000 000.
    atm_per_month = r.get('atm_withdrawals_3m', 0) / 3.0
    fee_savings = 0.01 * min(3_000_000.0, atm_per_month * 30_000.0)

    return capped_base + extra + fee_savings

def benefit_cc(r):
    """Estimated monthly credit-card cashback.

    Top-3 categories are approximated as 60% of total monthly spend; online
    services spend is rewarded separately.
    """
    top3_part = CC_TOP3_CB * (0.6 * (r.get('spend_total_3m', 0.0)/3.0))
    online_part = CC_ONLINE_CB * (r.get('online_spend_3m', 0.0)/3.0)
    return top3_part + online_part

def benefit_fx(r):
    """Estimated monthly saving on FX spread: FX_SPREAD_SAVING times monthly FX volume."""
    monthly_volume = r.get('fx_volume_3m', 0.0) / 3.0
    return FX_SPREAD_SAVING * monthly_volume

def benefit_deposits(r):
    """Monthly interest estimates for the three deposits.

    Returns a ``(savings, goal, multi)`` tuple computed from the average
    monthly balance; the multi-currency figure applies to 40% of the balance.
    """
    balance = r.get('avg_monthly_balance_KZT', 0.0)

    def monthly_interest(annual_rate):
        return (annual_rate/12.0) * balance

    # Nominally only part of the balance is held in the multi-currency deposit.
    return (
        monthly_interest(RATE_SAVINGS),
        monthly_interest(RATE_GOAL),
        monthly_interest(RATE_MULTI) * 0.4,
    )

# Investments score 0: per the task spec no return is promised.
def benefit_investments(r):
    """Always 0.0; the argument is accepted only for a uniform signature."""
    return 0.0

# Gold: a fixed utility chosen by balance thresholds.
def benefit_gold(r):
    """Utility of gold bars: the tier-2 value, the tier-1 value, or 0.0 by balance."""
    balance = r.get('avg_monthly_balance_KZT', 0.0)
    tiers = (
        (GOLD_T2_BAL, GOLD_T2_UTILITY),
        (GOLD_T1_BAL, GOLD_T1_UTILITY),
    )
    # Highest threshold first, so the first hit is the best tier reached.
    for threshold, utility in tiers:
        if balance >= threshold:
            return float(utility)
    return 0.0

# --- Hard signals that put the cash loan in first place ---
def utility_credit_cash(r):
    """Return 1.0 when the client shows a severe need for a cash loan, else 0.0.

    The loan is forced only in a severe case:
      A) a huge outflow/inflow gap AND an almost-zero average balance, or
      B) a huge gap AND a very low average balance AND frequent loan payments.
    """
    inflow = float(r.get('inflow_3m', 0.0))
    outflow = float(r.get('outflow_3m', 0.0))
    avg_balance = float(r.get('avg_monthly_balance_KZT', 0.0))
    loan_payments = int(r.get('loan_payments_3m', 0))

    # Without the big gap neither case applies.
    if not outflow >= CREDIT_OUTFLOW_INFLOW_RATIO * max(inflow, 1.0):
        return 0.0
    # Case A
    if avg_balance <= CREDIT_NEARZERO_BAL_MAX:
        return 1.0
    # Case B
    if avg_balance <= CREDIT_VERYLOW_BAL_MAX and loan_payments >= CREDIT_LOAN_FREQ_MIN:
        return 1.0
    return 0.0

def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Build a frame with ``client_code`` and a boolean ``need_credit`` per feature row."""
    records = [
        {'client_code': row['client_code'], 'need_credit': bool(utility_credit_cash(row))}
        for _, row in df_feat.iterrows()
    ]
    return pd.DataFrame(records)

def score_all_products(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Score every product for every client with the rule-based benefit functions.

    Returns a long frame with ``client_code``, ``product`` and ``rules_score``
    (one row per client x product). The travel-card score is multiplied by
    2.5 when the client's ``travel_share`` reaches TRAVEL_SHARE_MIN.
    """
    records = []
    for _, client in df_feat.iterrows():
        travel = benefit_travel(client)
        premium = benefit_premium(client)
        credit_card = benefit_cc(client)
        fx = benefit_fx(client)
        dep_savings, dep_goal, dep_multi = benefit_deposits(client)
        invest = benefit_investments(client)
        gold = benefit_gold(client)

        if client.get('travel_share', 0.0) >= TRAVEL_SHARE_MIN:
            travel *= 2.5

        # Product order is kept stable: it defines the row order of the result.
        per_product = (
            ('Карта для путешествий', travel),
            ('Премиальная карта', premium),
            ('Кредитная карта', credit_card),
            ('Обмен валют', fx),
            ('Депозит Сберегательный', dep_savings),
            ('Депозит Накопительный', dep_goal),
            ('Депозит Мультивалютный', dep_multi),
            ('Инвестиции', invest),
            ('Золотые слитки', gold),
        )
        for product, score in per_product:
            records.append({'client_code': client['client_code'], 'product': product, 'rules_score': float(score)})
    return pd.DataFrame(records)

Overwriting /content/hackathon_case_hybrid/src/rules_scoring.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

import pandas as pd
df = pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv")
df['product'].value_counts()

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
Сохранено: /content/hackathon_case_hybrid/output/recommendations.csv


Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
Кредитная карта,45
Премиальная карта,8
Депозит Сберегательный,7


In [None]:
%%writefile /content/hackathon_case_hybrid/src/features.py
# -*- coding: utf-8 -*-
import pandas as pd
from .utils import safe_div

PREMIUM_EXTRA = ['Ювелирные украшения','Косметика и Парфюмерия','Кафе и рестораны']
ONLINE_CATS   = ['Едим дома','Смотрим дома','Играем дома']

def series_get(df, col, default_val):
    """Return column ``col`` of ``df``, or a constant Series on ``df``'s index when it is absent."""
    if col in df.columns:
        return df[col]
    return pd.Series(default_val, index=df.index)

def build_features(df_clients, df_txn, df_trf):
    """Build one feature row per client from transactions, transfers and the client table.

    Reads ``client_code``/``category``/``amount`` from ``df_txn``,
    ``client_code``/``type``/``amount`` from ``df_trf`` and the profile
    columns (name, status, age, city, avg_monthly_balance_KZT) from
    ``df_clients``. Produces ``*_3m`` totals, their ``*_m`` per-month
    versions (divided by 3), the top-3 spending categories and
    ``travel_share``.

    NOTE(review): the result is indexed by clients that appear in ``df_txn``
    only — clients without any transaction get no feature row.
    """
    # Spend per category
    spend_by_cat = df_txn.groupby(['client_code','category'])['amount'].sum().unstack(fill_value=0.0)
    spend_total  = spend_by_cat.sum(axis=1)

    # The THREE core categories for the travel card
    travel_core  = spend_by_cat.reindex(columns=['Путешествия'], fill_value=0.0).sum(axis=1)
    taxi_spend   = spend_by_cat.reindex(columns=['Такси'],       fill_value=0.0).sum(axis=1)
    hotels_spend = spend_by_cat.reindex(columns=['Отели'],       fill_value=0.0).sum(axis=1)

    # For compatibility: overall travel total = 'Путешествия' + 'Такси' + 'Отели'
    travel_sum = travel_core + taxi_spend + hotels_spend

    premium_extra_spend = spend_by_cat.reindex(columns=PREMIUM_EXTRA, fill_value=0.0).sum(axis=1)
    online_spend        = spend_by_cat.reindex(columns=ONLINE_CATS,   fill_value=0.0).sum(axis=1)

    # Transfers: counts and amounts per transfer type
    freq = df_trf.groupby(['client_code','type']).size().unstack(fill_value=0)
    amount_trf = df_trf.groupby(['client_code','type'])['amount'].sum().unstack(fill_value=0.0)

    # Every feature below is aligned to the clients present in the transactions table.
    idx = spend_by_cat.index

    # Missing transfer types contribute a zero Series, so the sums never fail on absent columns.
    inflow = sum(series_get(amount_trf, c, 0.0) for c in [
        'salary_in','stipend_in','family_in','refund_in','cashback_in','invest_in','deposit_fx_withdraw_in'
    ])
    outflow = sum(series_get(amount_trf, c, 0.0) for c in [
        'p2p_out','card_out','atm_withdrawal','utilities_out','loan_payment_out','cc_repayment_out',
        'installment_payment_out','invest_out','deposit_topup_out','gold_buy_out'
    ])

    fx_buy_amt  = series_get(amount_trf, 'fx_buy', 0.0).abs()
    fx_sell_amt = series_get(amount_trf, 'fx_sell', 0.0).abs()
    fx_volume   = fx_buy_amt + fx_sell_amt

    fx_buy_freq  = series_get(freq, 'fx_buy', 0).astype(int)
    fx_sell_freq = series_get(freq, 'fx_sell', 0).astype(int)
    fx_freq      = fx_buy_freq + fx_sell_freq

    atm_withdrawals = series_get(freq, 'atm_withdrawal',       0).astype(int)
    loan_payments   = series_get(freq, 'loan_payment_out',     0).astype(int)
    cc_repayments   = series_get(freq, 'cc_repayment_out',     0).astype(int)
    installments    = series_get(freq, 'installment_payment_out', 0).astype(int)

    # Names of the three categories with the highest spend, per client.
    top3 = spend_by_cat.apply(lambda s: list(s.sort_values(ascending=False).head(3).index), axis=1)

    df_feat = pd.DataFrame({
        'client_code': idx,
        'spend_total_3m': spend_total.values,

        # travel breakdown
        'travel_core_spend_3m':  travel_core.reindex(idx, fill_value=0.0).values,   # 'Путешествия' category
        'taxi_spend_3m':         taxi_spend.reindex(idx,  fill_value=0.0).values,   # 'Такси' category
        'hotels_spend_3m':       hotels_spend.reindex(idx,fill_value=0.0).values,   # 'Отели' category
        'travel_spend_3m':       travel_sum.reindex(idx,  fill_value=0.0).values,   # sum of the three

        'premium_extra_spend_3m': premium_extra_spend.values,
        'online_spend_3m':        online_spend.values,

        # Transfer-based series are reindexed: clients without transfers get zeros.
        'fx_volume_3m':        fx_volume.reindex(idx,      fill_value=0.0).values,
        'fx_freq_3m':          fx_freq.reindex(idx,        fill_value=0).values,
        'atm_withdrawals_3m':  atm_withdrawals.reindex(idx,fill_value=0).values,
        'loan_payments_3m':    loan_payments.reindex(idx,  fill_value=0).values,
        'cc_repayments_3m':    cc_repayments.reindex(idx,  fill_value=0).values,
        'installments_3m':     installments.reindex(idx,   fill_value=0).values,

        'inflow_3m':  inflow.reindex(idx,  fill_value=0.0).values,
        'outflow_3m': outflow.reindex(idx, fill_value=0.0).values,

        'top3_cats': top3.values,
    }).merge(
        df_clients[['client_code','name','status','age','city','avg_monthly_balance_KZT']],
        on='client_code', how='left'
    )

    # per-month versions of the 3-month totals
    for col in [
        'spend_total_3m','travel_core_spend_3m','taxi_spend_3m','hotels_spend_3m','travel_spend_3m',
        'premium_extra_spend_3m','online_spend_3m','fx_volume_3m','inflow_3m','outflow_3m'
    ]:
        df_feat[col.replace('_3m','_m')] = df_feat[col] / 3.0

    # travel share (based on the three-category sum); a zero total is replaced by 1 to avoid division by zero
    df_feat['travel_share'] = df_feat['travel_spend_3m'] / df_feat['spend_total_3m'].replace(0,1)

    return df_feat

Overwriting /content/hackathon_case_hybrid/src/features.py


In [None]:
# Override benefit_travel in rules_scoring.py
import io, re, pathlib

path = pathlib.Path("/content/hackathon_case_hybrid/src/rules_scoring.py")
code = path.read_text(encoding="utf-8")

new_func = """
def benefit_travel(r):
    # Чистый вариант A: Путешествия + Такси + Отели
    spend_m = (
        r.get('travel_core_spend_3m', 0.0)
        + r.get('taxi_spend_3m', 0.0)
        + r.get('hotels_spend_3m', 0.0)
    ) / 3.0
    return TRAVEL_CB * spend_m
"""

# Match the `def` line plus every following indented or blank line, i.e. the
# whole function body. (Stopping at the first "return" substring would cut the
# function short as soon as a docstring or comment mentions that word.)
pattern = r"def benefit_travel\([^)]*\):\n(?:[ \t]+[^\n]*(?:\n|$)|[ \t]*\n)*"
# A callable replacement is inserted literally (no backslash/group processing).
replacement = new_func.lstrip("\n") + "\n"
code, n_replaced = re.subn(pattern, lambda _m: replacement, code, count=1)
if n_replaced == 0:
    # Do not claim success when the target function was not found.
    raise RuntimeError("benefit_travel not found in rules_scoring.py — nothing was patched")
path.write_text(code, encoding="utf-8")
print("benefit_travel обновлён на «чистый вариант A».")

benefit_travel обновлён на «чистый вариант A».


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
Сохранено: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
import re, pathlib

cfg = pathlib.Path("/content/hackathon_case_hybrid/src/config.py")
txt = cfg.read_text(encoding="utf-8")

def set_const(text, name, value):
    """Set ``name = value`` in the source text of a config module.

    Every line that starts with ``name =`` is replaced by ``name = value``;
    when no such line exists the assignment is appended at the end.

    Args:
        text: full source text of the config file.
        name: constant name (matched literally at the start of a line).
        value: right-hand side, inserted verbatim.

    Returns:
        The updated source text.
    """
    # [ \t]* instead of \s*: \s also matches newlines, which let
    # "NAME =\nOTHER = 1" swallow and overwrite the following line.
    pattern = rf"^{re.escape(name)}[ \t]*=[ \t]*.*$"
    repl    = f"{name} = {value}"
    # A callable replacement is used verbatim, so backslashes or "\1"-like
    # sequences inside `value` are not interpreted by the regex engine.
    new_text, n_replaced = re.subn(pattern, lambda _m: repl, text, flags=re.M)
    if n_replaced:
        return new_text
    return text + f"\n{repl}\n"

# <- TUNE these values for your data (this example makes the credit rule "softer")
txt = set_const(txt, "CREDIT_OUTFLOW_INFLOW_RATIO", "1.6")   # was 2.5
txt = set_const(txt, "CREDIT_NEARZERO_BAL_MAX",     "50_000") # was 15_000
txt = set_const(txt, "CREDIT_VERYLOW_BAL_MAX",      "100_000")# was 30_000
txt = set_const(txt, "CREDIT_LOAN_FREQ_MIN",        "3")      # was 6

# Persist the patched config back to disk.
cfg.write_text(txt, encoding="utf-8")
print("Пороговые параметры кредита обновлены.")

Пороговые параметры кредита обновлены.


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
Сохранено: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
import pandas as pd
df = pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv")
df['product'].value_counts()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
Кредит наличными,24
Кредитная карта,21
Премиальная карта,8
Депозит Сберегательный,7


In [None]:
import os
from getpass import getpass

# SECURITY: never hard-code API keys in a notebook — they end up in the saved
# file, in version control and in every shared copy. The key that used to be
# written in this cell must be treated as leaked and revoked in the provider
# dashboard. The key is now taken from the environment, or prompted for
# without echoing it, and is never stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_generator.py
# -*- coding: utf-8 -*-
"""
LLM только перефразирует заданный нами шаблон,
НИЧЕГО не выдумывает и не добавляет новых чисел/фактов.
"""

import os
from datetime import datetime
from typing import Dict, Any

# --- number and currency formatting ---
def fmt_int(x):
    """Format ``x`` as a rounded integer with spaces as thousands separators.

    Anything that cannot be converted to a number (including NaN) yields "0".
    """
    try:
        whole = int(round(float(x)))
    except Exception:
        return "0"
    return format(whole, ",").replace(",", " ")

def fmt_kzt(x):
    """Format ``x`` as a tenge amount: grouped integer, a space, then the "₸" sign."""
    return fmt_int(x) + " ₸"

# Russian month names in the prepositional case ("в январе"), January first.
RU_MONTHS = [
    "январе", "феврале", "марте", "апреле", "мае", "июне",
    "июле", "августе", "сентябре", "октябре", "ноябре", "декабре",
]
def month_ru_loc(dt=None):
    """Return the Russian prepositional-case month name for ``dt`` (default: now)."""
    if not dt:
        dt = datetime.now()
    return RU_MONTHS[dt.month - 1]

# --- strict per-product templates (only these are handed to the LLM) ---
def make_base_text(product: str, name: str, f: Dict[str, Any]) -> str:
    """Build the fixed template text of a push for ``product``.

    Args:
        product: product name; deposits are matched by the "Депозит" prefix,
            anything unknown gets a generic fallback text.
        name: client name used in the greeting.
        f: facts for the template (``taxi_cnt``, ``taxi_spend_m`` /
            ``travel_spend_m``, ``benefit_kzt``, ``has_restos``, ``top3``,
            ``fx_curr``); every key is optional.

    Returns:
        The template text with the facts substituted.
    """
    if product == "Карта для путешествий":
        m = month_ru_loc()  # only this template mentions the month
        taxi_cnt  = int(f.get("taxi_cnt", 0))
        taxi_sum  = fmt_kzt(f.get("taxi_spend_m", f.get("travel_spend_m", 0)))
        benefit   = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, в {m} вы сделали {taxi_cnt} поездок на такси на {taxi_sum}. "
                f"С картой для путешествий вернулась бы часть расходов —≈{benefit} кешбэком. Откройте карту в приложении.")
    elif product == "Премиальная карта":
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        has_restos = bool(f.get("has_restos", False))
        restos = " и траты в ресторанах" if has_restos else ""
        return (f"{name}, у вас стабильно высокий остаток{restos}. "
                f"Премиальная карта даёт повышенный кешбэк и бесплатные снятия. Выгода до {benefit}. Оформить сейчас.")
    elif product == "Кредитная карта":
        # list(...) accepts any sequence (or None); the padding keeps the
        # three index lookups safe when fewer than three categories are given.
        top3 = list(f.get("top3") or []) + ["","",""]
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, ваши топ-категории — {top3[0]}, {top3[1]}, {top3[2]}. "
                f"Кредитная карта даёт до 10% в любимых категориях и на онлайн-сервисы. Вернули бы ≈{benefit}. Оформить карту.")
    elif product == "Обмен валют":
        curr = f.get("fx_curr","USD")
        return (f"{name}, вы часто платите в {curr}. "
                f"В приложении выгодный обмен и авто-покупка по целевому курсу. Настроить обмен.")
    elif product.startswith("Депозит"):
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, у вас остаются свободные средства. "
                f"Разместите их на вкладе — удобно копить и получать вознаграждение. Доход ~{benefit} в месяц. Открыть вклад.")
    elif product == "Инвестиции":
        return (f"{name}, попробуйте инвестиции с низким порогом входа и без комиссий на старт. "
                f"Откройте счёт и начните с небольших сумм. Открыть счёт.")
    elif product == "Золотые слитки":
        return (f"{name}, для долгосрочной защиты капитала подойдут золотые слитки. "
                f"Узнайте условия и подберите вес слитка. Подключить золото.")
    elif product == "Кредит наличными":
        # This return was not indented under the elif, which made the whole
        # module fail to import with an IndentationError.
        return (f"{name}, если планируются крупные траты — "
                f"кредит наличными поможет и вернёт гибкость бюджету. "
                f"Узнайте доступный лимит.")
    else:
        # fallback for unknown products
        return (f"{name}, посмотрите новый продукт — он может подойти под ваши расходы. "
                f"Подробнее в приложении. Посмотреть.")

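# --- LLM tone polishing (must not add facts) ---
# NOTE(review): the prompt below contradicts itself and should be cleaned up:
#   * it asks to rephrase the text ("Перефразируйте") but also says to change
#     nothing and return it as is ("Ничего не менять в тексте. Вернуть его как есть.");
#   * it forbids a call to action, while every template in make_base_text ends with one;
#   * the lines after «если product "Кредит наличными"» contain stray quote
#     fragments (`"`, `.")`) that look like a pasted piece of Python code.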
SYSTEM_PROMPT = """
Вы — редактор пуш-уведомлений банка. Перефразируйте переданный текст без искажений фактов.
Требования:
- тон: на равных, просто, по-человечески; обращение на «вы»
- важное в начало, без канцеляризмов; 180–220 символов
- эмодзи 0–1 по делу; максимум один «!»
- нет call to action
- Ничего не менять в тексте. Вернуть его как есть.
- если product "Кредит наличными":
то писать , если планируются крупные траты — "
            "кредит наличными поможет и вернёт гибкость бюджету. "
            .")
- числа: пробелы как разделители разрядов; дробная часть — запятая; валюта — «₸» с пробелом: 2 490 ₸
- даты: дд.мм.гггг или «30 августа 2025» (если встречаются)
- никаких новых чисел/сумм/условий; НЕ добавлять «баланс», «лимит» и т. п., если этого нет в тексте
Верните один абзац без кавычек.
"""

def _llm_polish(text: str) -> str:
    # можно заменить на любой совместимый клиент LLM
    try:
        from openai import OpenAI
        client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text}
            ],
            temperature=0.4,
            max_tokens=140
        )
        return resp.choices[0].message.content.strip()
    except Exception:
        return text  # на любой сбой — без полировки

def generate_push(name: str, product: str, facts: Dict[str, Any]) -> str:
    """Build the template push for ``product`` and run it through the LLM polish step.

    A result longer than 230 characters is cut to 229 characters (trailing
    whitespace stripped) and ends with an ellipsis.
    """
    text = _llm_polish(make_base_text(product, name, facts))
    # Hard safety net in case the LLM produced an overly long text.
    if len(text) <= 230:
        return text
    return text[:229].rstrip() + "…"

Overwriting /content/hackathon_case_hybrid/src/push_generator.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
Сохранено: /content/hackathon_case_hybrid/output/recommendations.csv
