In [None]:
# Create the project layout: the src package folder, the two per-client input
# folders read by make_dataset.py, and the output folder written by pipeline.py.
%mkdir -p /content/hackathon_case_hybrid/src
%mkdir -p /content/hackathon_case_hybrid/data/transactions_per_client
%mkdir -p /content/hackathon_case_hybrid/data/transfers_per_client
%mkdir -p /content/hackathon_case_hybrid/output

In [None]:
# 1. –£—Å—Ç–∞–Ω–∞–≤–ª–∏–≤–∞–µ–º –±–∏–±–ª–∏–æ—Ç–µ–∫–∏
!pip -q install lightgbm xgboost scikit-learn pandas numpy

In [None]:
# 3. Make src an importable package (the modules below use relative imports)
!touch /content/hackathon_case_hybrid/src/__init__.py

In [None]:
%%writefile /content/hackathon_case_hybrid/src/make_dataset.py
# -*- coding: utf-8 -*-
import re, glob, os
import pandas as pd

def infer_client_code(path: str) -> int:
    """Derive the numeric client code from a file path.

    Only the file name is inspected (directories are ignored). A
    ``client_<digits>`` token wins; otherwise the first run of digits in the
    name is used. Returns -1 when the name contains no digits.
    """
    filename = os.path.basename(path)
    for pattern in (r'client_(\d+)', r'(\d+)'):
        found = re.search(pattern, filename)
        if found:
            return int(found.group(1))
    return -1

def concat_folder(folder: str, kind: str) -> pd.DataFrame:
    """Read every ``*.csv`` in ``folder`` (sorted by path) and stack them into one frame.

    Files lacking a ``client_code`` column get one filled from the file name via
    ``infer_client_code``. Returns an empty DataFrame when the folder has no CSV
    files. ``kind`` is only used to label the warning printed when any of the
    date / amount / currency / client_code columns is absent from the result.
    """
    frames = []
    for csv_path in sorted(glob.glob(os.path.join(folder, "*.csv"))):
        frame = pd.read_csv(csv_path)
        if 'client_code' not in frame.columns:
            frame['client_code'] = infer_client_code(csv_path)
        frames.append(frame)

    if len(frames) == 0:
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    need = {'date','amount','currency','client_code'}
    miss = need - set(combined.columns)
    if miss:
        # Warning only: the combined frame is still returned.
        print(f"[WARN] {kind}: –Ω–µ –Ω–∞–π–¥–µ–Ω—ã –∫–æ–ª–æ–Ω–∫–∏ {miss} ‚Äî —ç—Ç–æ –æ–∫, –µ—Å–ª–∏ –æ–Ω–∏ –Ω–µ —Ç—Ä–µ–±—É—é—Ç—Å—è")
    return combined

def build_all(data_dir="/content/hackathon_case_hybrid/data"):
    """Merge the per-client CSV folders under ``data_dir`` into two flat files.

    Reads ``transactions_per_client`` and ``transfers_per_client``, writes
    ``transactions.csv`` and ``transfers.csv`` into ``data_dir`` and prints the
    row count of each.
    """
    merged = {
        'transactions': concat_folder(os.path.join(data_dir, "transactions_per_client"), 'transactions'),
        'transfers': concat_folder(os.path.join(data_dir, "transfers_per_client"), 'transfers'),
    }
    # Both folders are read before anything is written.
    merged['transactions'].to_csv(os.path.join(data_dir, "transactions.csv"), index=False)
    merged['transfers'].to_csv(os.path.join(data_dir, "transfers.csv"), index=False)
    print(f"transactions.csv: {len(merged['transactions']):,} rows")
    print(f"transfers.csv:    {len(merged['transfers']):,} rows")

if __name__ == "__main__":
    build_all()

Overwriting /content/hackathon_case_hybrid/src/make_dataset.py


In [None]:
# Run the concatenation step (if needed)
!python /content/hackathon_case_hybrid/src/make_dataset.py

transactions.csv: 18,000 rows
transfers.csv:    18,000 rows


In [None]:
%%writefile /content/hackathon_case_hybrid/src/config.py
# -*- coding: utf-8 -*-

# ML
TRAIN_ML = True
USE_ML_IN_HYBRID = True
WEIGHT_RULES = 0.7
WEIGHT_ML = 0.3

# Need triggers (for the cash loan — the stricter set)
CREDIT_OUTFLOW_INFLOW_RATIO = 2.5    # outflow >= 2.5 x inflow
CREDIT_NEARZERO_BAL_MAX     = 15_000 # "almost zero" on the account
CREDIT_VERYLOW_BAL_MAX      = 30_000 # very small balance
CREDIT_LOAN_FREQ_MIN        = 6      # >= 6 loan payments in 3 months


# Cards
PREMIUM_CB_LIMIT = 100_000.0
PREMIUM_TIER_THRESHOLDS = [1_000_000, 6_000_000]
PREMIUM_TIER_CBs = [0.02, 0.03, 0.04]

TRAVEL_CB = 0.04
PREMIUM_EXTRA_CB_CATS = ['–Æ–≤–µ–ª–∏—Ä–Ω—ã–µ —É–∫—Ä–∞—à–µ–Ω–∏—è','–ö–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –ü–∞—Ä—Ñ—é–º–µ—Ä–∏—è','–ö–∞—Ñ–µ –∏ —Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã']
PREMIUM_EXTRA_CB = 0.04

CC_TOP3_CB = 0.10
CC_ONLINE_CB_CATS = ['–ï–¥–∏–º –¥–æ–º–∞','–°–º–æ—Ç—Ä–∏–º –¥–æ–º–∞','–ò–≥—Ä–∞–µ–º –¥–æ–º–∞']
CC_ONLINE_CB = 0.10

# Deposits (annual rates)
RATE_SAVINGS = 0.165
RATE_GOAL    = 0.155
RATE_MULTI   = 0.145

# Currency exchange (saving on the spread)
FX_SPREAD_SAVING = 0.004  # 0.4% * (average monthly FX volume)

# Need triggers (credit)
NEED_GAP_RATIO = 1.9
LOW_BALANCE_KZT = 100_000

TRAVEL_SHARE_MIN = 0.15
TRAVEL_TXN_MIN = 6

# Gold — utility thresholds
GOLD_T1_BAL = 100_000_000  # 100M KZT -> 5,000 KZT/month
GOLD_T2_BAL = 200_000_000  # 200M KZT -> 20,000 KZT/month
GOLD_T1_UTILITY = 5_000
GOLD_T2_UTILITY = 20_000

# Push linter
PUSH_MIN_LEN = 180
PUSH_MAX_LEN = 220
CTA_WORDS = ['–û—Ç–∫—Ä—ã—Ç—å','–ù–∞—Å—Ç—Ä–æ–∏—Ç—å','–ü–æ—Å–º–æ—Ç—Ä–µ—Ç—å','–û—Ñ–æ—Ä–º–∏—Ç—å','–ü–æ–¥–∫–ª—é—á–∏—Ç—å']

Overwriting /content/hackathon_case_hybrid/src/config.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/features.py
# -*- coding: utf-8 -*-
import pandas as pd
from .utils import safe_div

# Transaction category names that build_features sums into the travel,
# premium-extra and online spend features respectively.
TRAVEL_CATS = ['–ü—É—Ç–µ—à–µ—Å—Ç–≤–∏—è','–¢–∞–∫—Å–∏','–û—Ç–µ–ª–∏','–ü–æ–µ–∑–¥–∞','–°–∞–º–æ–ª—ë—Ç—ã']
PREMIUM_EXTRA = ['–Æ–≤–µ–ª–∏—Ä–Ω—ã–µ —É–∫—Ä–∞—à–µ–Ω–∏—è','–ö–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –ü–∞—Ä—Ñ—é–º–µ—Ä–∏—è','–ö–∞—Ñ–µ –∏ —Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã']
ONLINE_CATS = ['–ï–¥–∏–º –¥–æ–º–∞','–°–º–æ—Ç—Ä–∏–º –¥–æ–º–∞','–ò–≥—Ä–∞–µ–º –¥–æ–º–∞']

def series_get(df, col, default_val):
    """Return column ``col`` of ``df``; if it is absent, a constant Series of
    ``default_val`` sharing ``df``'s index."""
    if col in df.columns:
        return df[col]
    return pd.Series(default_val, index=df.index)

def build_features(df_clients, df_txn, df_trf):
    """Build one feature row per client from card transactions and transfers.

    Args:
        df_clients: client profiles; must contain client_code, name, status, age,
            city and avg_monthly_balance_KZT.
        df_txn: transactions with client_code, category and amount columns.
        df_trf: transfers with client_code, type and amount columns.

    Returns:
        DataFrame with one row per client that appears in ``df_txn``. It holds
        the ``*_3m`` totals and counts, ``*_m`` columns (the ``*_3m`` money
        totals divided by 3.0), ``top3_cats`` (up to three category names with
        positive spend, largest first), ``travel_share`` and the profile columns
        left-merged from ``df_clients``.
    """
    spend_by_cat = df_txn.groupby(['client_code','category'])['amount'].sum().unstack(fill_value=0.0)
    spend_total = spend_by_cat.sum(axis=1)

    # reindex() makes categories that never occur in the data count as zero spend.
    travel_spend = spend_by_cat.reindex(columns=TRAVEL_CATS, fill_value=0.0).sum(axis=1)
    premium_extra_spend = spend_by_cat.reindex(columns=PREMIUM_EXTRA, fill_value=0.0).sum(axis=1)
    online_spend = spend_by_cat.reindex(columns=ONLINE_CATS, fill_value=0.0).sum(axis=1)

    freq = df_trf.groupby(['client_code','type']).size().unstack(fill_value=0)
    amount_trf = df_trf.groupby(['client_code','type'])['amount'].sum().unstack(fill_value=0.0)

    # Only clients with at least one transaction get a feature row.
    idx = spend_by_cat.index

    inflow = sum(series_get(amount_trf, c, 0.0) for c in ['salary_in','stipend_in','family_in','refund_in','cashback_in','invest_in','deposit_fx_withdraw_in'])
    outflow = sum(series_get(amount_trf, c, 0.0) for c in ['p2p_out','card_out','atm_withdrawal','utilities_out','loan_payment_out','cc_repayment_out','installment_payment_out','invest_out','deposit_topup_out','gold_buy_out'])

    fx_buy_amt = series_get(amount_trf, 'fx_buy', 0.0).abs()
    fx_sell_amt = series_get(amount_trf, 'fx_sell', 0.0).abs()
    fx_volume = fx_buy_amt + fx_sell_amt

    fx_buy_freq = series_get(freq, 'fx_buy', 0).astype(int)
    fx_sell_freq = series_get(freq, 'fx_sell', 0).astype(int)
    fx_freq = fx_buy_freq + fx_sell_freq

    atm_withdrawals = series_get(freq, 'atm_withdrawal', 0).astype(int)
    loan_payments = series_get(freq, 'loan_payment_out', 0).astype(int)
    cc_repayments = series_get(freq, 'cc_repayment_out', 0).astype(int)
    installments = series_get(freq, 'installment_payment_out', 0).astype(int)

    # Top categories by spend. Categories the client never spent in (0 after
    # unstack) are excluded, so a client with one or two categories gets a
    # shorter list rather than arbitrary zero-spend names. A plain list is built
    # (not DataFrame.apply) so an empty input still yields an empty column.
    top3 = pd.Series(
        [list(s[s > 0].sort_values(ascending=False).head(3).index) for _, s in spend_by_cat.iterrows()],
        index=idx, dtype=object,
    )

    df_feat = pd.DataFrame({
        'client_code': idx,
        'spend_total_3m': spend_total.values,
        'travel_spend_3m': travel_spend.values,
        'premium_extra_spend_3m': premium_extra_spend.values,
        'online_spend_3m': online_spend.values,
        # Transfer-based series are aligned to the transaction index; clients
        # without transfers of a given type get 0.
        'fx_volume_3m': fx_volume.reindex(idx, fill_value=0.0).values,
        'fx_freq_3m': fx_freq.reindex(idx, fill_value=0).values,
        'atm_withdrawals_3m': atm_withdrawals.reindex(idx, fill_value=0).values,
        'loan_payments_3m': loan_payments.reindex(idx, fill_value=0).values,
        'cc_repayments_3m': cc_repayments.reindex(idx, fill_value=0).values,
        'installments_3m': installments.reindex(idx, fill_value=0).values,
        'inflow_3m': inflow.reindex(idx, fill_value=0.0).values,
        'outflow_3m': outflow.reindex(idx, fill_value=0.0).values,
        'top3_cats': top3.values,
    }).merge(df_clients[['client_code','name','status','age','city','avg_monthly_balance_KZT']], on='client_code', how='left')

    for col in ['spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m','fx_volume_3m','inflow_3m','outflow_3m']:
        df_feat[col.replace('_3m','_m')] = df_feat[col] / 3.0

    # replace(0, 1) avoids division by zero; the share is then 0 for zero total spend.
    df_feat['travel_share'] = df_feat['travel_spend_3m'] / df_feat['spend_total_3m'].replace(0,1)
    return df_feat

Overwriting /content/hackathon_case_hybrid/src/features.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/utils.py
# -*- coding: utf-8 -*-
import re
from datetime import datetime

def parse_date(s):
    """Parse an ISO-format string with ``datetime.fromisoformat``.

    Returns the parsed datetime, or None when parsing raises (malformed text or
    a non-string argument).
    """
    try:
        parsed = datetime.fromisoformat(s)
    except Exception:
        parsed = None
    return parsed

def month_name_ru(dt):
    """Return the Russian month name for ``dt`` from a fixed January..December table.

    When ``dt`` is None the current local time is used.
    """
    moment = datetime.now() if dt is None else dt
    names_by_month = ('—è–Ω–≤–∞—Ä–µ','—Ñ–µ–≤—Ä–∞–ª–µ','–º–∞—Ä—Ç–µ','–∞–ø—Ä–µ–ª–µ','–º–∞–µ','–∏—é–Ω–µ','–∏—é–ª–µ','–∞–≤–≥—É—Å—Ç–µ','—Å–µ–Ω—Ç—è–±—Ä–µ','–æ–∫—Ç—è–±—Ä–µ','–Ω–æ—è–±—Ä–µ','–¥–µ–∫–∞–±—Ä–µ')
    return names_by_month[moment.month - 1]

def format_currency_kzt(x):
    """Format an amount with space-separated thousands and the currency suffix.

    Whole amounts are printed without decimals; fractional ones with two
    decimals and a comma as the decimal mark.

    Returns:
        '' for None and for non-finite values (NaN / +-inf), ``str(x)`` when
        ``x`` cannot be converted to a float, otherwise the formatted amount.
    """
    if x is None: return ''
    try:
        val = float(x)
    except (TypeError, ValueError, OverflowError):
        # Not a number: pass the value through as text instead of failing.
        return str(x)
    # NaN / inf cannot go through int() below (ValueError / OverflowError), and
    # there is no amount to show for them, so treat them like a missing value.
    if val != val or abs(val) == float('inf'):
        return ''
    if abs(val - int(val)) < 1e-9:
        s = f"{int(val):,}".replace(',', ' ')
        return f"{s} ‚Ç∏"
    else:
        s = f"{val:,.2f}".replace(',', ' ')
        s = s.replace('.', ',')
        return f"{s} ‚Ç∏"

def clamp(x, lo, hi):
    """Limit ``x`` to the interval [lo, hi]; the upper bound is applied first,
    so ``lo`` is returned whenever ``lo`` exceeds ``hi``."""
    capped = min(hi, x)
    return max(lo, capped)

def safe_div(a, b, default=0.0):
    """Divide ``a`` by ``b``, returning ``default`` instead of failing.

    ``default`` is returned when ``b`` is falsy (0, None, ...) or when the
    truth test or the division raises, e.g. for operands of incompatible types.
    """
    try:
        return a / b if b else default
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt / SystemExit still propagate.
        return default

def only_one_exclamation(text):
    """Check the push-text style limits: at most one '!' and at most one
    fully upper-case word.

    Words are runs of three or more word characters; purely numeric tokens are
    not counted as upper-case words.
    """
    if text.count('!') > 1:
        return False
    shouting = 0
    for token in re.findall(r"\b[\w–Å–ê-–Ø]{3,}\b", text):
        if token.isdigit():
            continue
        if token.upper() == token:
            shouting += 1
    return shouting <= 1

Overwriting /content/hackathon_case_hybrid/src/utils.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/rules_scoring.py
# -*- coding: utf-8 -*-
import pandas as pd
from .config import (
    PREMIUM_CB_LIMIT, PREMIUM_TIER_THRESHOLDS, PREMIUM_TIER_CBs,
    TRAVEL_CB, PREMIUM_EXTRA_CB_CATS, PREMIUM_EXTRA_CB,
    CC_TOP3_CB, CC_ONLINE_CB_CATS, CC_ONLINE_CB,
    RATE_SAVINGS, RATE_GOAL, RATE_MULTI, FX_SPREAD_SAVING,
    NEED_GAP_RATIO, LOW_BALANCE_KZT, TRAVEL_SHARE_MIN,
    GOLD_T1_BAL, GOLD_T2_BAL, GOLD_T1_UTILITY, GOLD_T2_UTILITY,
    CREDIT_OUTFLOW_INFLOW_RATIO, CREDIT_NEARZERO_BAL_MAX,
    CREDIT_VERYLOW_BAL_MAX, CREDIT_LOAN_FREQ_MIN )

# Monetary ranking — the cash loan is excluded (it is handled as a separate flag)
PRODUCTS = [
    '–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π',
    '–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞',
    '–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞',
    '–û–±–º–µ–Ω –≤–∞–ª—é—Ç',
    '–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π',
    '–î–µ–ø–æ–∑–∏—Ç –ù–∞–∫–æ–ø–∏—Ç–µ–ª—å–Ω—ã–π',
    '–î–µ–ø–æ–∑–∏—Ç –ú—É–ª—å—Ç–∏–≤–∞–ª—é—Ç–Ω—ã–π',
    '–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏',
    '–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏'
]

def tier_cashback(avg_balance):
    """Pick the premium-card cashback rate for an average balance.

    A missing (NaN) balance, or one below the first threshold, gets the lowest
    rate; below the second threshold the middle rate; otherwise the top rate.
    """
    if pd.isna(avg_balance) or avg_balance < PREMIUM_TIER_THRESHOLDS[0]:
        tier = 0
    elif avg_balance < PREMIUM_TIER_THRESHOLDS[1]:
        tier = 1
    else:
        tier = 2
    return PREMIUM_TIER_CBs[tier]

def benefit_travel(r):
    """Estimated monthly travel-card cashback: TRAVEL_CB applied to one third of
    ``travel_spend_3m``. No cap is applied (a separate limit could be added)."""
    monthly_travel_spend = r.get('travel_spend_3m', 0.0) / 3.0
    cashback = TRAVEL_CB * monthly_travel_spend
    return cashback

def benefit_premium(r):
    """Estimated monthly benefit of the premium card.

    Sum of three parts: tiered cashback on monthly spend (capped at
    PREMIUM_CB_LIMIT), extra cashback on the premium-extra categories, and an
    example estimate of saved ATM fees (1% of monthly withdrawal count times
    30,000, with that volume capped at 3,000,000).
    """
    monthly_spend = r.get('spend_total_3m', 0.0)/3.0
    tier_rate = tier_cashback(r.get('avg_monthly_balance_KZT', 0.0))
    capped_base_cb = min(PREMIUM_CB_LIMIT, tier_rate * monthly_spend)

    monthly_extra_spend = r.get('premium_extra_spend_3m', 0.0)/3.0
    extra_cb = PREMIUM_EXTRA_CB * monthly_extra_spend

    monthly_atm_count = r.get('atm_withdrawals_3m', 0) / 3.0
    assumed_withdrawn = min(3_000_000.0, monthly_atm_count * 30_000.0)
    saved_fees = 0.01 * assumed_withdrawn

    return capped_base_cb + extra_cb + saved_fees

def benefit_cc(r):
    """Estimated monthly credit-card cashback.

    The top-3 categories are assumed to make up 60% of monthly spend and earn
    CC_TOP3_CB; online-category spend additionally earns CC_ONLINE_CB.
    """
    monthly_total = r.get('spend_total_3m', 0.0)/3.0
    assumed_top3_spend = 0.6 * monthly_total
    monthly_online = r.get('online_spend_3m', 0.0)/3.0
    top3_part = CC_TOP3_CB * assumed_top3_spend
    online_part = CC_ONLINE_CB * monthly_online
    return top3_part + online_part

def benefit_fx(r):
    """Estimated monthly saving on the FX spread: FX_SPREAD_SAVING applied to
    one third of ``fx_volume_3m``."""
    monthly_fx_volume = r.get('fx_volume_3m', 0.0) / 3.0
    saving = FX_SPREAD_SAVING * monthly_fx_volume
    return saving

# ---------- CASH LOAN: a separate flag, NOT a money amount ----------
def utility_credit_cash(r):
    """Return 1.0 when the feature row signals a need for a cash loan, else 0.0.

    The flag requires outflow of at least CREDIT_OUTFLOW_INFLOW_RATIO times
    inflow (inflow floored at 1.0) AND either a near-zero average balance, or a
    very low balance combined with at least CREDIT_LOAN_FREQ_MIN loan payments.

    Args:
        r: mapping-like row (dict or pandas Series) with inflow_3m, outflow_3m,
            avg_monthly_balance_KZT and loan_payments_3m; absent keys count as 0.
    """
    infl = float(r.get('inflow_3m', 0.0))
    out  = float(r.get('outflow_3m', 0.0))
    avgb = float(r.get('avg_monthly_balance_KZT', 0.0))
    # int(NaN) raises ValueError, so a missing loan count is treated as zero.
    loan_raw = r.get('loan_payments_3m', 0)
    loan_freq = 0 if pd.isna(loan_raw) else int(loan_raw)

    big_gap = out >= CREDIT_OUTFLOW_INFLOW_RATIO * max(infl, 1.0)
    # A NaN balance compares False on both lines, so an unknown balance never raises the flag.
    near_zero_bal = avgb <= CREDIT_NEARZERO_BAL_MAX
    very_low_bal  = avgb <= CREDIT_VERYLOW_BAL_MAX

    return 1.0 if (big_gap and (near_zero_bal or (very_low_bal and loan_freq >= CREDIT_LOAN_FREQ_MIN))) else 0.0


def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Compute the cash-loan need flag for every client.

    hybrid_selector imports this function and merges its result on
    ``client_code``; the pipeline then reads the ``need_credit`` column.

    Args:
        df_feat: feature frame with a ``client_code`` column plus the columns
            read by ``utility_credit_cash``.

    Returns:
        DataFrame with columns ``client_code`` and ``need_credit`` (bool), one
        row per row of ``df_feat`` in the same order.
    """
    flags = [bool(utility_credit_cash(row)) for _, row in df_feat.iterrows()]
    return pd.DataFrame({
        'client_code': df_feat['client_code'].values,
        'need_credit': flags,
    })
# ---------------------------------------------------------------

def benefit_deposits(r):
    """Estimated monthly interest for the three deposit products.

    Args:
        r: mapping-like row with ``avg_monthly_balance_KZT``.

    Returns:
        Tuple ``(savings, goal, multi)``: one month of interest on the average
        balance at RATE_SAVINGS, RATE_GOAL and RATE_MULTI; the multi-currency
        figure assumes only 40% of the balance is placed.
    """
    bal = r.get('avg_monthly_balance_KZT', 0.0)
    # A missing balance (NaN, e.g. after a left merge with no match) would turn
    # all three scores into NaN; treat it as zero, as tier_cashback treats NaN
    # as the lowest tier.
    if pd.isna(bal):
        bal = 0.0
    savings = (RATE_SAVINGS/12.0) * bal
    goal    = (RATE_GOAL/12.0)    * bal
    multi   = (RATE_MULTI/12.0)   * bal * 0.4  # notionally only part of the balance
    return savings, goal, multi

# Investments — 0 (per the task spec, no return is promised)
def benefit_investments(r):
    """Return a constant 0.0 score for the investments product; ``r`` is not read."""
    return 0.0

# Gold — utility by balance thresholds
def benefit_gold(r):
    """Return the utility score for gold bars based on the average balance.

    The higher tier is checked first: GOLD_T2_UTILITY at or above GOLD_T2_BAL,
    GOLD_T1_UTILITY at or above GOLD_T1_BAL, otherwise 0.0.
    """
    bal = r.get('avg_monthly_balance_KZT', 0.0)
    tiers = ((GOLD_T2_BAL, GOLD_T2_UTILITY), (GOLD_T1_BAL, GOLD_T1_UTILITY))
    for threshold, utility in tiers:
        if bal >= threshold:
            return float(utility)
    return 0.0

def score_all_products(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Score every product for every client with the rule-based benefit functions.

    The travel-card score is multiplied by 1.1 when the client's travel_share
    reaches TRAVEL_SHARE_MIN.

    Returns:
        Long-format DataFrame with columns client_code, product and rules_score:
        one row per (client, product), products in a fixed order per client.
    """
    records = []
    for _, client in df_feat.iterrows():
        travel = benefit_travel(client)
        premium = benefit_premium(client)
        credit_card = benefit_cc(client)
        fx = benefit_fx(client)
        dep_savings, dep_goal, dep_multi = benefit_deposits(client)
        investments = benefit_investments(client)
        gold = benefit_gold(client)

        if client.get('travel_share', 0.0) >= TRAVEL_SHARE_MIN:
            travel = travel * 1.1

        per_product = {
            '–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π': travel,
            '–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞': premium,
            '–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞': credit_card,
            '–û–±–º–µ–Ω –≤–∞–ª—é—Ç': fx,
            '–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π': dep_savings,
            '–î–µ–ø–æ–∑–∏—Ç –ù–∞–∫–æ–ø–∏—Ç–µ–ª—å–Ω—ã–π': dep_goal,
            '–î–µ–ø–æ–∑–∏—Ç –ú—É–ª—å—Ç–∏–≤–∞–ª—é—Ç–Ω—ã–π': dep_multi,
            '–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏': investments,
            '–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏': gold,
        }
        records.extend(
            {'client_code': client['client_code'], 'product': name, 'rules_score': float(value)}
            for name, value in per_product.items()
        )
    return pd.DataFrame(records)

Overwriting /content/hackathon_case_hybrid/src/rules_scoring.py


In [None]:
import os
from getpass import getpass

# SECURITY: never store an API key in the notebook. The key that used to be
# hard-coded here has been exposed and must be revoked and rotated.
# The key is taken from the environment if already set; otherwise it is
# requested interactively (the input is not echoed and not saved with the notebook).
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")

In [None]:
%%writefile /content/hackathon_case_hybrid/src/ml_classifier.py
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import Dict, List

# Columns of the feature frame used as model inputs, both for training and for
# prediction (selected in this order).
FEATURE_COLS = [
    'spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m',
    'fx_volume_3m','fx_freq_3m','atm_withdrawals_3m','loan_payments_3m',
    'cc_repayments_3m','installments_3m','inflow_3m','outflow_3m',
    'avg_monthly_balance_KZT','travel_share'
]

@dataclass
class MLModel:
    """Result of train_ml_lightgbm: the trained booster together with its label mapping."""
    label_encoder: Dict[str, int]  # class label -> integer id used as the training target
    booster: lgb.Booster           # booster returned by lgb.train
    classes_: List[str]            # class labels; list position equals the integer id

def synth_labels_from_rules(df_scores: pd.DataFrame) -> pd.DataFrame:
    """Derive a training label per client: the product with the highest rules_score.

    Returns:
        DataFrame with columns client_code and label, one row per client,
        ordered by client_code.
    """
    ranked = df_scores.sort_values(by=['client_code','rules_score'], ascending=[True, False])
    best_per_client = ranked.groupby('client_code').head(1)
    labels = best_per_client.loc[:, ['client_code','product']]
    return labels.rename(columns={'product':'label'})

def train_ml_lightgbm(df_feat: pd.DataFrame, df_scores: pd.DataFrame) -> MLModel:
    """Train a LightGBM multiclass model on labels synthesised from the rule scores.

    Args:
        df_feat: feature frame containing client_code and FEATURE_COLS.
        df_scores: long-format rule scores (client_code, product, rules_score).

    Returns:
        MLModel with the booster, the label -> id mapping and the class list.

    Raises:
        RuntimeError: fewer than two classes remain after dropping rare ones.
    """
    training = df_feat.merge(synth_labels_from_rules(df_scores), on='client_code', how='inner')

    # 1) drop rare classes (frequency < 2), otherwise the stratified split fails
    label_counts = training['label'].value_counts()
    keep_labels = label_counts[label_counts >= 2].index
    dropped = set(label_counts.index) - set(keep_labels)
    if dropped:
        print("[INFO] dropped rare classes from ML training:", {k:int(label_counts[k]) for k in dropped})
    training = training[training['label'].isin(keep_labels)].copy()

    # 2) check that at least two classes are left
    classes_left = sorted(training['label'].unique())
    if len(classes_left) < 2:
        raise RuntimeError(f"Too few classes after filtering: {classes_left}")

    features = training[FEATURE_COLS].fillna(0.0).astype(float)
    enc = {name: code for code, name in enumerate(classes_left)}
    targets = training['label'].astype(str).map(enc)

    Xtr, Xva, ytr, yva = train_test_split(
        features, targets, test_size=0.2, random_state=42, stratify=targets
    )

    train_set = lgb.Dataset(Xtr, label=ytr)
    valid_set = lgb.Dataset(Xva, label=yva)
    params = {
        'objective': 'multiclass',
        'num_class': len(classes_left),
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'seed': 42,
    }
    # Early stopping watches the validation split; progress is logged every 50 rounds.
    booster = lgb.train(
        params, train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)]
    )
    return MLModel(label_encoder=enc, booster=booster, classes_=classes_left)

def predict_ml_proba(model: MLModel, df_feat: pd.DataFrame, products: List[str]) -> pd.DataFrame:
    """Predict per-product probabilities for every row of ``df_feat``.

    Products the model was not trained on get a probability of 0.0.

    Returns:
        DataFrame with client_code followed by one column per entry of
        ``products`` (in that order), rows in ``df_feat`` order.
    """
    features = df_feat[FEATURE_COLS].fillna(0.0).astype(float)
    raw = model.booster.predict(features, num_iteration=model.booster.best_iteration)
    df_proba = pd.DataFrame(raw, columns=model.classes_)
    df_proba.insert(0, 'client_code', df_feat['client_code'].values)
    untrained = [p for p in products if p not in df_proba.columns]
    for p in untrained:
        df_proba[p] = 0.0
    return df_proba[['client_code'] + products]

Overwriting /content/hackathon_case_hybrid/src/ml_classifier.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/hybrid_selector.py
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
from .ml_classifier import train_ml_lightgbm, predict_ml_proba
from .config import TRAIN_ML, USE_ML_IN_HYBRID, WEIGHT_RULES, WEIGHT_ML

# Relative gap (as a fraction of the top rules_score) below which the two
# best-scoring products of a client are treated as tied in rank_and_select.
TIE_DELTA = 0.05

def softmax_log1p(x: np.ndarray) -> np.ndarray:
    """Turn scores into weights that sum to 1.

    Negative scores are clipped to 0, compressed with log1p, and passed through
    a softmax (shifted by the maximum before exponentiation).
    """
    logits = np.log1p(np.maximum(x, 0.0))
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()

def rank_and_select(df_feat: pd.DataFrame):
    """Score all products per client, optionally blend in ML probabilities, and pick a winner.

    Args:
        df_feat: feature frame with a client_code column and the columns read by
            the rule and ML scorers.

    Returns:
        Tuple ``(best, top4, df)``:
        best - one row per client with client_code, product, hybrid_score, plus
            whatever columns need_credit_flags returns (left-merged on client_code);
        top4 - up to four highest-hybrid_score rows per client;
        df   - the full long-format frame with rules_score, ml_proba and hybrid_score.
    """
    df_rules = score_all_products(df_feat)
    # Presumably a frame with client_code and need_credit columns — verify against rules_scoring.
    df_need = need_credit_flags(df_feat)

    use_ml = False
    if TRAIN_ML:
        try:
            model = train_ml_lightgbm(df_feat, df_rules)
            df_proba = predict_ml_proba(model, df_feat, PRODUCTS)
            use_ml = True
        except Exception as e:
            # Any training/prediction failure falls back to rules only (all probabilities 0).
            print("[WARN] ML disabled:", e)
            df_proba = df_feat[['client_code']].copy()
            for p in PRODUCTS: df_proba[p] = 0.0
    else:
        df_proba = df_feat[['client_code']].copy()
        for p in PRODUCTS: df_proba[p] = 0.0

    df = df_rules.merge(df_proba, on='client_code', how='left')
    # For each (client, product) row pick the probability from the column named after that product.
    df['ml_proba'] = df.apply(lambda r: r.get(r['product'], 0.0), axis=1)

    # The ML probability is rescaled by the client's best rules_score so both terms share a scale.
    max_rules = df.groupby('client_code')['rules_score'].transform('max')
    if USE_ML_IN_HYBRID and use_ml:
        df['hybrid_score'] = WEIGHT_RULES * df['rules_score'] + WEIGHT_ML * (df['ml_proba'] * max_rules)
    else:
        df['hybrid_score'] = df['rules_score']

    winners, top4_rows = [], []
    for cid, grp in df.groupby('client_code', sort=False):
        grp = grp.sort_values('rules_score', ascending=False)
        top1, top2 = grp.iloc[0], (grp.iloc[1] if len(grp) > 1 else None)
        max_r = top1['rules_score']
        # Tie: the two best rule scores differ by less than TIE_DELTA of the best one (or are equal).
        tie = top2 is not None and (abs(top1['rules_score'] - top2['rules_score']) < (TIE_DELTA*max_r) or top1['rules_score']==top2['rules_score'])
        if tie and use_ml:
            # Tie-break on a blend of softmax-normalised rule scores and ML probabilities.
            # NOTE(review): here hybrid_score becomes a blended probability, whereas in the
            # else-branch it stays on the rules_score scale — consumers see two different scales.
            r_scores = grp['rules_score'].values
            p_rules = softmax_log1p(r_scores)
            ml_vec = grp['ml_proba'].values
            p_hybrid = WEIGHT_RULES*p_rules + WEIGHT_ML*ml_vec
            k = int(np.argmax(p_hybrid))
            winner_row = grp.iloc[k].copy()
            winner_row['hybrid_score'] = p_hybrid[k]
            grp['hybrid_score'] = p_hybrid
        else:
            # NOTE(review): numpy argmax returns the position of a NaN if one is present.
            k = grp['hybrid_score'].values.argmax()
            winner_row = grp.iloc[k]
        winners.append({'client_code': cid,'product':winner_row['product'],'hybrid_score':winner_row['hybrid_score']})
        top4_rows.extend(grp.sort_values('hybrid_score',ascending=False).head(4).to_dict('records'))

    best = pd.DataFrame(winners).merge(df_need, on='client_code', how='left')
    top4 = pd.DataFrame(top4_rows)
    return best, top4, df

Overwriting /content/hackathon_case_hybrid/src/hybrid_selector.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_nlp.py
# -*- coding: utf-8 -*-
from datetime import datetime
from .utils import month_name_ru, format_currency_kzt, only_one_exclamation
from .config import CTA_WORDS, PUSH_MIN_LEN, PUSH_MAX_LEN

def template_travel(name, month, taxi_cnt, travel_spend_m, benefit):
    """Build the travel-card push text from the client name, month name, taxi trip
    count, spend amount and estimated cashback (amounts go through format_currency_kzt)."""
    return f"{name}, –≤ {month} —É –≤–∞—Å {taxi_cnt} –ø–æ–µ–∑–¥–æ–∫ –Ω–∞ —Ç–∞–∫—Å–∏ –Ω–∞ {format_currency_kzt(travel_spend_m)}. –° –∫–∞—Ä—Ç–æ–π –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π –≤–µ—Ä–Ω—É–ª–∏ –±—ã ‚âà{format_currency_kzt(benefit)} –∫–µ—à–±—ç–∫–æ–º. –û—Ç–∫—Ä–æ–π—Ç–µ –∫–∞—Ä—Ç—É."

def template_premium(name, has_restos, benefit):
    """Build the premium-card push text; the restaurant-spend clause is included
    only when ``has_restos`` is truthy, and ``benefit`` is formatted as a currency amount."""
    return f"{name}, —É –≤–∞—Å —Å—Ç–∞–±–∏–ª—å–Ω—ã–π –∫—Ä—É–ø–Ω—ã–π –æ—Å—Ç–∞—Ç–æ–∫{', —Ç—Ä–∞—Ç—ã –≤ —Ä–µ—Å—Ç–æ—Ä–∞–Ω–∞—Ö' if has_restos else ''}. –ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—Å—Ç –∫–µ—à–±—ç–∫ –∏ –±–µ—Å–ø–ª–∞—Ç–Ω—ã–µ —Å–Ω—è—Ç–∏—è. –í—ã–≥–æ–¥–∞ –¥–æ {format_currency_kzt(benefit)}. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."

def template_cc(name, cat1, cat2, cat3, benefit):
    """Build the credit-card push text listing three category names and the
    estimated cashback formatted as a currency amount."""
    return f"{name}, –≤–∞—à–∏ —Ç–æ–ø-–∫–∞—Ç–µ–≥–æ—Ä–∏–∏ ‚Äî {cat1}, {cat2}, {cat3}. –ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—ë—Ç –¥–æ 10% –∫–µ—à–±—ç–∫–∞ –∏ –æ–Ω–ª–∞–π–Ω-–±–æ–Ω—É—Å—ã. –í–µ—Ä–Ω—É–ª–∏ –±—ã {format_currency_kzt(benefit)}. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."

def template_fx(name, curr):
    """Build the currency-exchange push text for the client name and currency code ``curr``."""
    return f"{name}, –≤—ã —á–∞—Å—Ç–æ –ø–æ–ª—å–∑—É–µ—Ç–µ—Å—å –≤–∞–ª—é—Ç–æ–π. –û–±–º–µ–Ω {curr} –ø–æ –≤—ã–≥–æ–¥–Ω–æ–º—É –∫—É—Ä—Å—É –±–µ–∑ –∫–æ–º–∏—Å—Å–∏–∏, –º–æ–º–µ–Ω—Ç–∞–ª—å–Ω–æ. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –æ–±–º–µ–Ω –≤–∞–ª—é—Ç."

def template_deposit(name, benefit):
    """Build the deposit push text quoting ``benefit`` (formatted as a currency
    amount) as monthly income."""
    return f"{name}, –Ω–∞ –¥–µ–ø–æ–∑–∏—Ç–µ –¥–æ—Ö–æ–¥ {format_currency_kzt(benefit)} –≤ –º–µ—Å—è—Ü. –°–æ—Ö—Ä–∞–Ω–∏—Ç–µ –∏ –ø—Ä–∏—É–º–Ω–æ–∂—å—Ç–µ —Å—Ä–µ–¥—Å—Ç–≤–∞. –û—Ç–∫—Ä–æ–π—Ç–µ –¥–µ–ø–æ–∑–∏—Ç."

def template_invest(name):
    """Build the investments push text; it contains the client name only and no figures."""
    return f"{name}, —É –≤–∞—Å –µ—Å—Ç—å –∑–∞–ø–∞—Å –Ω–∞ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏. –î–∏–≤–µ—Ä—Å–∏—Ñ–∏—Ü–∏—Ä—É–π—Ç–µ –ø–æ—Ä—Ç—Ñ–µ–ª—å —Å —Ñ–æ–Ω–¥–∞–º–∏ –∏ –∞–∫—Ü–∏—è–º–∏. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏."

def template_cash_loan(name):
    """Build the cash-loan banner text; the pipeline uses it as a separate
    credit banner rather than as the main push."""
    return f"{name}, —É –≤–∞—Å —Ä–∞—Å—Ö–æ–¥—ã –≤—ã—à–µ –¥–æ—Ö–æ–¥–æ–≤. –ù–∞–ª–∏—á–Ω—ã–π –∫—Ä–µ–¥–∏—Ç –ø–æ–º–æ–∂–µ—Ç –∑–∞–∫—Ä—ã—Ç—å —Ä–∞–∑—Ä—ã–≤ –∏ –ø–ª–∞–Ω–∏—Ä–æ–≤–∞—Ç—å –±—é–¥–∂–µ—Ç. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫—Ä–µ–¥–∏—Ç."

def lint_push(text: str) -> dict:
    """Check a push text against the style rules.

    The text passes when its length lies within [PUSH_MIN_LEN, PUSH_MAX_LEN],
    ``only_one_exclamation`` accepts it, and (after stripping whitespace) it
    ends with one of CTA_WORDS.

    Returns:
        ``{"ok": bool, "len": int}``.
    """
    length = len(text)
    checks = (
        PUSH_MIN_LEN <= length <= PUSH_MAX_LEN,
        only_one_exclamation(text),
        any(text.strip().endswith(w) for w in CTA_WORDS),
    )
    return {"ok": all(checks), "len": length}

def generate_push_text(product: str, facts: dict) -> str:
    """Render the push notification text for ``product`` from the ``facts`` dict.

    Keys read from ``facts`` (each has a fallback default): name, taxi_cnt,
    travel_spend_m, benefit_kzt, has_restos, top3, fx_curr. Unknown products
    fall back to the investments template.

    The result is linted; when the lint fails, a text shorter than PUSH_MIN_LEN
    gets one extra sentence appended and a text longer than PUSH_MAX_LEN is cut
    to PUSH_MAX_LEN characters ending in an ellipsis.

    NOTE(review): the month is always taken from datetime.now(); a "month_ru"
    entry in ``facts`` is not read here.
    NOTE(review): every template ends with a period while CTA_WORDS entries do
    not, so the CTA check in lint_push cannot pass for these texts and the
    fix-up branch always runs.
    """
    name = facts.get("name","–ö–ª–∏–µ–Ω—Ç")
    if product=="–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π":
        text = template_travel(name, month_name_ru(datetime.now()), facts.get("taxi_cnt",5), facts.get("travel_spend_m",30000), facts.get("benefit_kzt",1200))
    elif product=="–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞":
        text = template_premium(name, facts.get("has_restos",True), facts.get("benefit_kzt",2000))
    elif product=="–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞":
        # Pad with empty strings so fewer than three categories never raise IndexError.
        cats = facts.get("top3",["—Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã","—Ç–∞–∫—Å–∏","–ø—Ä–æ–¥—É–∫—Ç—ã"]) + ["","",""]
        text = template_cc(name, cats[0], cats[1], cats[2], facts.get("benefit_kzt",2500))
    elif product=="–û–±–º–µ–Ω –≤–∞–ª—é—Ç":
        text = template_fx(name, facts.get("fx_curr","USD"))
    elif product.startswith("–î–µ–ø–æ–∑–∏—Ç"):
        text = template_deposit(name, facts.get("benefit_kzt",1800))
    elif product=="–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏":
        text = template_invest(name)  # no figures, per the task spec
    elif product=="–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏":
        # The gold text is a fixed string and does not include the client name.
        text = "–î–æ–ª–≥–æ—Å—Ä–æ—á–Ω–∞—è –∑–∞—â–∏—Ç–∞ –∏ –¥–∏–≤–µ—Ä—Å–∏—Ñ–∏–∫–∞—Ü–∏—è –∫–∞–ø–∏—Ç–∞–ª–∞ –∑–∞ —Å—á—ë—Ç –∑–æ–ª–æ—Ç—ã—Ö —Å–ª–∏—Ç–∫–æ–≤. –£–∑–Ω–∞–π—Ç–µ –¥–µ—Ç–∞–ª–∏ –∏ –ø–æ–¥–≤–µ—Ä—Å—Ç–∞–π—Ç–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—é. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –∑–æ–ª–æ—Ç–æ."
    else:
        text = template_invest(name)
    check = lint_push(text)
    if not check["ok"]:
        # NOTE(review): the appended sentence appears to be a card call-to-action and is
        # added for every product type, including non-card ones — confirm this is intended.
        if len(text)<PUSH_MIN_LEN: text=text+" –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."
        if len(text)>PUSH_MAX_LEN: text=text[:PUSH_MAX_LEN-1]+"‚Ä¶"
    return text

Overwriting /content/hackathon_case_hybrid/src/push_nlp.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/pipeline.py
# -*- coding: utf-8 -*-
import pandas as pd
from datetime import datetime
from .features import build_features
from .hybrid_selector import rank_and_select
from .push_nlp import generate_push_text, template_cash_loan

def run_pipeline(base_dir="/content/hackathon_case_hybrid"):
    """Run the full recommendation pipeline and write ``recommendations.csv``.

    Reads clients.csv, transactions.csv and transfers.csv from
    ``<base_dir>/data``, builds features, ranks products, renders one push text
    per client (plus a cash-loan banner when the need flag is set) and writes
    ``<base_dir>/output/recommendations.csv``.

    Args:
        base_dir: project root holding the ``data`` and ``output`` folders.

    Returns:
        The DataFrame that was written (client_code, product,
        push_notification, credit_banner).
    """
    df_clients = pd.read_csv(f"{base_dir}/data/clients.csv")
    df_txn = pd.read_csv(f"{base_dir}/data/transactions.csv")
    df_trf = pd.read_csv(f"{base_dir}/data/transfers.csv")

    df_feat = build_features(df_clients, df_txn, df_trf)
    best, top4, df_all = rank_and_select(df_feat)

    # The amount quoted in the push must be the rule-based monetary estimate.
    # hybrid_score is a ranking value (a rules/ML blend, or a probability in the
    # tie-break branch), so showing it as money would print wrong amounts.
    benefit_by_pair = df_all.set_index(['client_code', 'product'])['rules_score'].to_dict()

    facts_all = df_feat.set_index('client_code').to_dict(orient='index')
    rows = []
    for _, r in best.iterrows():
        cid, product = r['client_code'], r['product']
        f = facts_all.get(cid,{})

        benefit = benefit_by_pair.get((cid, product), 0.0)
        if pd.isna(benefit):
            benefit = 0.0  # round(NaN) would raise

        # A client absent from clients.csv has NaN for name after the left merge.
        name = f.get("name")
        if not isinstance(name, str) or not name:
            name = "–ö–ª–∏–µ–Ω—Ç"

        f_enrich = {
            "name": name,
            "month_ru": datetime.now().month,
            "travel_spend_m": round(f.get("travel_spend_3m",0)/3.0),
            "benefit_kzt": round(float(benefit)),
            "top3": f.get("top3_cats", []),
            "has_restos": (f.get("premium_extra_spend_3m",0)>0),
            "fx_curr": "USD",
            "taxi_cnt": 5
        }
        text_main = generate_push_text(product, f_enrich)

        # bool(NaN) is True, so a flag missing after the left merge must not show the banner.
        need_credit = r.get("need_credit", False)
        credit_banner = ""
        if pd.notna(need_credit) and bool(need_credit):
            credit_banner = template_cash_loan(f_enrich["name"])

        rows.append({
            "client_code": cid,
            "product": product,
            "push_notification": text_main,
            "credit_banner": credit_banner
        })

    df_out = pd.DataFrame(rows)
    out_path = f"{base_dir}/output/recommendations.csv"
    df_out.to_csv(out_path,index=False)
    print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ:", out_path)
    return df_out

if __name__=="__main__":
    run_pipeline()

Overwriting /content/hackathon_case_hybrid/src/pipeline.py


In [None]:
# Run the pipeline as a module from the project root: the modules in src use
# relative imports, so they must be loaded as part of the src package.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
%%writefile /content/hackathon_case_hybrid/src/ml_classifier.py
# -*- coding: utf-8 -*-
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from dataclasses import dataclass
from typing import Dict, List

# Columns of the feature frame used as model inputs, both for training and for
# prediction (selected in this order).
FEATURE_COLS = [
    'spend_total_3m','travel_spend_3m','premium_extra_spend_3m','online_spend_3m',
    'fx_volume_3m','fx_freq_3m','atm_withdrawals_3m','loan_payments_3m',
    'cc_repayments_3m','installments_3m','inflow_3m','outflow_3m',
    'avg_monthly_balance_KZT','travel_share'
]

EXCLUDE_TRAIN_LABELS = {'–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏', '–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞'}  # not used as a training target

@dataclass
class MLModel:
    """Result of train_ml_lightgbm: the trained booster together with its label mapping."""
    label_encoder: Dict[str, int]  # class label -> integer id used as the training target
    booster: lgb.Booster           # booster returned by lgb.train
    classes_: List[str]            # class labels; list position equals the integer id

def synth_labels_from_rules(df_scores: pd.DataFrame) -> pd.DataFrame:
    """Derive one training label per client from the rule scores.

    For each client the three best products by rules_score are considered and
    the first one that is NOT in EXCLUDE_TRAIN_LABELS becomes the label. If all
    of them are excluded, the best one is used as is, so the client is not lost.

    Implemented with vectorised selections instead of ``groupby.apply`` over a
    function that reads the grouping column from the group: pandas has
    deprecated passing grouping columns to ``apply``, and the row-wise loop is
    slow.

    Args:
        df_scores: long-format frame with client_code, product and rules_score.

    Returns:
        DataFrame with columns client_code and label, one row per client,
        sorted by client_code with a fresh 0..n-1 index.
    """
    # Top-3 per client, best first.
    ranked = (
        df_scores.sort_values(['client_code','rules_score'], ascending=[True, False])
                 .groupby('client_code').head(3)
    )

    # First allowed product per client (rows are already in score order).
    allowed = ranked[~ranked['product'].isin(EXCLUDE_TRAIN_LABELS)].drop_duplicates('client_code')

    # Clients whose whole top-3 is excluded keep their overall best product.
    fallback = ranked.drop_duplicates('client_code')
    fallback = fallback[~fallback['client_code'].isin(allowed['client_code'])]

    labels = (
        pd.concat([allowed, fallback])
          .sort_values('client_code', kind='stable')
          .rename(columns={'product': 'label'})
          .reset_index(drop=True)
    )
    return labels[['client_code','label']]

def train_ml_lightgbm(df_feat: pd.DataFrame, df_scores: pd.DataFrame) -> MLModel:
    """Train a LightGBM multiclass model on labels synthesised from the rule scores.

    Args:
        df_feat: feature frame containing client_code and FEATURE_COLS.
        df_scores: long-format rule scores (client_code, product, rules_score).

    Returns:
        MLModel with the booster, the label -> id mapping and the class list.

    Raises:
        RuntimeError: fewer than two classes remain after dropping rare ones.
    """
    dataset = df_feat.merge(synth_labels_from_rules(df_scores), on='client_code', how='inner')

    # filter out rare classes (<2 rows), otherwise the stratified split fails
    counts = dataset['label'].value_counts()
    keep = counts[counts >= 2].index
    dropped = set(counts.index) - set(keep)
    if dropped:
        print("[INFO] dropped rare classes from ML training:", {k:int(counts[k]) for k in dropped})
    dataset = dataset[dataset['label'].isin(keep)].copy()

    classes = sorted(dataset['label'].unique())
    if len(classes) < 2:
        raise RuntimeError(f"Too few classes after filtering: {classes}")

    inputs = dataset[FEATURE_COLS].fillna(0.0).astype(float)
    enc = {label: position for position, label in enumerate(classes)}
    encoded = dataset['label'].astype(str).map(enc)

    Xtr, Xva, ytr, yva = train_test_split(
        inputs, encoded, test_size=0.2, random_state=42, stratify=encoded
    )

    train_set = lgb.Dataset(Xtr, label=ytr)
    valid_set = lgb.Dataset(Xva, label=yva)
    params = {
        'objective': 'multiclass',
        'num_class': len(classes),
        'metric': 'multi_logloss',
        'learning_rate': 0.05,
        'num_leaves': 31,
        'seed': 42,
    }
    # Early stopping watches the validation split; progress is logged every 50 rounds.
    booster = lgb.train(
        params, train_set,
        num_boost_round=300,
        valid_sets=[valid_set],
        callbacks=[lgb.early_stopping(30), lgb.log_evaluation(50)]
    )
    return MLModel(label_encoder=enc, booster=booster, classes_=classes)

def predict_ml_proba(model: MLModel, df_feat: pd.DataFrame, products: List[str]) -> pd.DataFrame:
    """Predict per-product probabilities for every row of ``df_feat``.

    Products the model was not trained on get a probability of 0.0.

    Returns:
        DataFrame with client_code followed by one column per entry of
        ``products`` (in that order), rows in ``df_feat`` order.
    """
    inputs = df_feat[FEATURE_COLS].fillna(0.0).astype(float)
    raw = model.booster.predict(inputs, num_iteration=model.booster.best_iteration)
    df_proba = pd.DataFrame(raw, columns=model.classes_)
    df_proba.insert(0, 'client_code', df_feat['client_code'].values)
    untrained = [p for p in products if p not in df_proba.columns]
    for p in untrained:
        df_proba[p] = 0.0
    return df_proba[['client_code'] + products]

Overwriting /content/hackathon_case_hybrid/src/ml_classifier.py


In [None]:
# Re-run the pipeline after ml_classifier.py was rewritten in the previous cell.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
# Switch to the project root and put it on PYTHONPATH for the commands that follow.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid


In [None]:
# Run the pipeline module; src must be importable from the current working directory / PYTHONPATH.
!python -m src.pipeline

Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
from google.colab import files
files.download("/content/hackathon_case_hybrid/output/recommendations.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd

path = "/content/hackathon_case_hybrid/output/recommendations.csv"
df = pd.read_csv(path)  # —á–∏—Ç–∞–µ–º —Ñ–∞–π–ª –∫–∞–∫ –µ—Å—Ç—å (–æ–Ω –≤ UTF-8)
df.to_csv(path, index=False, encoding="utf-8-sig")  # –ø–µ—Ä–µ—Å–æ—Ö—Ä–∞–Ω—è–µ–º –≤ –Ω—É–∂–Ω–æ–π –∫–æ–¥–∏—Ä–æ–≤–∫–µ

In [None]:
%%writefile /content/hackathon_case_hybrid/src/hybrid_selector.py
# -*- coding: utf-8 -*-
import pandas as pd, numpy as np
from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
from .ml_classifier import train_ml_lightgbm, predict_ml_proba
from .config import TRAIN_ML, USE_ML_IN_HYBRID, WEIGHT_RULES, WEIGHT_ML

TIE_DELTA = 0.05

def softmax_log1p(x: np.ndarray) -> np.ndarray:
    """Turn raw scores into weights that sum to 1.

    Negative scores are clipped to zero, compressed with ``log1p`` and then
    passed through a max-shifted softmax.
    """
    logits = np.log1p(np.maximum(x, 0.0))
    # Shifting by the maximum keeps exp() from overflowing; the ratio is unchanged.
    weights = np.exp(logits - logits.max())
    return weights / weights.sum()

def rank_and_select(df_feat: pd.DataFrame):
    """Score every product for every client, then pick a winner and a top-4 list.

    Rule scores come from ``score_all_products``. When ``TRAIN_ML`` is set and
    training succeeds, LightGBM class probabilities are merged in and used
    (a) in the weighted ``hybrid_score`` if ``USE_ML_IN_HYBRID`` is set and
    (b) as a tie-breaker between the two best rule scores.
    Clients flagged by ``need_credit_flags`` get the cash-loan product forced
    as the winner.

    Args:
        df_feat: per-client feature frame; must contain ``client_code``.

    Returns:
        ``(best, top4, df)`` where ``best`` has one row per client
        (``client_code``, ``product``, ``hybrid_score``), ``top4`` holds up to
        four highest-``hybrid_score`` rows per client, and ``df`` is the full
        client x product table with ``rules_score``, ``ml_proba`` and
        ``hybrid_score``.

    Note:
        ``hybrid_score`` in ``best``/``top4`` is not on a single scale: it is
        ``inf`` for a forced credit winner, a blended probability-like value
        for clients resolved by the tie-break branch, and the rules-scale
        score otherwise. ``df`` keeps the pre-tie-break values.
    """
    df_rules = score_all_products(df_feat)
    df_need = need_credit_flags(df_feat)  # per-client "needs credit" flag
    need_map = dict(zip(df_need['client_code'], df_need['need_credit']))

    # ML proba
    use_ml = False
    if TRAIN_ML:
        try:
            model = train_ml_lightgbm(df_feat, df_rules)
            df_proba = predict_ml_proba(model, df_feat, PRODUCTS)
            use_ml = True
        except Exception as e:
            # Any training/prediction failure falls back to rules-only scoring.
            print("[WARN] ML disabled:", e)
            df_proba = df_feat[['client_code']].copy()
            for p in PRODUCTS: df_proba[p] = 0.0
    else:
        df_proba = df_feat[['client_code']].copy()
        for p in PRODUCTS: df_proba[p] = 0.0

    # df_proba is wide (one column per product); pick the column matching each row's product.
    df = df_rules.merge(df_proba, on='client_code', how='left')
    df['ml_proba'] = df.apply(lambda r: r.get(r['product'], 0.0), axis=1)

    # Scale the probability by the client's best rule score so both terms share the rules scale.
    max_rules = df.groupby('client_code')['rules_score'].transform('max')
    if USE_ML_IN_HYBRID and use_ml:
        df['hybrid_score'] = WEIGHT_RULES * df['rules_score'] + WEIGHT_ML * (df['ml_proba'] * max_rules)
    else:
        df['hybrid_score'] = df['rules_score']

    winners, top4_rows = [], []
    for cid, grp in df.groupby('client_code', sort=False):
        # 1) strong credit signals: force the cash-loan product into first place
        if need_map.get(cid, False):
            winners.append({'client_code': cid, 'product': '–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏', 'hybrid_score': float('inf')})
            # top-4 of the remaining products by hybrid score (the loan is not in PRODUCTS and is not scored)
            top4_rows.extend(grp.sort_values('hybrid_score', ascending=False).head(4).to_dict('records'))
            continue

        # 2) normal mode: highest rules score wins; near-ties are broken with ML
        grp = grp.sort_values('rules_score', ascending=False)
        top1, top2 = grp.iloc[0], (grp.iloc[1] if len(grp) > 1 else None)
        max_r = top1['rules_score']
        # A tie is a gap below TIE_DELTA of the best score; the equality test covers max_r == 0.
        tie = top2 is not None and (abs(top1['rules_score'] - top2['rules_score']) < (TIE_DELTA*max_r) or top1['rules_score']==top2['rules_score'])

        if tie and use_ml:
            r_scores = grp['rules_score'].values
            p_rules = softmax_log1p(r_scores)
            ml_vec = grp['ml_proba'].values
            p_hybrid = WEIGHT_RULES*p_rules + WEIGHT_ML*ml_vec
            k = int(np.argmax(p_hybrid))
            winner_row = grp.iloc[k].copy()
            winner_row['hybrid_score'] = p_hybrid[k]
            # From here on this group's hybrid_score is the blended probability, not a rules-scale value.
            grp['hybrid_score'] = p_hybrid
        else:
            k = grp['hybrid_score'].values.argmax()
            winner_row = grp.iloc[k]

        winners.append({'client_code': cid, 'product': winner_row['product'], 'hybrid_score': winner_row['hybrid_score']})
        top4_rows.extend(grp.sort_values('hybrid_score', ascending=False).head(4).to_dict('records'))

    best = pd.DataFrame(winners)
    top4 = pd.DataFrame(top4_rows)
    return best, top4, df

Overwriting /content/hackathon_case_hybrid/src/hybrid_selector.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_nlp.py
# -*- coding: utf-8 -*-
from datetime import datetime
from .utils import month_name_ru, format_currency_kzt, only_one_exclamation
from .config import CTA_WORDS, PUSH_MIN_LEN, PUSH_MAX_LEN

def template_travel(name, month, taxi_cnt, travel_spend_m, benefit):
    """Travel-card push text built from the client name, month, taxi ride count, travel spend and estimated cashback.

    ``travel_spend_m`` and ``benefit`` are rendered with ``format_currency_kzt``.
    """
    return f"{name}, –≤ {month} —É –≤–∞—Å {taxi_cnt} –ø–æ–µ–∑–¥–æ–∫ –Ω–∞ —Ç–∞–∫—Å–∏ –Ω–∞ {format_currency_kzt(travel_spend_m)}. –° –∫–∞—Ä—Ç–æ–π –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π –≤–µ—Ä–Ω—É–ª–∏ –±—ã ‚âà{format_currency_kzt(benefit)} –∫–µ—à–±—ç–∫–æ–º. –û—Ç–∫—Ä–æ–π—Ç–µ –∫–∞—Ä—Ç—É."

def template_premium(name, has_restos, benefit):
    """Premium-card push text; an extra clause is inserted only when ``has_restos`` is truthy.

    ``benefit`` is rendered with ``format_currency_kzt``.
    """
    return f"{name}, —É –≤–∞—Å —Å—Ç–∞–±–∏–ª—å–Ω—ã–π –∫—Ä—É–ø–Ω—ã–π –æ—Å—Ç–∞—Ç–æ–∫{', —Ç—Ä–∞—Ç—ã –≤ —Ä–µ—Å—Ç–æ—Ä–∞–Ω–∞—Ö' if has_restos else ''}. –ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—Å—Ç –∫–µ—à–±—ç–∫ –∏ –±–µ—Å–ø–ª–∞—Ç–Ω—ã–µ —Å–Ω—è—Ç–∏—è. –í—ã–≥–æ–¥–∞ –¥–æ {format_currency_kzt(benefit)}. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."

def template_cc(name, cat1, cat2, cat3, benefit):
    """Credit-card push text listing three spend categories and the estimated cashback.

    The categories are interpolated as given (empty strings are not filtered);
    ``benefit`` is rendered with ``format_currency_kzt``.
    """
    return f"{name}, –≤–∞—à–∏ —Ç–æ–ø-–∫–∞—Ç–µ–≥–æ—Ä–∏–∏ ‚Äî {cat1}, {cat2}, {cat3}. –ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—ë—Ç –¥–æ 10% –∫–µ—à–±—ç–∫–∞ –∏ –æ–Ω–ª–∞–π–Ω-–±–æ–Ω—É—Å—ã. –í–µ—Ä–Ω—É–ª–∏ –±—ã {format_currency_kzt(benefit)}. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."

def template_fx(name, curr):
    """Currency-exchange push text mentioning the client name and the currency code ``curr``."""
    return f"{name}, –≤—ã —á–∞—Å—Ç–æ –ø–æ–ª—å–∑—É–µ—Ç–µ—Å—å –≤–∞–ª—é—Ç–æ–π. –û–±–º–µ–Ω {curr} –ø–æ –≤—ã–≥–æ–¥–Ω–æ–º—É –∫—É—Ä—Å—É –±–µ–∑ –∫–æ–º–∏—Å—Å–∏–∏, –º–æ–º–µ–Ω—Ç–∞–ª—å–Ω–æ. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –æ–±–º–µ–Ω –≤–∞–ª—é—Ç."

def template_deposit(name, benefit):
    """Deposit push text quoting the monthly income ``benefit`` (rendered with ``format_currency_kzt``)."""
    return f"{name}, –Ω–∞ –¥–µ–ø–æ–∑–∏—Ç–µ –¥–æ—Ö–æ–¥ {format_currency_kzt(benefit)} –≤ –º–µ—Å—è—Ü. –°–æ—Ö—Ä–∞–Ω–∏—Ç–µ –∏ –ø—Ä–∏—É–º–Ω–æ–∂—å—Ç–µ —Å—Ä–µ–¥—Å—Ç–≤–∞. –û—Ç–∫—Ä–æ–π—Ç–µ –¥–µ–ø–æ–∑–∏—Ç."

def template_invest(name):
    """Investments push text; uses only the client name and contains no figures."""
    return f"{name}, —É –≤–∞—Å –µ—Å—Ç—å –∑–∞–ø–∞—Å –Ω–∞ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏. –î–∏–≤–µ—Ä—Å–∏—Ñ–∏—Ü–∏—Ä—É–π—Ç–µ –ø–æ—Ä—Ç—Ñ–µ–ª—å —Å —Ñ–æ–Ω–¥–∞–º–∏ –∏ –∞–∫—Ü–∏—è–º–∏. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏."

def template_cash_loan(name):
    """Cash-loan push text; uses only the client name and contains no figures."""
    return f"{name}, —É –≤–∞—Å —Ä–∞—Å—Ö–æ–¥—ã –≤—ã—à–µ –¥–æ—Ö–æ–¥–æ–≤. –ù–∞–ª–∏—á–Ω—ã–π –∫—Ä–µ–¥–∏—Ç –ø–æ–º–æ–∂–µ—Ç –∑–∞–∫—Ä—ã—Ç—å —Ä–∞–∑—Ä—ã–≤ –∏ –ø–ª–∞–Ω–∏—Ä–æ–≤–∞—Ç—å –±—é–¥–∂–µ—Ç. –û—Ñ–æ—Ä–º–∏—Ç–µ –∫—Ä–µ–¥–∏—Ç."

def lint_push(text: str) -> dict:
    """Check a push text against the length, exclamation-mark and call-to-action rules.

    Returns ``{"ok": bool, "len": int}``; ``ok`` is True only when the length
    lies in ``[PUSH_MIN_LEN, PUSH_MAX_LEN]``, ``only_one_exclamation`` accepts
    the text, and the stripped text ends with one of ``CTA_WORDS``.
    """
    n_chars = len(text)
    length_ok = PUSH_MIN_LEN <= n_chars <= PUSH_MAX_LEN
    exclamation_ok = only_one_exclamation(text)
    stripped = text.strip()
    cta_ok = any(stripped.endswith(word) for word in CTA_WORDS)
    return {"ok": bool(length_ok and exclamation_ok and cta_ok), "len": n_chars}

def generate_push_text(product: str, facts: dict) -> str:
    """Build the push-notification text for ``product`` from client ``facts``.

    Args:
        product: product name; unknown names fall back to the investments template.
        facts: optional keys ``name``, ``taxi_cnt``, ``travel_spend_m``,
            ``benefit_kzt``, ``has_restos``, ``top3`` and ``fx_curr``; each has
            a per-template default.

    Returns:
        The rendered text. If ``lint_push`` rejects it, a too-short text gets a
        call-to-action appended and a too-long text is cut to ``PUSH_MAX_LEN``
        characters with a trailing ellipsis.
    """
    name = facts.get("name","–ö–ª–∏–µ–Ω—Ç")
    if product=="–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π":
        text = template_travel(name, month_name_ru(datetime.now()), facts.get("taxi_cnt",5), facts.get("travel_spend_m",30000), facts.get("benefit_kzt",1200))
    elif product=="–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞":
        text = template_premium(name, facts.get("has_restos",True), facts.get("benefit_kzt",2000))
    elif product=="–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞":
        # Callers may pass an empty list (or a tuple/array) for "top3". A plain
        # .get() default never fires for an empty list and would render
        # ", , " in the text, so fall back to the defaults whenever no category
        # is available, and normalise to a list before padding.
        top3 = facts.get("top3")
        top3 = list(top3) if top3 is not None else []
        if not top3:
            top3 = ["—Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã","—Ç–∞–∫—Å–∏","–ø—Ä–æ–¥—É–∫—Ç—ã"]
        cats = top3 + ["","",""]
        text = template_cc(name, cats[0], cats[1], cats[2], facts.get("benefit_kzt",2500))
    elif product=="–û–±–º–µ–Ω –≤–∞–ª—é—Ç":
        text = template_fx(name, facts.get("fx_curr","USD"))
    elif product.startswith("–î–µ–ø–æ–∑–∏—Ç"):
        text = template_deposit(name, facts.get("benefit_kzt",1800))
    elif product=="–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏":
        text = template_invest(name)  # no figures, per the spec
    elif product=="–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏":
        text = "–î–æ–ª–≥–æ—Å—Ä–æ—á–Ω–∞—è –∑–∞—â–∏—Ç–∞ –∏ –¥–∏–≤–µ—Ä—Å–∏—Ñ–∏–∫–∞—Ü–∏—è –∫–∞–ø–∏—Ç–∞–ª–∞ –∑–∞ —Å—á—ë—Ç –∑–æ–ª–æ—Ç—ã—Ö —Å–ª–∏—Ç–∫–æ–≤. –£–∑–Ω–∞–π—Ç–µ –¥–µ—Ç–∞–ª–∏ –∏ –ø–æ–¥–≤–µ—Ä—Å—Ç–∞–π—Ç–µ —Å—Ç—Ä–∞—Ç–µ–≥–∏—é. –ü–æ–¥–∫–ª—é—á–∏—Ç–µ –∑–æ–ª–æ—Ç–æ."
    elif product=="–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏":
        text = template_cash_loan(name)  # the loan is handled like any other first-place product
    else:
        text = template_invest(name)

    check = lint_push(text)
    if not check["ok"]:
        if len(text)<PUSH_MIN_LEN: text=text+" –û—Ñ–æ—Ä–º–∏—Ç–µ –∫–∞—Ä—Ç—É."
        if len(text)>PUSH_MAX_LEN: text=text[:PUSH_MAX_LEN-1]+"‚Ä¶"
    return text

Overwriting /content/hackathon_case_hybrid/src/push_nlp.py


In [None]:
%%writefile /content/hackathon_case_hybrid/src/pipeline.py
# -*- coding: utf-8 -*-
import pandas as pd
from datetime import datetime
from .features import build_features
from .hybrid_selector import rank_and_select
from .push_nlp import generate_push_text

def run_pipeline(data_dir="/content/hackathon_case_hybrid/data",
                 out_path="/content/hackathon_case_hybrid/output/recommendations.csv"):
    """Run the full recommendation pipeline and write the result CSV.

    Reads ``clients.csv``, ``transactions.csv`` and ``transfers.csv`` from
    ``data_dir``, builds features, selects one product per client, renders a
    push text for it and saves ``client_code, product, push_notification`` to
    ``out_path`` (UTF-8 with BOM).

    Args:
        data_dir: directory holding the three input CSV files.
        out_path: destination of the recommendations CSV.

    Note:
        ``taxi_cnt`` and ``fx_curr`` passed to the text generator are fixed
        placeholders (5 and "USD"), not values derived from the data.
    """
    import math  # function-scope import: only needed for the finiteness check below

    def _round_or_zero(value):
        """Round ``value`` to an int; non-numeric, NaN and infinite values become 0."""
        try:
            value = float(value)
        except (TypeError, ValueError):
            return 0
        return round(value) if math.isfinite(value) else 0

    df_clients = pd.read_csv(f"{data_dir}/clients.csv")
    df_txn = pd.read_csv(f"{data_dir}/transactions.csv")
    df_trf = pd.read_csv(f"{data_dir}/transfers.csv")

    df_feat = build_features(df_clients, df_txn, df_trf)
    best, top4, df_all = rank_and_select(df_feat)

    # The benefit quoted in a push must be the rule-based KZT estimate of the
    # chosen product. ``hybrid_score`` is only a ranking score: it is `inf` for
    # a forced loan and a probability-like value after an ML tie-break.
    rules_benefit = {}
    if {'client_code', 'product', 'rules_score'} <= set(df_all.columns):
        rules_benefit = df_all.set_index(['client_code', 'product'])['rules_score'].to_dict()

    facts_all = df_feat.set_index('client_code').to_dict(orient='index')
    rows = []
    for _, r in best.iterrows():
        cid, product = r['client_code'], r['product']
        f = facts_all.get(cid,{})
        f_enrich = {
            "name": f.get("name","–ö–ª–∏–µ–Ω—Ç"),
            "month_ru": datetime.now().month,
            "travel_spend_m": _round_or_zero(f.get("travel_spend_3m",0)/3.0),
            # Products without a rule score (e.g. the forced loan) get 0.
            "benefit_kzt": _round_or_zero(rules_benefit.get((cid, product), 0.0)),
            "top3": f.get("top3_cats", []),
            "has_restos": (f.get("premium_extra_spend_3m",0)>0),
            "fx_curr": "USD",
            "taxi_cnt": 5
        }
        text_main = generate_push_text(product, f_enrich)
        rows.append({"client_code": cid, "product": product, "push_notification": text_main})

    df_out = pd.DataFrame(rows)
    df_out.to_csv(out_path, index=False, encoding="utf-8-sig")  # Excel-friendly
    print("–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ:", out_path)

if __name__=="__main__":
    run_pipeline()

Overwriting /content/hackathon_case_hybrid/src/pipeline.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/content/hackathon_case_hybrid/src/pipeline.py", line 5, in <module>
    from .hybrid_selector import rank_and_select
  File "/content/hackathon_case_hybrid/src/hybrid_selector.py", line 3, in <module>
    from .rules_scoring import PRODUCTS, score_all_products, need_credit_flags
ImportError: cannot import name 'need_credit_flags' from 'src.rules_scoring' (/content/hackathon_case_hybrid/src/rules_scoring.py)


In [None]:
import pandas as pd
pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv").head()

Unnamed: 0,client_code,name,product,push_notification
0,1,–ê–π–≥–µ—Ä–∏–º,–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏,"–ê–π–≥–µ—Ä–∏–º, —É –≤–∞—Å –Ω–∞ —Å—á–µ—Ç—É 92 643 ‚Ç∏! ü§ë –û–±—Ä–∞—Ç–∏—Ç–µ –≤..."
1,2,–î–∞–Ω–∏—è—Ä,–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞,"–ü—Ä–∏–≤–µ—Ç, –î–∞–Ω–∏—è—Ä! üëã –í–∞—à –±–∞–ª–∞–Ω—Å –Ω–∞ –∫—Ä–µ–¥–∏—Ç–Ω–æ–π –∫–∞—Ä—Ç..."
2,3,–°–∞–±–∏–Ω–∞,–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏,"–°–∞–±–∏–Ω–∞, –ø—Ä–∏–≤–µ—Ç! üòä –£ –≤–∞—Å –Ω–∞ –±–∞–ª–∞–Ω—Å–µ 63 116 ‚Ç∏. –û..."
3,4,–¢–∏–º—É—Ä,–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏,"–ü—Ä–∏–≤–µ—Ç, –¢–∏–º—É—Ä! üëã –í–∞—à –±–∞–ª–∞–Ω—Å 83 351 ‚Ç∏. –ú—ã –∑–∞–º–µ—Ç..."
4,5,–ö–∞–º–∏–ª–ª–∞,–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞,"–ö–∞–º–∏–ª–ª–∞, –≤–∞—à –±–∞–ª–∞–Ω—Å –Ω–∞ –∫—Ä–µ–¥–∏—Ç–Ω–æ–π –∫–∞—Ä—Ç–µ —Å–æ—Å—Ç–∞–≤–ª..."


In [None]:
from pathlib import Path

code = r"""
import pandas as pd

def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    flags = []
    for _, r in df_feat.iterrows():
        flags.append({
            'client_code': r['client_code'],
            'need_credit': bool(utility_credit_cash(r))
        })
    return pd.DataFrame(flags)
"""
rules_path = Path("/content/hackathon_case_hybrid/src/rules_scoring.py")
existing = rules_path.read_text(encoding="utf-8") if rules_path.exists() else ""

# Append only once: re-running this cell must not stack duplicate definitions
# of need_credit_flags at the end of the module.
if "def need_credit_flags(" in existing:
    print("need_credit_flags already present in rules_scoring.py; nothing appended")
else:
    with open(rules_path, "a", encoding="utf-8") as f:
        f.write("\n" + code + "\n")
    print("need_credit_flags –¥–æ–±–∞–≤–ª–µ–Ω–∞ –≤ rules_scoring.py")

need_credit_flags –¥–æ–±–∞–≤–ª–µ–Ω–∞ –≤ rules_scoring.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.061176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
%%writefile /content/hackathon_case_hybrid/src/rules_scoring.py
# -*- coding: utf-8 -*-
import pandas as pd
from .config import (
    PREMIUM_CB_LIMIT, PREMIUM_TIER_THRESHOLDS, PREMIUM_TIER_CBs,
    TRAVEL_CB, PREMIUM_EXTRA_CB_CATS, PREMIUM_EXTRA_CB,
    CC_TOP3_CB, CC_ONLINE_CB_CATS, CC_ONLINE_CB,
    RATE_SAVINGS, RATE_GOAL, RATE_MULTI, FX_SPREAD_SAVING,
    TRAVEL_SHARE_MIN,
    GOLD_T1_BAL, GOLD_T2_BAL, GOLD_T1_UTILITY, GOLD_T2_UTILITY,
    CREDIT_OUTFLOW_INFLOW_RATIO, CREDIT_NEARZERO_BAL_MAX,
    CREDIT_VERYLOW_BAL_MAX, CREDIT_LOAN_FREQ_MIN
)

# –ü—Ä–æ–¥—É–∫—Ç—ã, —É—á–∞—Å—Ç–≤—É—é—â–∏–µ –≤ –¥–µ–Ω–µ–∂–Ω–æ–º —Ä–∞–Ω–∂–∏—Ä–æ–≤–∞–Ω–∏–∏ (–∫—Ä–µ–¥–∏—Ç –ù–ï –∑–¥–µ—Å—å)
PRODUCTS = [
    '–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π',
    '–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞',
    '–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞',
    '–û–±–º–µ–Ω –≤–∞–ª—é—Ç',
    '–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π',
    '–î–µ–ø–æ–∑–∏—Ç –ù–∞–∫–æ–ø–∏—Ç–µ–ª—å–Ω—ã–π',
    '–î–µ–ø–æ–∑–∏—Ç –ú—É–ª—å—Ç–∏–≤–∞–ª—é—Ç–Ω—ã–π',
    '–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏',
    '–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏'
]

def tier_cashback(avg_balance):
    """Pick the premium-card cashback rate for an average balance.

    A missing balance, or one below the first threshold, maps to tier 0; a
    balance below the second threshold maps to tier 1; anything else to tier 2.
    """
    if pd.isna(avg_balance) or avg_balance < PREMIUM_TIER_THRESHOLDS[0]:
        return PREMIUM_TIER_CBs[0]
    tier = 1 if avg_balance < PREMIUM_TIER_THRESHOLDS[1] else 2
    return PREMIUM_TIER_CBs[tier]

def benefit_travel(r):
    """Estimated monthly travel-card cashback: TRAVEL_CB times one third of travel_spend_3m."""
    # A cap on the cashback could be added here if needed.
    return TRAVEL_CB * (r.get('travel_spend_3m', 0.0) / 3.0)

def benefit_premium(r):
    """Estimated monthly benefit of the premium card.

    Sum of three parts: tiered cashback on average monthly spend (capped at
    ``PREMIUM_CB_LIMIT``), extra cashback on the premium categories, and a
    rough estimate of saved ATM fees.
    """
    monthly_spend = r.get('spend_total_3m', 0.0)/3.0
    tier_rate = tier_cashback(r.get('avg_monthly_balance_KZT', 0.0))
    capped_base = min(PREMIUM_CB_LIMIT, tier_rate * monthly_spend)

    extra = PREMIUM_EXTRA_CB * (r.get('premium_extra_spend_3m', 0.0)/3.0)

    # Rough fee-saving estimate: 1% of 30,000 per monthly withdrawal, with the base capped at 3,000,000.
    withdrawals_per_month = r.get('atm_withdrawals_3m', 0) / 3.0
    fee_savings = 0.01 * min(3_000_000.0, withdrawals_per_month * 30_000.0)

    return capped_base + extra + fee_savings

def benefit_cc(r):
    """Estimated monthly credit-card cashback.

    60% of the average monthly spend is treated as top-3 category spend and
    earns ``CC_TOP3_CB``; average monthly online spend earns ``CC_ONLINE_CB``.
    """
    monthly_total = r.get('spend_total_3m', 0.0)/3.0
    monthly_online = r.get('online_spend_3m', 0.0)/3.0
    top3_share = 0.6 * monthly_total
    return CC_TOP3_CB * top3_share + CC_ONLINE_CB * monthly_online

def benefit_fx(r):
    """Estimated monthly saving on currency exchange: ``FX_SPREAD_SAVING`` times the average monthly FX volume."""
    return FX_SPREAD_SAVING * (r.get('fx_volume_3m', 0.0) / 3.0)

def benefit_deposits(r):
    """Estimated monthly income of the three deposit products.

    Returns ``(savings, goal, multi)`` computed from the average monthly
    balance and the annual rates divided by 12; the multi-currency deposit is
    assumed to hold 40% of the balance.
    """
    balance = r.get('avg_monthly_balance_KZT', 0.0)
    return (
        (RATE_SAVINGS/12.0) * balance,
        (RATE_GOAL/12.0) * balance,
        (RATE_MULTI/12.0) * balance * 0.4,  # notionally only part of the balance is kept in multi-currency
    )

# –ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ ‚Äî 0 (–ø–æ –¢–ó –Ω–µ –æ–±–µ—â–∞–µ–º –¥–æ—Ö–æ–¥–Ω–æ—Å—Ç—å)
def benefit_investments(r):
    """Investments carry no monetary estimate: the score is always 0.0, whatever ``r`` holds."""
    return 0.0

# –ó–æ–ª–æ—Ç–æ ‚Äî utility –ø–æ –ø–æ—Ä–æ–≥–∞–º –±–∞–ª–∞–Ω—Å–∞
def benefit_gold(r):
    """Utility score for gold bars, stepped by average monthly balance.

    The higher threshold is checked first; balances below both thresholds score 0.0.
    """
    balance = r.get('avg_monthly_balance_KZT', 0.0)
    tiers = ((GOLD_T2_BAL, GOLD_T2_UTILITY), (GOLD_T1_BAL, GOLD_T1_UTILITY))
    for min_balance, utility in tiers:
        if balance >= min_balance:
            return float(utility)
    return 0.0

# --- –ñ—ë—Å—Ç–∫–∏–µ —Å–∏–≥–Ω–∞–ª—ã –Ω–∞ –∫—Ä–µ–¥–∏—Ç ‚Ññ1 ---
def utility_credit_cash(r):
    """Return 1.0 when the client shows hard signals for a cash loan, else 0.0.

    The loan is forced only in a severe case: outflow is at least
    ``CREDIT_OUTFLOW_INFLOW_RATIO`` times inflow AND either
    (a) the average balance is near zero, or
    (b) the average balance is very low and loan payments are frequent.
    """
    inflow = float(r.get('inflow_3m', 0.0))
    outflow = float(r.get('outflow_3m', 0.0))
    avg_balance = float(r.get('avg_monthly_balance_KZT', 0.0))
    loan_payments = int(r.get('loan_payments_3m', 0))

    # Inflow is floored at 1.0 so that a zero inflow does not make every outflow count as a gap.
    if not (outflow >= CREDIT_OUTFLOW_INFLOW_RATIO * max(inflow, 1.0)):
        return 0.0
    if avg_balance <= CREDIT_NEARZERO_BAL_MAX:
        return 1.0
    if avg_balance <= CREDIT_VERYLOW_BAL_MAX and loan_payments >= CREDIT_LOAN_FREQ_MIN:
        return 1.0
    return 0.0

def need_credit_flags(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Flag, per client, whether the cash loan must be forced to first place.

    Args:
        df_feat: per-client feature frame with a ``client_code`` column plus
            the fields read by ``utility_credit_cash``.

    Returns:
        Frame with columns ``client_code`` and ``need_credit`` (bool), one row
        per input row. The columns are present even for an empty input.
    """
    flags = [
        {'client_code': r['client_code'], 'need_credit': bool(utility_credit_cash(r))}
        for _, r in df_feat.iterrows()
    ]
    # Explicit columns: an empty list would otherwise give a column-less frame
    # and break callers that index the result by column name.
    return pd.DataFrame(flags, columns=['client_code', 'need_credit'])

def score_all_products(df_feat: pd.DataFrame) -> pd.DataFrame:
    """Compute the rule-based score of every ranked product for every client.

    Args:
        df_feat: per-client feature frame with ``client_code`` and the fields
            read by the ``benefit_*`` helpers.

    Returns:
        Long frame with columns ``client_code``, ``product``, ``rules_score``:
        one row per client and product, in the order the products are scored
        below. The columns are present even for an empty input.
    """
    rows = []
    for _, r in df_feat.iterrows():
        scores = {}
        scores['–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π'] = benefit_travel(r)
        scores['–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞'] = benefit_premium(r)
        scores['–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞'] = benefit_cc(r)
        scores['–û–±–º–µ–Ω –≤–∞–ª—é—Ç'] = benefit_fx(r)

        s,g,m = benefit_deposits(r)
        scores['–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π'] = s
        scores['–î–µ–ø–æ–∑–∏—Ç –ù–∞–∫–æ–ø–∏—Ç–µ–ª—å–Ω—ã–π'] = g
        scores['–î–µ–ø–æ–∑–∏—Ç –ú—É–ª—å—Ç–∏–≤–∞–ª—é—Ç–Ω—ã–π'] = m

        scores['–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏'] = benefit_investments(r)
        scores['–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏'] = benefit_gold(r)

        # Clients whose travel share reaches the threshold get the travel-card score boosted 2.5x.
        if r.get('travel_share', 0.0) >= TRAVEL_SHARE_MIN:
            scores['–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π'] *= 2.5

        for p,v in scores.items():
            rows.append({'client_code': r['client_code'], 'product': p, 'rules_score': float(v)})
    # Explicit columns keep the schema stable for an empty input, so downstream
    # merges and groupbys on these names do not fail with a KeyError.
    return pd.DataFrame(rows, columns=['client_code', 'product', 'rules_score'])

Overwriting /content/hackathon_case_hybrid/src/rules_scoring.py


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

import pandas as pd
df = pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv")
df['product'].value_counts()

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000034 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: /content/hackathon_case_hybrid/output/recommendations.csv


Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞,45
–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞,8
–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π,7


In [None]:
%%writefile /content/hackathon_case_hybrid/src/features.py
# -*- coding: utf-8 -*-
import pandas as pd
from .utils import safe_div

PREMIUM_EXTRA = ['–Æ–≤–µ–ª–∏—Ä–Ω—ã–µ —É–∫—Ä–∞—à–µ–Ω–∏—è','–ö–æ—Å–º–µ—Ç–∏–∫–∞ –∏ –ü–∞—Ä—Ñ—é–º–µ—Ä–∏—è','–ö–∞—Ñ–µ –∏ —Ä–µ—Å—Ç–æ—Ä–∞–Ω—ã']
ONLINE_CATS   = ['–ï–¥–∏–º –¥–æ–º–∞','–°–º–æ—Ç—Ä–∏–º –¥–æ–º–∞','–ò–≥—Ä–∞–µ–º –¥–æ–º–∞']

def series_get(df, col, default_val):
    """Return column ``col`` of ``df``, or a constant Series on ``df``'s index when the column is absent."""
    if col in df.columns:
        return df[col]
    return pd.Series(default_val, index=df.index)

def build_features(df_clients, df_txn, df_trf):
    """Build the per-client feature frame from transactions, transfers and client data.

    Args:
        df_clients: client table; the columns ``client_code``, ``name``,
            ``status``, ``age``, ``city`` and ``avg_monthly_balance_KZT`` are read.
        df_txn: transactions; ``client_code``, ``category`` and ``amount`` are read.
        df_trf: transfers; ``client_code``, ``type`` and ``amount`` are read.

    Returns:
        One row per client that appears in ``df_txn`` (clients without any
        transaction are not included), with ``*_3m`` totals, ``*_m`` values
        (the ``*_3m`` total divided by 3), transfer-type counts,
        ``top3_cats``, ``travel_share`` and the merged client attributes.

    NOTE(review): the ``_3m`` suffix presumes the inputs cover three months;
    nothing here filters by date -- confirm against the data source.
    """
    # Spend per category (rows: clients, columns: categories)
    spend_by_cat = df_txn.groupby(['client_code','category'])['amount'].sum().unstack(fill_value=0.0)
    spend_total  = spend_by_cat.sum(axis=1)

    # The three key categories for the travel card
    travel_core  = spend_by_cat.reindex(columns=['–ü—É—Ç–µ—à–µ—Å—Ç–≤–∏—è'], fill_value=0.0).sum(axis=1)
    taxi_spend   = spend_by_cat.reindex(columns=['–¢–∞–∫—Å–∏'],       fill_value=0.0).sum(axis=1)
    hotels_spend = spend_by_cat.reindex(columns=['–û—Ç–µ–ª–∏'],       fill_value=0.0).sum(axis=1)

    # For compatibility: the overall travel amount is the sum of the three categories above
    travel_sum = travel_core + taxi_spend + hotels_spend

    premium_extra_spend = spend_by_cat.reindex(columns=PREMIUM_EXTRA, fill_value=0.0).sum(axis=1)
    online_spend        = spend_by_cat.reindex(columns=ONLINE_CATS,   fill_value=0.0).sum(axis=1)

    # Transfers: counts and amounts per transfer type
    freq = df_trf.groupby(['client_code','type']).size().unstack(fill_value=0)
    amount_trf = df_trf.groupby(['client_code','type'])['amount'].sum().unstack(fill_value=0.0)

    # Clients with transactions define the output rows; transfer features are re-aligned to this index below.
    idx = spend_by_cat.index

    # Transfer types missing from the data contribute a constant 0 via series_get.
    inflow = sum(series_get(amount_trf, c, 0.0) for c in [
        'salary_in','stipend_in','family_in','refund_in','cashback_in','invest_in','deposit_fx_withdraw_in'
    ])
    outflow = sum(series_get(amount_trf, c, 0.0) for c in [
        'p2p_out','card_out','atm_withdrawal','utilities_out','loan_payment_out','cc_repayment_out',
        'installment_payment_out','invest_out','deposit_topup_out','gold_buy_out'
    ])

    fx_buy_amt  = series_get(amount_trf, 'fx_buy', 0.0).abs()
    fx_sell_amt = series_get(amount_trf, 'fx_sell', 0.0).abs()
    fx_volume   = fx_buy_amt + fx_sell_amt

    fx_buy_freq  = series_get(freq, 'fx_buy', 0).astype(int)
    fx_sell_freq = series_get(freq, 'fx_sell', 0).astype(int)
    fx_freq      = fx_buy_freq + fx_sell_freq

    atm_withdrawals = series_get(freq, 'atm_withdrawal',       0).astype(int)
    loan_payments   = series_get(freq, 'loan_payment_out',     0).astype(int)
    cc_repayments   = series_get(freq, 'cc_repayment_out',     0).astype(int)
    installments    = series_get(freq, 'installment_payment_out', 0).astype(int)

    # Three highest-spend categories per client; zero-spend categories are not filtered out.
    top3 = spend_by_cat.apply(lambda s: list(s.sort_values(ascending=False).head(3).index), axis=1)

    df_feat = pd.DataFrame({
        'client_code': idx,
        'spend_total_3m': spend_total.values,

        # travel breakdown
        'travel_core_spend_3m':  travel_core.reindex(idx, fill_value=0.0).values,   # first travel category
        'taxi_spend_3m':         taxi_spend.reindex(idx,  fill_value=0.0).values,   # taxi category
        'hotels_spend_3m':       hotels_spend.reindex(idx,fill_value=0.0).values,   # hotels category
        'travel_spend_3m':       travel_sum.reindex(idx,  fill_value=0.0).values,   # sum of the three

        'premium_extra_spend_3m': premium_extra_spend.values,
        'online_spend_3m':        online_spend.values,

        # Transfer-based features: clients without transfers get 0 through reindex.
        'fx_volume_3m':        fx_volume.reindex(idx,      fill_value=0.0).values,
        'fx_freq_3m':          fx_freq.reindex(idx,        fill_value=0).values,
        'atm_withdrawals_3m':  atm_withdrawals.reindex(idx,fill_value=0).values,
        'loan_payments_3m':    loan_payments.reindex(idx,  fill_value=0).values,
        'cc_repayments_3m':    cc_repayments.reindex(idx,  fill_value=0).values,
        'installments_3m':     installments.reindex(idx,   fill_value=0).values,

        'inflow_3m':  inflow.reindex(idx,  fill_value=0.0).values,
        'outflow_3m': outflow.reindex(idx, fill_value=0.0).values,

        'top3_cats': top3.values,
    }).merge(
        df_clients[['client_code','name','status','age','city','avg_monthly_balance_KZT']],
        on='client_code', how='left'
    )

    # per-month variants: every listed *_3m column divided by 3
    for col in [
        'spend_total_3m','travel_core_spend_3m','taxi_spend_3m','hotels_spend_3m','travel_spend_3m',
        'premium_extra_spend_3m','online_spend_3m','fx_volume_3m','inflow_3m','outflow_3m'
    ]:
        df_feat[col.replace('_3m','_m')] = df_feat[col] / 3.0

    # travel share (sum of the three categories over total spend); a zero total is replaced by 1 to avoid division by zero
    df_feat['travel_share'] = df_feat['travel_spend_3m'] / df_feat['spend_total_3m'].replace(0,1)

    return df_feat

Overwriting /content/hackathon_case_hybrid/src/features.py


In [None]:
# –ø–µ—Ä–µ–æ–ø—Ä–µ–¥–µ–ª–∏–º benefit_travel –≤ rules_scoring.py
import io, re, pathlib

path = pathlib.Path("/content/hackathon_case_hybrid/src/rules_scoring.py")
code = path.read_text(encoding="utf-8")

new_func = """
def benefit_travel(r):
    # –ß–∏—Å—Ç—ã–π –≤–∞—Ä–∏–∞–Ω—Ç A: –ü—É—Ç–µ—à–µ—Å—Ç–≤–∏—è + –¢–∞–∫—Å–∏ + –û—Ç–µ–ª–∏
    spend_m = (
        r.get('travel_core_spend_3m', 0.0)
        + r.get('taxi_spend_3m', 0.0)
        + r.get('hotels_spend_3m', 0.0)
    ) / 3.0
    return TRAVEL_CB * spend_m
"""

# A callable replacement inserts new_func verbatim (no backslash or group-reference
# processing), and subn reports how many definitions were actually replaced.
code, n_replaced = re.subn(
    r"def benefit_travel\(.*?\)\:\n(?:.|\n)*?return.*?\n",
    lambda _match: new_func,
    code,
    flags=re.S,
)
if n_replaced == 0:
    # Fail loudly instead of reporting success for a patch that changed nothing.
    raise RuntimeError("benefit_travel definition not found in rules_scoring.py; nothing was patched")
path.write_text(code, encoding="utf-8")
print("benefit_travel –æ–±–Ω–æ–≤–ª—ë–Ω –Ω–∞ ¬´—á–∏—Å—Ç—ã–π –≤–∞—Ä–∏–∞–Ω—Ç A¬ª.")

benefit_travel –æ–±–Ω–æ–≤–ª—ë–Ω –Ω–∞ ¬´—á–∏—Å—Ç—ã–π –≤–∞—Ä–∏–∞–Ω—Ç A¬ª.


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000035 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
import re, pathlib

cfg = pathlib.Path("/content/hackathon_case_hybrid/src/config.py")
txt = cfg.read_text(encoding="utf-8")

def set_const(text, name, value):
    """Set the module-level assignment ``name = value`` inside source ``text``.

    Every line that starts with ``name`` followed by ``=`` is rewritten to
    ``name = value``; if no such line exists, the assignment is appended.

    Args:
        text: source code to edit.
        name: constant name; matched literally at the start of a line.
        value: right-hand side, inserted verbatim.

    Returns:
        The updated source text.
    """
    # [ \t]* instead of \s*: whitespace around "=" must not run across a
    # newline, otherwise an empty right-hand side swallows the following line.
    pattern = rf"^{re.escape(name)}[ \t]*=[ \t]*.*$"
    repl = f"{name} = {value}"
    # A callable replacement keeps backslashes in `value` literal instead of
    # treating them as regex escape sequences or group references.
    updated, n_hits = re.subn(pattern, lambda _match: repl, text, flags=re.M)
    if n_hits:
        return updated
    return text + f"\n{repl}\n"

# ‚Üê –ü–û–î–ë–ï–†–ò –∑–Ω–∞—á–µ–Ω–∏—è –ø–æ–¥ —Å–µ–±—è (–ø—Ä–∏–º–µ—Ä –¥–µ–ª–∞–µ—Ç –∫—Ä–µ–¥–∏—Ç ¬´–º—è–≥—á–µ¬ª)
txt = set_const(txt, "CREDIT_OUTFLOW_INFLOW_RATIO", "1.6")   # –±—ã–ª–æ 2.5
txt = set_const(txt, "CREDIT_NEARZERO_BAL_MAX",     "50_000") # –±—ã–ª–æ 15_000
txt = set_const(txt, "CREDIT_VERYLOW_BAL_MAX",      "100_000")# –±—ã–ª–æ 30_000
txt = set_const(txt, "CREDIT_LOAN_FREQ_MIN",        "3")      # –±—ã–ª–æ 6

cfg.write_text(txt, encoding="utf-8")
print("–ü–æ—Ä–æ–≥–æ–≤—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∫—Ä–µ–¥–∏—Ç–∞ –æ–±–Ω–æ–≤–ª–µ–Ω—ã.")

–ü–æ—Ä–æ–≥–æ–≤—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã –∫—Ä–µ–¥–∏—Ç–∞ –æ–±–Ω–æ–≤–ª–µ–Ω—ã.


In [None]:
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: /content/hackathon_case_hybrid/output/recommendations.csv


In [None]:
import pandas as pd
df = pd.read_csv("/content/hackathon_case_hybrid/output/recommendations.csv")
df['product'].value_counts()

Unnamed: 0_level_0,count
product,Unnamed: 1_level_1
–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏,24
–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞,21
–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞,8
–î–µ–ø–æ–∑–∏—Ç –°–±–µ—Ä–µ–≥–∞—Ç–µ–ª—å–Ω—ã–π,7


In [None]:
import os
from getpass import getpass

# The API key must never be stored in the notebook: anyone with access to the
# file or its history can read it. A key that was committed here earlier
# should be treated as leaked and revoked.
# Use a key already present in the environment, otherwise prompt for it without echoing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OPENAI_API_KEY: ")


In [None]:
%%writefile /content/hackathon_case_hybrid/src/push_generator.py
# -*- coding: utf-8 -*-
"""
LLM —Ç–æ–ª—å–∫–æ –ø–µ—Ä–µ—Ñ—Ä–∞–∑–∏—Ä—É–µ—Ç –∑–∞–¥–∞–Ω–Ω—ã–π –Ω–∞–º–∏ —à–∞–±–ª–æ–Ω,
–ù–ò–ß–ï–ì–û –Ω–µ –≤—ã–¥—É–º—ã–≤–∞–µ—Ç –∏ –Ω–µ –¥–æ–±–∞–≤–ª—è–µ—Ç –Ω–æ–≤—ã—Ö —á–∏—Å–µ–ª/—Ñ–∞–∫—Ç–æ–≤.
"""

import os
from datetime import datetime
from typing import Dict, Any

# --- —Ñ–æ—Ä–º–∞—Ç–∏—Ä–æ–≤–∞–Ω–∏–µ —á–∏—Å–µ–ª –∏ –≤–∞–ª—é—Ç—ã ---
def fmt_int(x):
    """Format a number as a rounded integer with spaces as thousands separators.

    Anything that cannot be converted to a finite number yields ``"0"``.
    """
    try:
        whole = int(round(float(x)))
    except Exception:
        return "0"
    return format(whole, ",").replace(",", " ")

def fmt_kzt(x):
    """Format an amount with ``fmt_int`` and append the currency sign after a space."""
    return fmt_int(x) + " ‚Ç∏"

RU_MONTHS = ["—è–Ω–≤–∞—Ä–µ","—Ñ–µ–≤—Ä–∞–ª–µ","–º–∞—Ä—Ç–µ","–∞–ø—Ä–µ–ª–µ","–º–∞–µ","–∏—é–Ω–µ","–∏—é–ª–µ","–∞–≤–≥—É—Å—Ç–µ","—Å–µ–Ω—Ç—è–±—Ä–µ","–æ–∫—Ç—è–±—Ä–µ","–Ω–æ—è–±—Ä–µ","–¥–µ–∫–∞–±—Ä–µ"]
def month_ru_loc(dt=None):
    """Look up the ``RU_MONTHS`` entry for the month of ``dt``; the current time is used when ``dt`` is falsy."""
    moment = dt if dt else datetime.now()
    return RU_MONTHS[moment.month - 1]

# --- —Å—Ç—Ä–æ–≥–∏–µ —à–∞–±–ª–æ–Ω—ã –ø–æ –ø—Ä–æ–¥—É–∫—Ç–∞–º (—Ç–æ–ª—å–∫–æ –∏—Ö –∏ –æ—Ç–¥–∞—ë–º –≤ LLM) ---
def make_base_text(product: str, name: str, f: Dict[str, Any]) -> str:
    """Build the deterministic template text of a push notification.

    Args:
        product: Product name. It is compared against the exact product
            titles below; every title starting with the deposit prefix shares
            one template, and any unknown title gets a generic fallback text.
        name: Client name placed at the start of the message.
        f: Facts about the client. Keys read, depending on the product:
            ``taxi_cnt``, ``taxi_spend_m`` (falling back to
            ``travel_spend_m``), ``benefit_kzt``, ``has_restos``, ``top3``
            and ``fx_curr`` (defaults to ``"USD"``). Missing keys fall back
            to zero / empty values.

    Returns:
        The message text for the product, with amounts formatted by
        ``fmt_kzt``.
    """
    # The month is taken from the current date (month_ru_loc() without an
    # argument), not from the facts dict.
    m = month_ru_loc()
    if product == "–ö–∞—Ä—Ç–∞ –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π":
        taxi_cnt  = int(f.get("taxi_cnt", 0))
        taxi_sum  = fmt_kzt(f.get("taxi_spend_m", f.get("travel_spend_m", 0)))
        benefit   = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, –≤ {m} –≤—ã —Å–¥–µ–ª–∞–ª–∏ {taxi_cnt} –ø–æ–µ–∑–¥–æ–∫ –Ω–∞ —Ç–∞–∫—Å–∏ –Ω–∞ {taxi_sum}. "
                f"–° –∫–∞—Ä—Ç–æ–π –¥–ª—è –ø—É—Ç–µ—à–µ—Å—Ç–≤–∏–π –≤–µ—Ä–Ω—É–ª–∞—Å—å –±—ã —á–∞—Å—Ç—å —Ä–∞—Å—Ö–æ–¥–æ–≤ ‚Äî‚âà{benefit} –∫–µ—à–±—ç–∫–æ–º. –û—Ç–∫—Ä–æ–π—Ç–µ –∫–∞—Ä—Ç—É –≤ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–∏.")
    elif product == "–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞":
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        has_restos = bool(f.get("has_restos", False))
        restos = " –∏ —Ç—Ä–∞—Ç—ã –≤ —Ä–µ—Å—Ç–æ—Ä–∞–Ω–∞—Ö" if has_restos else ""
        return (f"{name}, —É –≤–∞—Å —Å—Ç–∞–±–∏–ª—å–Ω–æ –≤—ã—Å–æ–∫–∏–π –æ—Å—Ç–∞—Ç–æ–∫{restos}. "
                f"–ü—Ä–µ–º–∏–∞–ª—å–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—ë—Ç –ø–æ–≤—ã—à–µ–Ω–Ω—ã–π –∫–µ—à–±—ç–∫ –∏ –±–µ—Å–ø–ª–∞—Ç–Ω—ã–µ —Å–Ω—è—Ç–∏—è. –í—ã–≥–æ–¥–∞ –¥–æ {benefit}. –û—Ñ–æ—Ä–º–∏—Ç—å —Å–µ–π—á–∞—Å.")
    elif product == "–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞":
        # Accept None or any iterable for "top3" and pad with empty strings so
        # that the three positional lookups below can never raise.
        raw_top3 = f.get("top3")
        top3 = (list(raw_top3) if raw_top3 is not None else []) + ["","",""]
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, –≤–∞—à–∏ —Ç–æ–ø-–∫–∞—Ç–µ–≥–æ—Ä–∏–∏ ‚Äî {top3[0]}, {top3[1]}, {top3[2]}. "
                f"–ö—Ä–µ–¥–∏—Ç–Ω–∞—è –∫–∞—Ä—Ç–∞ –¥–∞—ë—Ç –¥–æ 10% –≤ –ª—é–±–∏–º—ã—Ö –∫–∞—Ç–µ–≥–æ—Ä–∏—è—Ö –∏ –Ω–∞ –æ–Ω–ª–∞–π–Ω-—Å–µ—Ä–≤–∏—Å—ã. –í–µ—Ä–Ω—É–ª–∏ –±—ã ‚âà{benefit}. –û—Ñ–æ—Ä–º–∏—Ç—å –∫–∞—Ä—Ç—É.")
    elif product == "–û–±–º–µ–Ω –≤–∞–ª—é—Ç":
        curr = f.get("fx_curr","USD")
        return (f"{name}, –≤—ã —á–∞—Å—Ç–æ –ø–ª–∞—Ç–∏—Ç–µ –≤ {curr}. "
                f"–í –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–∏ –≤—ã–≥–æ–¥–Ω—ã–π –æ–±–º–µ–Ω –∏ –∞–≤—Ç–æ-–ø–æ–∫—É–ø–∫–∞ –ø–æ —Ü–µ–ª–µ–≤–æ–º—É –∫—É—Ä—Å—É. –ù–∞—Å—Ç—Ä–æ–∏—Ç—å –æ–±–º–µ–Ω.")
    elif product.startswith("–î–µ–ø–æ–∑–∏—Ç"):
        benefit = fmt_kzt(f.get("benefit_kzt", 0))
        return (f"{name}, —É –≤–∞—Å –æ—Å—Ç–∞—é—Ç—Å—è —Å–≤–æ–±–æ–¥–Ω—ã–µ —Å—Ä–µ–¥—Å—Ç–≤–∞. "
                f"–†–∞–∑–º–µ—Å—Ç–∏—Ç–µ –∏—Ö –Ω–∞ –≤–∫–ª–∞–¥–µ ‚Äî —É–¥–æ–±–Ω–æ –∫–æ–ø–∏—Ç—å –∏ –ø–æ–ª—É—á–∞—Ç—å –≤–æ–∑–Ω–∞–≥—Ä–∞–∂–¥–µ–Ω–∏–µ. –î–æ—Ö–æ–¥ ~{benefit} –≤ –º–µ—Å—è—Ü. –û—Ç–∫—Ä—ã—Ç—å –≤–∫–ª–∞–¥.")
    elif product == "–ò–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏":
        return (f"{name}, –ø–æ–ø—Ä–æ–±—É–π—Ç–µ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ —Å –Ω–∏–∑–∫–∏–º –ø–æ—Ä–æ–≥–æ–º –≤—Ö–æ–¥–∞ –∏ –±–µ–∑ –∫–æ–º–∏—Å—Å–∏–π –Ω–∞ —Å—Ç–∞—Ä—Ç. "
                f"–û—Ç–∫—Ä–æ–π—Ç–µ —Å—á—ë—Ç –∏ –Ω–∞—á–Ω–∏—Ç–µ —Å –Ω–µ–±–æ–ª—å—à–∏—Ö —Å—É–º–º. –û—Ç–∫—Ä—ã—Ç—å —Å—á—ë—Ç.")
    elif product == "–ó–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏":
        return (f"{name}, –¥–ª—è –¥–æ–ª–≥–æ—Å—Ä–æ—á–Ω–æ–π –∑–∞—â–∏—Ç—ã –∫–∞–ø–∏—Ç–∞–ª–∞ –ø–æ–¥–æ–π–¥—É—Ç –∑–æ–ª–æ—Ç—ã–µ —Å–ª–∏—Ç–∫–∏. "
                f"–£–∑–Ω–∞–π—Ç–µ —É—Å–ª–æ–≤–∏—è –∏ –ø–æ–¥–±–µ—Ä–∏—Ç–µ –≤–µ—Å —Å–ª–∏—Ç–∫–∞. –ü–æ–¥–∫–ª—é—á–∏—Ç—å –∑–æ–ª–æ—Ç–æ.")
    elif product == "–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏":
        return (f"{name}, –µ—Å–ª–∏ –ø–ª–∞–Ω–∏—Ä—É—é—Ç—Å—è –∫—Ä—É–ø–Ω—ã–µ —Ç—Ä–∞—Ç—ã ‚Äî "
                f"–∫—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏ –ø–æ–º–æ–∂–µ—Ç –∏ –≤–µ—Ä–Ω—ë—Ç –≥–∏–±–∫–æ—Å—Ç—å –±—é–¥–∂–µ—Ç—É. "
                f"–£–∑–Ω–∞–π—Ç–µ –¥–æ—Å—Ç—É–ø–Ω—ã–π –ª–∏–º–∏—Ç.")
    else:
        # Fallback for any product without a dedicated template.
        return (f"{name}, –ø–æ—Å–º–æ—Ç—Ä–∏—Ç–µ –Ω–æ–≤—ã–π –ø—Ä–æ–¥—É–∫—Ç ‚Äî –æ–Ω –º–æ–∂–µ—Ç –ø–æ–¥–æ–π—Ç–∏ –ø–æ–¥ –≤–∞—à–∏ —Ä–∞—Å—Ö–æ–¥—ã. "
                f"–ü–æ–¥—Ä–æ–±–Ω–µ–µ –≤ –ø—Ä–∏–ª–æ–∂–µ–Ω–∏–∏. –ü–æ—Å–º–æ—Ç—Ä–µ—Ç—å.")

# --- LLM tone polishing (without adding facts) ---
# System message sent with every chat-completion request in _llm_polish.
# NOTE(review): the prompt contradicts itself and should be cleaned up by
# someone who can edit the Russian text:
#   * the first line asks the model to rephrase the text, while a later bullet
#     tells it to change nothing and return the text as is;
#   * one bullet forbids a call to action, yet the templates produced by
#     make_base_text end with one;
#   * the cash-loan bullet contains what looks like a pasted code fragment
#     (stray double quotes and a dangling `.")`).
SYSTEM_PROMPT = """
–í—ã ‚Äî —Ä–µ–¥–∞–∫—Ç–æ—Ä –ø—É—à-—É–≤–µ–¥–æ–º–ª–µ–Ω–∏–π –±–∞–Ω–∫–∞. –ü–µ—Ä–µ—Ñ—Ä–∞–∑–∏—Ä—É–π—Ç–µ –ø–µ—Ä–µ–¥–∞–Ω–Ω—ã–π —Ç–µ–∫—Å—Ç –±–µ–∑ –∏—Å–∫–∞–∂–µ–Ω–∏–π —Ñ–∞–∫—Ç–æ–≤.
–¢—Ä–µ–±–æ–≤–∞–Ω–∏—è:
- —Ç–æ–Ω: –Ω–∞ —Ä–∞–≤–Ω—ã—Ö, –ø—Ä–æ—Å—Ç–æ, –ø–æ-—á–µ–ª–æ–≤–µ—á–µ—Å–∫–∏; –æ–±—Ä–∞—â–µ–Ω–∏–µ –Ω–∞ ¬´–≤—ã¬ª
- –≤–∞–∂–Ω–æ–µ –≤ –Ω–∞—á–∞–ª–æ, –±–µ–∑ –∫–∞–Ω—Ü–µ–ª—è—Ä–∏–∑–º–æ–≤; 180‚Äì220 —Å–∏–º–≤–æ–ª–æ–≤
- —ç–º–æ–¥–∑–∏ 0‚Äì1 –ø–æ –¥–µ–ª—É; –º–∞–∫—Å–∏–º—É–º –æ–¥–∏–Ω ¬´!¬ª
- –Ω–µ—Ç call to action
- –ù–∏—á–µ–≥–æ –Ω–µ –º–µ–Ω—è—Ç—å –≤ —Ç–µ–∫—Å—Ç–µ. –í–µ—Ä–Ω—É—Ç—å –µ–≥–æ –∫–∞–∫ –µ—Å—Ç—å.
- –µ—Å–ª–∏ product "–ö—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏":
—Ç–æ –ø–∏—Å–∞—Ç—å , –µ—Å–ª–∏ –ø–ª–∞–Ω–∏—Ä—É—é—Ç—Å—è –∫—Ä—É–ø–Ω—ã–µ —Ç—Ä–∞—Ç—ã ‚Äî "
            "–∫—Ä–µ–¥–∏—Ç –Ω–∞–ª–∏—á–Ω—ã–º–∏ –ø–æ–º–æ–∂–µ—Ç –∏ –≤–µ—Ä–Ω—ë—Ç –≥–∏–±–∫–æ—Å—Ç—å –±—é–¥–∂–µ—Ç—É. "
            .")
- —á–∏—Å–ª–∞: –ø—Ä–æ–±–µ–ª—ã –∫–∞–∫ —Ä–∞–∑–¥–µ–ª–∏—Ç–µ–ª–∏ —Ä–∞–∑—Ä—è–¥–æ–≤; –¥—Ä–æ–±–Ω–∞—è —á–∞—Å—Ç—å ‚Äî –∑–∞–ø—è—Ç–∞—è; –≤–∞–ª—é—Ç–∞ ‚Äî ¬´‚Ç∏¬ª —Å –ø—Ä–æ–±–µ–ª–æ–º: 2 490 ‚Ç∏
- –¥–∞—Ç—ã: –¥–¥.–º–º.–≥–≥–≥–≥ –∏–ª–∏ ¬´30 –∞–≤–≥—É—Å—Ç–∞ 2025¬ª (–µ—Å–ª–∏ –≤—Å—Ç—Ä–µ—á–∞—é—Ç—Å—è)
- –Ω–∏–∫–∞–∫–∏—Ö –Ω–æ–≤—ã—Ö —á–∏—Å–µ–ª/—Å—É–º–º/—É—Å–ª–æ–≤–∏–π; –ù–ï –¥–æ–±–∞–≤–ª—è—Ç—å ¬´–±–∞–ª–∞–Ω—Å¬ª, ¬´–ª–∏–º–∏—Ç¬ª –∏ —Ç. –ø., –µ—Å–ª–∏ —ç—Ç–æ–≥–æ –Ω–µ—Ç –≤ —Ç–µ–∫—Å—Ç–µ
–í–µ—Ä–Ω–∏—Ç–µ –æ–¥–∏–Ω –∞–±–∑–∞—Ü –±–µ–∑ –∫–∞–≤—ã—á–µ–∫.
"""

def _llm_polish(text: str) -> str:
    """Ask the LLM to polish ``text``; fall back to ``text`` on any problem.

    The request is best-effort: a missing API key, a missing ``openai``
    package, a network/API error or an empty completion all result in the
    unmodified template being returned, so the caller always gets a usable
    message. Failures are logged instead of being swallowed silently.

    Args:
        text: Template text produced for one client.

    Returns:
        The stripped completion text, or ``text`` itself when no non-empty
        completion could be obtained.
    """
    import logging  # local import: the module header is outside this block

    log = logging.getLogger(__name__)

    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        # Without credentials the request can only fail, so skip it.
        log.warning("OPENAI_API_KEY is not set; using the template text as is")
        return text

    # Any compatible LLM client can be swapped in here.
    try:
        from openai import OpenAI
        client = OpenAI(api_key=api_key)
        resp = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": text}
            ],
            temperature=0.4,
            max_tokens=140
        )
        # `content` may be None (e.g. refusals), so normalise before stripping.
        polished = (resp.choices[0].message.content or "").strip()
    except Exception as exc:
        # Deliberate best-effort: on any failure send the unpolished template.
        log.warning("LLM polish failed; using the template text as is: %s", exc)
        return text

    # Never hand an empty push back to the caller.
    return polished or text

def generate_push(name: str, product: str, facts: Dict[str, Any]) -> str:
    """Produce the final push text for one client.

    The template from ``make_base_text`` is passed through ``_llm_polish``.
    A result longer than 230 characters is cut to its first 229 characters,
    stripped of trailing whitespace and finished with an ellipsis marker.
    """
    message = _llm_polish(make_base_text(product, name, facts))
    # Hard safety net in case the LLM returned an overly long text.
    if len(message) <= 230:
        return message
    shortened = message[:229].rstrip()
    return shortened + "‚Ä¶"

Overwriting /content/hackathon_case_hybrid/src/push_generator.py


In [None]:
# Switch to the project directory, point PYTHONPATH at it, and run the
# pipeline as the module `src.pipeline` in a separate Python process.
%cd /content/hackathon_case_hybrid
%env PYTHONPATH=/content/hackathon_case_hybrid
!python -m src.pipeline

/content/hackathon_case_hybrid
env: PYTHONPATH=/content/hackathon_case_hybrid
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000038 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 48, number of used features: 8
[LightGBM] [Info] Start training from score -2.079442
[LightGBM] [Info] Start training from score -0.133531
Training until validation scores don't improve for 30 rounds
[50]	valid_0's multi_logloss: 0.0526071
[100]	valid_0's multi_logloss: 0.00396756
[150]	valid_0's multi_logloss: 0.000323661
[200]	valid_0's multi_logloss: 2.65543e-05
Early stopping, best iteration is:
[206]	valid_0's multi_logloss: 1.96717e-05
–°–æ—Ö—Ä–∞–Ω–µ–Ω–æ: /content/hackathon_case_hybrid/output/recommendations.csv
