# Кластеризация пользователей по фичам (UMAP + KMeans)


## Описание фич

**Сколько тратят (`daily_amount` / `high_spend_share`)**
Помогает понять, кто много тратит и может быть заинтересован в кредитах или особых условиях.


**Кто копит (`saver_share` / `saver_share_30d`)**
Выделяет людей, которые склонны откладывать деньги. Им могут быть интересны депозиты или сберегательные счета.


**Интересы (`auto_day_share` / `home_day_share`)**
Показывает, кто часто тратит на машину (автостраховки, заправки) или на дом/ремонт (страховки, кредиты на ремонт).


**Сезонные траты (`pre_new_year`, `gifts_q1`, `back_to_school`, `summer`, `salary_window`, `social_benefits_window`)**
Отмечают периоды, когда люди чаще всего покупают подарки, готовятся к школе, едут в отпуск, или когда у них зарплата/пособия. В эти моменты актуальны кредиты, страховки или кешбэк.


**Разнообразие покупок (`unique_categories_mean` / `top_category`)**
Говорит о том, насколько разнообразны покупки человека и что он покупает чаще всего. Это сигнал для кешбэка или специальных предложений по его любимым категориям.


**Покупки по выходным (`weekend_share`)**
Показывает, кто активнее тратит деньги в выходные. Для таких клиентов актуальны предложения по путешествиям или развлечениям.


**Предпочтения продуктов (`target_*`)**
Это подсказки, какие продукты (кредиты, вклады, страховки) могут быть интересны клиенту. Используются для более точных рекомендаций.


In [2]:
from pathlib import Path
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.neighbors import LocalOutlierFactor
import umap
import seaborn as sns
import matplotlib.pyplot as plt

# Colab-only step: mount Google Drive at /content/drive, the prefix that the
# project paths configured below are built on.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [42]:
# Root of the project data on the mounted Google Drive.
PROJECT_ROOT = Path('/content/drive/MyDrive/aviahack/')

# Input locations, all relative to PROJECT_ROOT.
EVENTS_DIR = PROJECT_ROOT / 'marketplace/'  # directory scanned for *.pq event files
USERS_PATH = PROJECT_ROOT / 'users.pq'  # per-user table merged in on user_id
PRODUCTS_PATH = PROJECT_ROOT / 'psb_products_updated.json'  # product catalogue (JSON)

# Knobs for the daily / calendar feature engineering.
FEATURE_PARAMS = {
    'anchor_date': pd.Timestamp('2023-01-01'),  # added to event times that are stored as timedeltas
    'n_files': 100,  # how many event files to load (None = all)
    'high_spend_quantile': 0.75,  # per-user quantile that marks a high-spend day
    'saver_window': 30,  # rolling window length, counted in daily rows of a user
    'saver_min_periods': 10,  # minimum rows in the window before a share is produced
    'saver_threshold': 0.8,  # share of low-spend rows from which a day counts as "saver"
    'salary_window_start_day': 25,  # salary window: day >= start OR day <= end (wraps month end)
    'salary_window_end_day': 5,
    'social_benefits_start_day': 10,  # benefits window: start <= day <= end
    'social_benefits_end_day': 20,
    'pre_ny_start_day': 15,  # pre-New-Year flag: December days from this day on
    'gifts_q1_end_day': 8,  # gifts flag: all of February plus March up to this day
    'back_to_school_start': (8, 15),  # (month, day) where the back-to-school window starts
    'back_to_school_end': (9, 15),  # (month, day) where the back-to-school window ends
    'summer_months': [6, 7, 8],  # month numbers flagged as summer
}

# NOTE(review): CLUSTER_PARAMS is not referenced in the cells visible here;
# presumably it is consumed by the UMAP / KMeans / LOF steps — confirm.
CLUSTER_PARAMS = {
    'n_clusters': 5,
    'umap_n_neighbors': 25,
    'umap_min_dist': 0.1,
    'umap_random_state': 42,
    'lof_neighbors': 25,
    'lof_contamination': 0.02,
}

print('Используем каталог событий:', EVENTS_DIR)

Используем каталог событий: /content/drive/MyDrive/aviahack/marketplace


In [43]:
def choose_first_available(df: pd.DataFrame, candidates):
    """Return the first name from ``candidates`` that is a column of ``df``.

    Candidates are checked in the order given; ``None`` is returned when none
    of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

In [44]:
def detect_keyword_flag(df: pd.DataFrame, columns, keywords) -> pd.Series:
    """Flag rows whose text columns contain any of the given keywords.

    The selected columns are cast to strings, joined with single spaces and
    lower-cased; a row is flagged when at least one keyword occurs in that
    text as a literal, case-insensitive substring.

    Args:
        df: Source frame.
        columns: Candidate column names; falsy entries and names missing from
            ``df`` are skipped.
        keywords: Substrings to look for. They are matched literally, so
            characters such as ``.`` or ``+`` carry no regex meaning.

    Returns:
        Boolean Series aligned with ``df.index``; all ``False`` when no usable
        column or no keyword is supplied.
    """
    import re

    cols = [c for c in columns if c and c in df.columns]
    # The text is lower-cased below, so keywords must be lower-cased as well.
    terms = [str(kw).lower() for kw in keywords]
    if not cols or not terms:
        return pd.Series(False, index=df.index)

    text = df[cols[0]].fillna('').astype(str)
    for col in cols[1:]:
        text = text.str.cat(' ' + df[col].fillna('').astype(str))
    text = text.str.lower()

    # One escaped alternation: every keyword is treated literally and the
    # text is scanned once instead of once per keyword.
    pattern = '|'.join(re.escape(term) for term in terms)
    return text.str.contains(pattern, regex=True).rename(None)

In [45]:
def build_calendar_flags(dates: pd.Series, cfg: dict) -> pd.DataFrame:
    """Build boolean seasonal / calendar-window flags for each date.

    Args:
        dates: Series of dates (anything ``pd.to_datetime`` accepts).
        cfg: Settings with the keys ``pre_ny_start_day``, ``gifts_q1_end_day``,
            ``back_to_school_start`` / ``back_to_school_end`` as
            ``(month, day)`` pairs, ``summer_months``,
            ``salary_window_start_day`` / ``salary_window_end_day`` and
            ``social_benefits_start_day`` / ``social_benefits_end_day``.

    Returns:
        DataFrame indexed like ``dates`` with the columns ``is_pre_new_year``,
        ``is_gifts_q1``, ``is_back_to_school``, ``is_summer``,
        ``is_salary_window`` and ``is_social_benefits_window``.
    """
    d = pd.to_datetime(dates)
    day = d.dt.day
    month = d.dt.month
    flags = pd.DataFrame(index=dates.index)

    # December from the configured day onwards.
    flags['is_pre_new_year'] = (month == 12) & (day >= cfg['pre_ny_start_day'])
    # All of February plus the start of March up to the configured day.
    flags['is_gifts_q1'] = (month == 2) | ((month == 3) & (day <= cfg['gifts_q1_end_day']))

    # Compare dates as month*100+day keys so the window is right for any
    # (month, day) pair: same month, several months, or wrapping the new year.
    start_m, start_d = cfg['back_to_school_start']
    end_m, end_d = cfg['back_to_school_end']
    month_day = month * 100 + day
    start_key = start_m * 100 + start_d
    end_key = end_m * 100 + end_d
    if start_key <= end_key:
        flags['is_back_to_school'] = (month_day >= start_key) & (month_day <= end_key)
    else:
        flags['is_back_to_school'] = (month_day >= start_key) | (month_day <= end_key)

    flags['is_summer'] = month.isin(cfg['summer_months'])
    # The salary window wraps around the month end, hence OR rather than AND.
    flags['is_salary_window'] = (day >= cfg['salary_window_start_day']) | (day <= cfg['salary_window_end_day'])
    flags['is_social_benefits_window'] = (day >= cfg['social_benefits_start_day']) & (day <= cfg['social_benefits_end_day'])
    return flags


In [46]:
def standardize_event_time(df: pd.DataFrame, anchor_date: pd.Timestamp) -> pd.DataFrame:
    """Return a copy of ``df`` with ``event_dt`` and ``date`` columns added.

    The time column is the first of ``event_time``, ``timestamp``, ``time``,
    ``ts`` that exists in ``df``. Timedelta values are treated as offsets from
    ``anchor_date``; anything else is parsed as a datetime (unparseable values
    become ``NaT``), converted to UTC and stripped of its timezone. ``date``
    is ``event_dt`` floored to the day.

    Args:
        df: Raw events.
        anchor_date: Origin added to timedelta-encoded event times.

    Returns:
        A copy of ``df`` with the two extra columns. For an empty frame both
        columns are added as ``NaT`` and no time column is required.

    Raises:
        ValueError: If ``df`` is non-empty and has no recognised time column.
    """
    df = df.copy()
    if df.empty:
        df['event_dt'] = pd.NaT
        df['date'] = pd.NaT
        return df
    time_col = choose_first_available(df, ['event_time', 'timestamp', 'time', 'ts'])
    if time_col is None:
        raise ValueError('Не нашли колонку со временем события')
    raw = df[time_col]
    # The pandas dtype check also accepts extension dtypes (tz-aware
    # datetimes, nullable integers), on which np.issubdtype raises TypeError.
    if pd.api.types.is_timedelta64_dtype(raw):
        event_dt = anchor_date + pd.to_timedelta(raw)
    else:
        event_dt = pd.to_datetime(raw, errors='coerce', utc=True)
        # Drop the timezone explicitly instead of swallowing a TypeError.
        if event_dt.dt.tz is not None:
            event_dt = event_dt.dt.tz_convert(None)
    df['event_dt'] = event_dt
    df['date'] = df['event_dt'].dt.floor('D')
    return df


## Загрузка и предобработка событий
- Фильтр `action_type != view` — оставляем действия, убираем шум просмотров.
- Нормализация времени (timedelta -> anchor_date).
- Сумма/категория, тематические флаги.


In [47]:
# Collect the event files in name order and load at most `n_files` of them.
event_files = sorted(EVENTS_DIR.glob('*.pq'))

if not event_files:
    use_files = []
    raw_events = pd.DataFrame()
else:
    n_files = FEATURE_PARAMS['n_files']
    # Slicing with None keeps every file, so no separate branch is needed.
    use_files = event_files[:n_files]
    raw_frames = list(map(pd.read_parquet, use_files))
    raw_events = pd.concat(raw_frames, ignore_index=True)
print(f'Загружено {len(raw_events):,} строк из {len(use_files)} файлов')

Загружено 1,549,942 строк из 100 файлов


In [48]:
# Drop 'view' events when an action_type column exists; otherwise keep all rows.
# NOTE(review): the recorded output shows equal row counts before and after,
# i.e. nothing was removed on this sample — confirm the actual action_type values.
rows_before = len(raw_events)
if 'action_type' not in raw_events.columns:
    events_filtered = raw_events.copy()
else:
    is_view = raw_events['action_type'].fillna('').eq('view')
    events_filtered = raw_events.loc[~is_view].copy()
rows_after = len(events_filtered)
print(f'Строк до: {rows_before:,} / после удаления view: {rows_after:,}')


Строк до: 1,549,942 / после удаления view: 1,549,942


In [49]:
# Normalise event times, then keep only rows with a valid timestamp and an
# integer user_id.
events = standardize_event_time(events_filtered, FEATURE_PARAMS['anchor_date'])
if not events.empty:
    events = (
        events
        .dropna(subset=['event_dt'])
        .assign(user_id=lambda frame: pd.to_numeric(frame['user_id'], errors='coerce').astype('Int64'))
        .dropna(subset=['user_id'])
    )
    # Nullable Int64 was only needed to carry missing ids up to the dropna.
    events['user_id'] = events['user_id'].astype(int)
else:
    print('Нет событий после фильтрации')
print('Событий после нормализации:', events.shape)


Событий после нормализации: (1549942, 9)


In [50]:
# Pick the spend column (if any) and expose it as a numeric `amount`.
# NOTE(review): the recorded output reports no amount column, so `amount` is
# 0.0 for every event in that run and all amount-based features are constant.
amount_col = choose_first_available(events, ['price', 'amount', 'sum', 'value'])
events['amount'] = (
    pd.to_numeric(events[amount_col], errors='coerce').fillna(0.0)
    if amount_col
    else 0.0
)

# Pick the category-like column and store it as strings.
category_col = choose_first_available(
    events,
    ['category', 'category_id', 'subdomain', 'domain', 'brand_id'],
)
if category_col:
    events[category_col] = events[category_col].astype(str)

print('Колонка суммы:', amount_col)
print('Колонка категории:', category_col)


Колонка суммы: None
Колонка категории: subdomain


In [51]:
# Columns whose text is searched for the thematic keywords. `category_col` may
# be None or repeat one of the literal names; detect_keyword_flag skips falsy
# entries and names that are not columns of the frame.
category_candidates = [category_col, 'brand_id', 'domain', 'subdomain', 'item_id', 'action_type']

# Car-related substrings (fuel, service, parking, tyres, insurance, taxi,
# car sharing, rental), in Latin and Cyrillic spellings.
# NOTE(review): matching is by substring, so very short tokens ('ev', 'bp',
# 'car', 'gas', 'sto', 'дт', 'сто') can hit unrelated words; the list also
# repeats some entries ('azk', 'parking', 'шин').
auto_keywords = [
    'auto', 'avto', 'car', 'cars', 'motor', 'moto', 'bike', 'auto-', 'авто', 'тачка', 'машин', 'мотор', 'мото',
    'fuel', 'gas', 'gasoline', 'benzin', 'benz', 'petrol', 'diesel', 'дт', 'дизел', 'бенз', 'бензин', 'газ',
    'ev', 'electro', 'electrocar', 'электро', 'электрокар', 'tesla',
    'azs', 'azk', 'azk', 'азс', 'азк', 'заправ', 'запра', 'тнк', 'лукойл', 'роснефть', 'газпромнефть', 'shell', 'bp',
    'sto', 'service', 'autoservice', 'autoserv', 'шиномон', 'шин', 'автосервис', 'сто', 'сервис авто', 'ремавто',
    'parking', 'parkov', 'парков', 'паркинг', 'паркомат', 'парковка', 'parkomat',
    'tire', 'tyre', 'шин', 'шина', 'резина', 'колес', 'колёс',
    'oil', 'масло', 'маслосервис', 'filter', 'фильтр',
    'wash', 'carwash', 'автомой', 'мойка', 'химчистка авто',
    'osago', 'kasko', 'осаго', 'каско', 'страховка авто',
    'taxi', 'uber', 'yandex go', 'яндекс го', 'яндекс.го', 'такси', 'gettaxi', 'gett', 'bolt',
    'parking', 'carsharing', 'шеринг', 'каршер', 'карш', 'драйв', 'citydrive', 'belka', 'belkacar', 'делимобиль', 'youdrive',
    'rentacar', 'прокат авто', 'аренда авто', 'аренда маш', 'автопрокат'
]

# Home / renovation / real-estate substrings, in Latin and Cyrillic spellings.
# NOTE(review): short tokens ('ип', 'рем', 'пол', 'дом', 'apt') match inside
# many unrelated words, 'ремавто' also appears in the auto list, and ' аренда '
# only matches when surrounded by spaces — review before relying on this flag.
home_keywords = [
    'home', 'house', 'flat', 'apartment', 'apt', 'room', 'rent', ' аренда ', 'съем', 'снять', 'квартира', 'кварти', 'комната', 'комнат', 'дом', 'дача',
    'mortgage', 'ipoteka', 'ипотек', 'ipot', 'ипот', 'мортг', 'ип', 'ипотечный',
    'repair', 'remont', 'ремонт', 'рем', 'ремавто', 'строи', 'строй', 'строит', 'build', 'construction', 'строительство',
    'materials', 'материал', 'материалы', 'строймат', 'стройматериалы', 'cement', 'цемент', 'шпатлев', 'штукатур', 'гипс', 'краска', 'paint', 'paintwork',
    'furniture', 'mebel', 'мебел', 'мебель', 'диван', 'кровать', 'стол', 'стул', 'шкаф', 'кухня', 'kitchen', 'wardrobe',
    'appliance', 'техника', 'быттех', 'холодильник', 'посудомой', 'стирал', 'микровол', 'духовка', 'плита', 'oven', 'fridge', 'washer', 'dishwasher',
    'floor', 'пол', 'ламинат', 'паркет', 'линолеум', 'плитка', 'tile', 'ceram', 'керам', 'керамич',
    'wall', 'wallpaper', 'обои', 'обой', 'панель', 'панел', 'гкл', 'гипсокартон',
    'door', 'двер', 'окн', 'window', 'окон', 'стекл', 'стеклопакет', 'пластик окно',
    'bath', 'ванн', 'душ', 'смесител', 'кран', 'санузел', 'туалет', 'раковина', 'унитаз', 'santech', 'сантеx', 'сантех',
    'decor', 'декор', 'карниз', 'штора', 'занавес', 'ковер', 'плед', 'подушка', 'light', 'свет', 'люстра', 'светильник',
    'rent', 'аренда', 'съём', 'жильё', 'жилье', 'hypothec', 'estate', 'realty', 'недвиж', 'риелт', 'агентство недвижимости',
    'insurance', 'страхование имущества', 'страховка жилья', 'property insurance'
]

# Per-event thematic flags; the printed means are the share of flagged events
# (both are 0.0 in the recorded output of this cell).
events['auto_related'] = detect_keyword_flag(events, category_candidates, auto_keywords)
events['home_related'] = detect_keyword_flag(events, category_candidates, home_keywords)
print('auto_related mean:', events['auto_related'].mean() if len(events) else 0)
print('home_related mean:', events['home_related'].mean() if len(events) else 0)

auto_related mean: 0.0
home_related mean: 0.0


## Дневные фичи и сезонные окна
- Сумма/частота за день, разнообразие категорий
- Самая значимая категория дня
- Авто/ремонт активность в день
- High-spend дни и «накопители»
- Сезонность (предНГ, подарки, школа, лето, зарплаты, соцвыплаты)


In [52]:
# Per user-day totals: summed amount, event count and, when a category column
# exists, the number of distinct categories.
agg_dict = dict(
    daily_amount=('amount', 'sum'),
    daily_events=('event_dt', 'size'),
)
if category_col:
    agg_dict['unique_categories'] = (category_col, 'nunique')

if len(events):
    daily = events.groupby(['user_id', 'date']).agg(**agg_dict).reset_index()
else:
    daily = pd.DataFrame()

if len(daily):
    daily = daily.sort_values(['user_id', 'date'])

    # Replace each raw daily value with its per-user expanding (running) mean;
    # the raw columns are dropped afterwards.
    running_mean_names = {
        'daily_amount': 'avg_daily_amount',
        'daily_events': 'avg_daily_events',
        'unique_categories': 'avg_unique_categories',
    }
    for raw_name, avg_name in running_mean_names.items():
        if raw_name in daily.columns:
            daily[avg_name] = daily.groupby('user_id')[raw_name].transform(lambda s: s.expanding().mean())

    daily = daily.drop(columns=list(running_mean_names), errors='ignore')

print('Размер daily:', daily.shape)
daily.head()

Размер daily: (386164, 5)
   user_id       date  avg_daily_amount  avg_daily_events  \
0      195 2024-08-26               0.0               1.0   
1      195 2024-08-27               0.0               2.0   
2      444 2024-06-10               0.0               1.0   
3      444 2024-06-11               0.0               1.0   
4     1514 2024-06-23               0.0               1.0   

   avg_unique_categories  
0                    1.0  
1                    1.0  
2                    1.0  
3                    1.0  
4                    1.0  


In [53]:
# Attach the top category of each user-day: the one with the largest summed
# amount, with the event count as the secondary sort key.
if len(daily) and category_col:
    day_keys = ['user_id', 'date']
    cat_daily = (
        events
        .groupby(day_keys + [category_col])
        .agg(category_amount=('amount', 'sum'), category_events=('amount', 'size'))
        .reset_index()
    )
    ranked = cat_daily.sort_values(['category_amount', 'category_events'], ascending=False)
    top_cat = ranked.groupby(day_keys).head(1).rename(columns={category_col: 'top_category'})
    top_cat_columns = day_keys + ['top_category', 'category_amount', 'category_events']
    daily = daily.merge(top_cat[top_cat_columns], on=day_keys, how='left')


In [54]:
# Lift the per-event thematic flags to user-day level: a day is flagged when
# any of its events is flagged.
day_flag_names = {'auto_related': 'is_auto_active', 'home_related': 'is_home_repair_period'}
for source_col, target_col in day_flag_names.items():
    if not (source_col in events.columns and len(events)):
        daily[target_col] = False
        continue
    flag = (
        events.groupby(['user_id', 'date'])[source_col]
        .any()
        .rename(target_col)
        .reset_index()
    )
    if len(daily):
        daily = daily.merge(flag, on=['user_id', 'date'], how='left')
    else:
        daily = flag

# Days without a match after the left merge count as "not flagged".
for col in day_flag_names.values():
    if col in daily.columns:
        daily[col] = daily[col].fillna(False)


In [57]:
# High-spend and "saver" day flags, both derived from avg_daily_amount.
if not len(daily):
    daily['is_high_spend_day'] = pd.Series(dtype='bool')
    daily['saver_share_30d'] = pd.Series(dtype='float64')
    daily['is_saver'] = pd.Series(dtype='bool')
else:
    q = FEATURE_PARAMS['high_spend_quantile']
    win = FEATURE_PARAMS['saver_window']
    min_p = FEATURE_PARAMS['saver_min_periods']
    spend_by_user = daily.groupby('user_id')['avg_daily_amount']

    # A day is high-spend when it reaches the user's own q-quantile.
    p_user = spend_by_user.transform(lambda s: s.quantile(q))
    daily['is_high_spend_day'] = daily['avg_daily_amount'].ge(p_user)

    # Share of at-or-below-median days inside a rolling window of the user's
    # rows (the window counts rows, not calendar days).
    low_spend_flag = daily['avg_daily_amount'].le(spend_by_user.transform('median'))
    daily['saver_share_30d'] = low_spend_flag.groupby(daily['user_id']).transform(
        lambda s: s.rolling(win, min_periods=min_p).mean()
    )
    daily['is_saver'] = daily['saver_share_30d'].ge(FEATURE_PARAMS['saver_threshold'])

In [58]:
# Add the seasonal window flags plus basic calendar columns to `daily`.
if len(daily):
    calendar_flags = build_calendar_flags(daily['date'], FEATURE_PARAMS)
    daily = pd.concat([daily, calendar_flags], axis=1)
    day_stamp = pd.to_datetime(daily['date'])
    daily['month'] = day_stamp.dt.month
    daily['dayofweek'] = day_stamp.dt.dayofweek
    # dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
    daily['is_weekend'] = daily['dayofweek'] >= 5
    daily['weekofyear'] = day_stamp.dt.isocalendar().week.astype(int)
else:
    for empty_col in ('month', 'dayofweek', 'is_weekend', 'weekofyear'):
        daily[empty_col] = []
print(daily.head())


   user_id       date  avg_daily_amount  avg_daily_events  \
0      195 2024-08-26               0.0               1.0   
1      195 2024-08-27               0.0               2.0   
2      444 2024-06-10               0.0               1.0   
3      444 2024-06-11               0.0               1.0   
4     1514 2024-06-23               0.0               1.0   

   avg_unique_categories top_category  category_amount  category_events  \
0                    1.0          u2i              0.0                1   
1                    1.0          u2i              0.0                3   
2                    1.0      catalog              0.0                1   
3                    1.0          u2i              0.0                1   
4                    1.0       search              0.0                1   

   is_auto_active  is_home_repair_period  ...  is_pre_new_year  is_gifts_q1  \
0           False                  False  ...            False        False   
1           False       

## Пользовательские фичи (агрегация по user_id)
Сворачиваем дневное поведение в стационарные (агрегированные по пользователю) метрики для кластеризации и UMAP.


In [60]:
# Collapse the daily table into one row of features per user.
# NOTE(review): the *_amount / mean_events features aggregate the avg_* columns,
# which are already per-user expanding means — e.g. `total_amount` is a sum of
# running averages, not a total spend; confirm this is intended.
if len(daily):
    user_agg_spec = {
        'days': ('date', 'nunique'),
        'total_amount': ('avg_daily_amount', 'sum'),
        'mean_amount': ('avg_daily_amount', 'mean'),
        'median_amount': ('avg_daily_amount', 'median'),
        'max_amount': ('avg_daily_amount', 'max'),
        'mean_events': ('avg_daily_events', 'mean'),
        'unique_categories_mean': ('avg_unique_categories', 'mean'),
        # The mean of a boolean day flag is the share of the user's days with it.
        'auto_day_share': ('is_auto_active', 'mean'),
        'home_day_share': ('is_home_repair_period', 'mean'),
        'high_spend_share': ('is_high_spend_day', 'mean'),
        'saver_share': ('is_saver', 'mean'),
        'weekend_share': ('is_weekend', 'mean'),
        'pre_ny_share': ('is_pre_new_year', 'mean'),
        'gifts_q1_share': ('is_gifts_q1', 'mean'),
        'bts_share': ('is_back_to_school', 'mean'),
        'summer_share': ('is_summer', 'mean'),
        'salary_window_share': ('is_salary_window', 'mean'),
        'social_benefits_share': ('is_social_benefits_window', 'mean'),
    }
    user_feats = daily.groupby('user_id').agg(**user_agg_spec).reset_index()
else:
    user_feats = pd.DataFrame(columns=['user_id'])
print('user_feats shape:', user_feats.shape)

user_feats shape: (168846, 19)


## Продуктовые таргеты + демография
Используем сигнальные фичи как мягкие таргеты типов продуктов, склеиваем с socdem_cluster/region.


In [61]:
def apply_feature_product_mapping(df: pd.DataFrame, feature_to_types: dict, product_types: list) -> pd.DataFrame:
    """Turn signal columns of ``df`` into per-product-type target flags.

    A row's ``target_<type>`` is ``True`` when at least one feature mapped to
    that type is truthy in the row. Missing values count as ``False``, and
    non-boolean flag columns (0/1 numbers, object columns) are coerced to
    booleans first.

    Args:
        df: Frame holding the signal columns.
        feature_to_types: Mapping ``feature column -> list of product types``.
            Features absent from ``df`` and types absent from
            ``product_types`` are ignored.
        product_types: Product types to build targets for; also fixes the
            order of the ``target_*`` columns.

    Returns:
        DataFrame indexed like ``df`` with one boolean ``target_<type>``
        column per product type, plus ``candidate_product_types`` holding the
        list of types whose target is set for that row.
    """
    target_cols = [f'target_{pt}' for pt in product_types]
    targets = pd.DataFrame(index=df.index)
    for col in target_cols:
        targets[col] = False

    for feat, pts in feature_to_types.items():
        if feat not in df.columns:
            continue
        # Coerce once per feature so `|` never meets a float or object column.
        active = df[feat].fillna(False).astype(bool)
        for pt in pts:
            if pt in product_types:
                targets[f'target_{pt}'] = targets[f'target_{pt}'] | active

    # Build the candidate lists from one boolean matrix instead of a
    # row-wise DataFrame.apply.
    flag_matrix = targets[target_cols].to_numpy(dtype=bool)
    targets['candidate_product_types'] = pd.Series(
        [[pt for pt, is_on in zip(product_types, row) if is_on] for row in flag_matrix],
        index=targets.index,
        dtype=object,
    )
    return targets


In [62]:
# Load the product catalogue. The encoding is stated explicitly so decoding
# does not depend on the platform's default text encoding.
with open(PRODUCTS_PATH, 'r', encoding='utf-8') as f:
    products = json.load(f)
product_types = sorted({p['product_type'] for p in products})

# Which product types each daily signal counts towards.
feature_to_types = {
    'is_pre_new_year': ['loan', 'credit_card'],
    'is_gifts_q1': ['credit_card', 'debit_card', 'premium_service'],
    'is_back_to_school': ['credit_card', 'loan', 'savings_account', 'deposit'],
    'is_summer': ['insurance', 'debit_card'],
    'is_salary_window': ['debit_card', 'deposit'],
    'is_social_benefits_window': ['savings_account', 'deposit'],
    'is_high_spend_day': ['loan', 'credit_card'],
    'is_saver': ['deposit', 'savings_account', 'investment'],
    'is_auto_active': ['insurance', 'debit_card'],
    'is_home_repair_period': ['mortgage', 'loan', 'insurance'],
}

if len(daily):
    targets_daily = apply_feature_product_mapping(daily, feature_to_types, product_types)
    daily_with_targets = pd.concat([daily, targets_daily], axis=1)

    # The per-user target is the share of the user's days on which the daily
    # target flag is set; the list-valued column is left out of the mean.
    numeric_target_cols = [col for col in targets_daily.columns if col != 'candidate_product_types']
    user_targets = daily_with_targets.groupby('user_id')[numeric_target_cols].mean().reset_index()
else:
    user_targets = pd.DataFrame(columns=['user_id'])
print('user_targets shape:', user_targets.shape)

user_targets shape: (168846, 14)


In [63]:
# Load the per-user table when the file exists; otherwise continue without it.
users_df = pd.read_parquet(USERS_PATH) if USERS_PATH.exists() else pd.DataFrame(columns=['user_id'])
if not users_df.empty:
    # Nullable integer dtype keeps missing socdem clusters as <NA>.
    users_df['socdem_cluster'] = users_df['socdem_cluster'].astype('Int64')

# Behavioural features + product targets, one row per user.
full = user_feats.merge(user_targets, on='user_id', how='left') if len(user_feats) else pd.DataFrame()
# Merge the user table only when `full` has the join key: with no user
# features `full` is a column-less frame and merging on 'user_id' would raise.
if not users_df.empty and 'user_id' in full.columns:
    full = full.merge(users_df, on='user_id', how='left')
print('full shape:', full.shape)
full.head()


full shape: (168846, 34)


Unnamed: 0,user_id,days,total_amount,mean_amount,median_amount,max_amount,mean_events,unique_categories_mean,auto_day_share,home_day_share,...,target_investment,target_investment_education,target_investment_ideas,target_loan,target_mortgage,target_partner_program,target_premium_service,target_savings_account,socdem_cluster,region
0,195,2,0.0,0.0,0.0,0.0,1.5,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9,18.0
1,444,2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,8,
2,1514,3,0.0,0.0,0.0,0.0,1.444444,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17,2.0
3,1979,5,0.0,0.0,0.0,0.0,5.43,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.6,19,81.0
4,3849,2,0.0,0.0,0.0,0.0,16.5,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,17,54.0


In [64]:
# Write the assembled per-user table to <PROJECT_ROOT>/data/full_data.parquet,
# creating the directory if needed.
output_path = PROJECT_ROOT.joinpath('data', 'full_data.parquet')
output_path.parent.mkdir(parents=True, exist_ok=True)
full.to_parquet(output_path, index=False)
print(f'DataFrame full сохранен в: {output_path}')

DataFrame full сохранен в: /content/drive/MyDrive/aviahack/data/full_data.parquet


Проверить первые 5 строк сохраненного файла можно так:

In [65]:
# Read the saved file back and show its first rows.
pd.read_parquet(PROJECT_ROOT.joinpath('data', 'full_data.parquet')).head()

Unnamed: 0,user_id,days,total_amount,mean_amount,median_amount,max_amount,mean_events,unique_categories_mean,auto_day_share,home_day_share,...,target_investment,target_investment_education,target_investment_ideas,target_loan,target_mortgage,target_partner_program,target_premium_service,target_savings_account,socdem_cluster,region
0,195,2,0.0,0.0,0.0,0.0,1.5,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,9,18.0
1,444,2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,8,
2,1514,3,0.0,0.0,0.0,0.0,1.444444,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,17,2.0
3,1979,5,0.0,0.0,0.0,0.0,5.43,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.6,19,81.0
4,3849,2,0.0,0.0,0.0,0.0,16.5,2.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,17,54.0


# Профили кластеров

In [31]:
# Share of each socdem_cluster inside every behavioural cluster, largest first.
# NOTE(review): `full['cluster']` is not created in the cells shown here —
# presumably it is added by a clustering cell; confirm.
if len(full):
    socdem_shares = full.groupby('cluster')['socdem_cluster'].value_counts(normalize=True)
    socdem_major = (
        socdem_shares
        .rename('share')
        .reset_index()
        .sort_values(['cluster', 'share'], ascending=[True, False])
    )
    print('Socdem распределение по кластерам (top):')
    top_three_per_cluster = socdem_major.groupby('cluster').head(3)
    display(top_three_per_cluster)

Socdem распределение по кластерам (top):


Unnamed: 0,cluster,socdem_cluster,share
0,0,9,0.272549
1,0,20,0.207793
2,0,12,0.116038
20,1,9,0.28011
21,1,20,0.23008
22,1,12,0.126714
41,2,9,0.299829
42,2,20,0.182687
43,2,12,0.121748
62,3,20,0.369871


In [32]:
# Print the mapping from cluster id to socdem_cluster.
# NOTE(review): `cluster_to_socdem` is not defined in the cells shown here —
# presumably it is built in an earlier clustering cell; confirm.
print('Сопоставление cluster (id) и socdem_cluster:')
print(cluster_to_socdem)

Сопоставление cluster (id) и socdem_cluster:
{0: 9, 1: 9, 2: 9, 3: 20, 4: 9}


In [38]:
# Cluster profiles: mean of every numeric feature / target per cluster, plus
# the most frequent socdem_cluster of that cluster.
# NOTE(review): `feature_cols` is not defined in the cells shown here; also,
# `region` looks like a categorical code, so its mean may not be meaningful.
if not len(full):
    print('Нет данных для анализа профилей кластеров')
else:
    all_numeric_cols = [c for c in feature_cols if c != 'socdem_cluster'] + numeric_target_cols
    # Order-preserving de-duplication.
    numeric_cols_for_mean = list(dict.fromkeys(all_numeric_cols))

    df_for_means = full[['cluster'] + numeric_cols_for_mean].copy()
    cluster_profiles_means = (
        df_for_means.groupby('cluster')[numeric_cols_for_mean].mean().reset_index()
    )

    # Row of the largest share per cluster gives that cluster's dominant socdem.
    idx = socdem_major.groupby('cluster')['share'].idxmax()
    cluster_to_socdem_map = socdem_major.loc[idx].set_index('cluster')['socdem_cluster'].to_dict()
    cluster_socdem_mode = (
        pd.DataFrame.from_dict(cluster_to_socdem_map, orient='index', columns=['mapped_socdem'])
        .reset_index()
        .rename(columns={'index': 'cluster'})
    )

    cluster_profiles = cluster_profiles_means.merge(cluster_socdem_mode, on='cluster', how='left')

    # Put the identifying columns first, then everything else in its order.
    leading = ['cluster', 'mapped_socdem']
    cols = leading + [col for col in cluster_profiles.columns if col not in leading]
    cluster_profiles = cluster_profiles[cols]

    print('Профили кластеров (средние значения признаков):')
    display(cluster_profiles.round(2))

Профили кластеров (средние значения признаков):


Unnamed: 0,cluster,mapped_socdem,days,total_amount,mean_amount,median_amount,max_amount,mean_events,unique_categories_mean,auto_day_share,...,target_insurance,target_investment,target_investment_education,target_investment_ideas,target_loan,target_mortgage,target_partner_program,target_premium_service,target_savings_account,region
0,0,9,1.74,0.0,0.0,0.0,0.0,3.03,1.07,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.68,39.64
1,1,9,2.76,0.0,0.0,0.0,0.0,4.76,1.35,0.0,...,0.99,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.34,38.98
2,2,9,1.3,0.0,0.0,0.0,0.0,3.29,1.01,0.0,...,0.96,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.37,42.29
3,3,20,1.06,0.0,0.0,0.0,0.0,1.53,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.59,28.11
4,4,9,1.01,0.0,0.0,0.0,0.0,1.35,1.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.56,24.18


# **Основные различия между кластерами**

* **Уровень активности:**
  Самый активный — **Кластер 1**.
  Наименее активные — **Кластеры 3 и 4**.

* **Интерес к продуктам сбережений:**
  Высокий — **Кластеры 0, 3 и 4**.
  Низкий — **Кластеры 1 и 2**.

---

# **Профили кластеров**

## **Кластер 0 — Умеренные сберегатели**

* **Демография:** socdem_cluster 9
* **Активность:** 1.74 дня активности, 3.03 событий/день, среднее разнообразие категорий (1.07)
* **Интересы:** высокие сбережения (0.68), депозиты (0.68), страхование (1.00), интерес к кредитам и кредитным картам

---

## **Кластер 1 — Активные потребители**

* **Демография:** socdem_cluster 9
* **Активность:** самая высокая — 2.76 дня активности, 4.76 событий/день, высокое разнообразие (1.35)
* **Интересы:** низкие сбережения (0.34), депозиты (0.34), высокий интерес к кредитам, картам, страхованию (0.99)

---

## **Кластер 2 — Низкоактивные сберегатели**

* **Демография:** socdem_cluster 9
* **Активность:** низкая — 1.30 дня активности, 3.29 событий/день, низкое разнообразие (1.01)
* **Интересы:** умеренные сбережения (0.37), депозиты (0.37), немного ниже интерес к страхованию (0.96), высокий интерес к кредитам и картам

---

## **Кластер 3 — Отличающиеся соцдем + пассивные**

* **Демография:** в основном socdem_cluster 20; отличаются по региону
* **Активность:** очень низкая — 1.06 дня активности, 1.53 событий/день, минимальное разнообразие (1.00)
* **Интересы:** высокие сбережения (0.59), депозиты (0.59), страхование (1.00), интерес к кредитам и картам

---

## **Кластер 4 — Очень пассивные сберегатели**

* **Демография:** socdem_cluster 9
* **Активность:** 1.01 дня активности, 1.35 событий/день, низкое разнообразие (1.00)
* **Интересы:** высокие сбережения (0.56), депозиты (0.56), страхование (1.00), интерес к кредитам и картам


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN

# Load the source data (the parquet path can be changed).
df = pd.read_parquet('user_clusters.parquet')

# Keep only the features that drive the differences between clusters.
features = [
    'days', 'mean_events', 'unique_categories_mean',
    'saver_share', 'target_deposit', 'target_insurance',
    'target_credit_card', 'target_loan', 'target_savings_account',
    'socdem_cluster', 'region'
]
# KMeans rejects NaN, so coerce every feature to float and fill gaps with the
# column median of the real users (0.0 if a column has no values at all).
# NOTE(review): socdem_cluster and region look like categorical codes; scaling
# and median-imputing them as numbers is a modelling shortcut — confirm.
X = df[features].apply(pd.to_numeric, errors='coerce').astype('float64')
X = X.fillna(X.median()).fillna(0.0)

# Anchor ("ideal") users, one per cluster, taken from the cluster descriptions.
# Each row follows the order of `features`.
# NOTE(review): the anchor `region` values repeat the socdem_cluster ids (9 / 20),
# while the profile table reports region means of roughly 24-42 — confirm.
anchor_rows = [
    (1.74, 3.03, 1.07, 0.68, 0.68, 1.0, 1.0, 1.0, 0.68, 9, 9),    # cluster 0: moderate saver
    (2.76, 4.76, 1.35, 0.34, 0.34, 0.99, 1.0, 1.0, 0.34, 9, 9),   # cluster 1: active consumer
    (1.30, 3.29, 1.01, 0.37, 0.37, 0.96, 1.0, 1.0, 0.37, 9, 9),   # cluster 2: low-activity saver
    (1.06, 1.53, 1.00, 0.59, 0.59, 1.0, 1.0, 1.0, 0.59, 20, 20),  # cluster 3: passive, different socdem
    (1.01, 1.35, 1.00, 0.56, 0.56, 1.0, 1.0, 1.0, 0.56, 9, 9),    # cluster 4: very passive saver
]
anchor_users = pd.DataFrame(anchor_rows, columns=features)

# The anchors are appended as extra rows, so they take part in both the
# scaling and the clustering.
X_with_anchors = pd.concat([X, anchor_users], ignore_index=True)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_with_anchors)

anchor_scaled = scaler.transform(anchor_users[features])
kmeans = KMeans(
    n_clusters=len(anchor_users),
    init=anchor_scaled,  # seeds the centres at the anchors; KMeans still moves them while fitting
    n_init=1,
    random_state=42
)
kmeans_labels = kmeans.fit_predict(X_scaled)

# Labels of the real users only (the anchors are the trailing rows).
original_labels_kmeans = kmeans_labels[:len(X)]
