# Categorical EDA (Karim) - простой рабочий ноутбук

Запускай ячейки сверху вниз. Если появляется редкая ошибка `ArrowKeyError` с текстом
`pandas.period already defined`, перезапусти Kernel и снова выполни Run All.


In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from pyarrow.lib import ArrowKeyError

import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import chi2_contingency, kruskal
from sklearn.feature_selection import mutual_info_classif

# Notebook-wide presentation settings: seaborn grid theme and wider pandas
# display limits so large summary tables are not truncated.
sns.set_theme(style='whitegrid')
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200


In [None]:
# ====== Direct paths (no auto-magic) ======
PROJECT_ROOT = Path('/Users/karimkhabib/Documents/Projects Programming/PyCharm/data-fusion-contest')
DATA_DIR = PROJECT_ROOT.joinpath('src', 'data')
OUT_DIR = PROJECT_ROOT.joinpath('outputs', 'categorical_analysis')
# Create the output directory up front so later cells can write into it.
OUT_DIR.mkdir(parents=True, exist_ok=True)

LABELS_PATH = DATA_DIR.joinpath('train_labels.parquet')
TRAIN_FILES = [
    DATA_DIR / file_name
    for file_name in (
        'train_part_1.parquet',
        'train_part_2.parquet',
        'train_part_3.parquet',
    )
]

# Quick mode for draft EDA: when enabled, the labeled event ids are
# subsampled down to MAX_LABEL_EVENTS.
QUICK_MODE = False
MAX_LABEL_EVENTS = 200_000
# Number of rows per batch when streaming the train parquet files.
BATCH_SIZE = 500_000
# Placeholder category that replaces missing values in categorical columns.
MISSING_TOKEN = '__MISSING__'

print('LABELS_PATH:', LABELS_PATH)
print('TRAIN FILES:')
for train_path in TRAIN_FILES:
    print(f' - {train_path}')


In [None]:
def safe_read_parquet(path, columns=None):
    """Read a parquet file into a DataFrame, with a fallback for ArrowKeyError.

    The normal path is ``pd.read_parquet``. If that raises ``ArrowKeyError``,
    the ``pandas.period`` and ``pandas.interval`` pyarrow extension types are
    unregistered (best effort) and the file is read through pyarrow directly.

    Args:
        path: Location of the parquet file.
        columns: Optional list of column names to load; ``None`` loads all.

    Returns:
        The file contents as a pandas DataFrame.
    """
    try:
        frame = pd.read_parquet(path, columns=columns)
    except ArrowKeyError:
        import pyarrow as pa

        # Best effort: a type that is not registered makes unregistering
        # fail, which is fine here, so any error is deliberately ignored.
        for ext_name in ('pandas.period', 'pandas.interval'):
            try:
                pa.unregister_extension_type(ext_name)
            except Exception:
                continue
        frame = pq.read_table(path, columns=columns).to_pandas()
    return frame

# Load the labels and keep only the id/target pair.
labels = safe_read_parquet(LABELS_PATH)[['event_id', 'target']].copy()

# Coerce both columns to nullable integers; unparsable values become <NA>.
for _col in ('event_id', 'target'):
    labels[_col] = pd.to_numeric(labels[_col], errors='coerce').astype('Int64')

# Drop incomplete rows, switch to compact plain integer dtypes, and keep the
# last record for every event_id.
labels = (
    labels.dropna(subset=['event_id', 'target'])
    .astype({'event_id': 'int64', 'target': 'int8'})
    .drop_duplicates('event_id', keep='last')
    .reset_index(drop=True)
)

print('labels shape:', labels.shape)
print(labels['target'].value_counts())
print('positive rate:', round(float(labels['target'].mean()), 6))


In [None]:
# Columns treated as categorical features in this analysis.
CATEGORICAL_COLUMNS = [
    'event_type_nm',
    'event_desc',
    'channel_indicator_type',
    'channel_indicator_sub_type',
    'currency_iso_cd',
    'mcc_code',
    'pos_cd',
    'accept_language',
    'browser_language',
    'timezone',
    'operating_system_type',
    'device_system_version',
    'screen_size',
    'developer_tools',
    'phone_voip_call_state',
    'web_rdp_connection',
    'compromised',
]

# Only the column names are needed here, so read just the parquet schema
# instead of loading the whole first train file into memory.
sample_cols = pq.read_schema(TRAIN_FILES[0]).names
# Keep the declared order, restricted to columns present in the data.
AVAILABLE_CAT_COLS = [c for c in CATEGORICAL_COLUMNS if c in sample_cols]
print('available categorical columns:', len(AVAILABLE_CAT_COLS))
print(AVAILABLE_CAT_COLS)


In [None]:
# Event ids that have a label; in quick mode a fixed-seed random subset of
# at most MAX_LABEL_EVENTS ids is used instead of the full list.
label_event_ids = labels['event_id'].tolist()
if QUICK_MODE:
    if len(label_event_ids) > MAX_LABEL_EVENTS:
        rng = np.random.default_rng(42)
        label_event_ids = rng.choice(
            label_event_ids,
            size=MAX_LABEL_EVENTS,
            replace=False,
        ).tolist()
# A set is what the batch filter below receives as its lookup collection.
label_event_ids = set(label_event_ids)

print('using label event ids:', len(label_event_ids))


def collect_labeled_rows(train_files, label_event_ids, feature_columns, batch_size=500_000):
    """Stream parquet files and keep only the rows whose event_id is labeled.

    Args:
        train_files: Iterable of parquet file paths to scan.
        label_event_ids: Collection of event ids to keep.
        feature_columns: List of feature column names to load alongside
            ``event_id``.
        batch_size: Maximum number of rows read per batch.

    Returns:
        DataFrame with ``event_id`` followed by ``feature_columns``. When an
        event_id occurs more than once, the last occurrence is kept. If no
        row matches, an empty DataFrame with those columns is returned.
    """
    read_cols = ['event_id'] + feature_columns
    kept = []

    for file_path in train_files:
        parquet_file = pq.ParquetFile(file_path)
        for record_batch in parquet_file.iter_batches(columns=read_cols, batch_size=batch_size):
            frame = record_batch.to_pandas()
            is_labeled = frame['event_id'].isin(label_event_ids)
            if not is_labeled.any():
                continue
            kept.append(frame.loc[is_labeled, read_cols].copy())

    if kept:
        combined = pd.concat(kept, ignore_index=True)
        return combined.drop_duplicates('event_id', keep='last')
    return pd.DataFrame(columns=read_cols)

# Pull the categorical columns for every labeled event from the train files.
labeled_features = collect_labeled_rows(
    train_files=TRAIN_FILES,
    label_event_ids=label_event_ids,
    feature_columns=AVAILABLE_CAT_COLS,
    batch_size=BATCH_SIZE,
)

print('labeled feature rows:', labeled_features.shape)


In [None]:
# Inner join: keep only events that have both a label and feature rows.
df = pd.merge(labels, labeled_features, how='inner', on='event_id')
print('merged shape:', df.shape)
print('merged positive rate:', round(float(df['target'].mean()), 6))
df.head(3)


In [None]:
def prep_cat(s):
    """Cast a Series to pandas ``string`` dtype and replace missing values with MISSING_TOKEN."""
    as_text = s.astype('string')
    return as_text.fillna(MISSING_TOKEN)


def calc_cramers_v(cat, y):
    """Chi-square test and bias-corrected Cramér's V for a feature/target pair.

    Args:
        cat: Series of category values.
        y: Series of target values, aligned with ``cat``.

    Returns:
        Tuple ``(chi2, p_value, cramers_v)``. ``chi2`` and ``p_value`` come
        from ``chi2_contingency`` with its default settings. All three are
        NaN when the contingency table has fewer than two rows or columns;
        ``cramers_v`` alone is NaN when it cannot be computed.
    """
    table = pd.crosstab(cat, y)
    r, k = table.shape
    if r < 2 or k < 2:
        return np.nan, np.nan, np.nan

    chi2, p, _, _ = chi2_contingency(table)
    n = table.to_numpy().sum()
    if n <= 1:
        return chi2, p, np.nan

    # chi2_contingency applies Yates' continuity correction by default when
    # there is one degree of freedom (a 2x2 table). That shrinks the statistic
    # and would make V for binary features systematically lower than, and not
    # comparable with, V for multi-level features. The effect size is
    # therefore computed from the uncorrected statistic; the reported
    # chi2 / p-value are left as they were.
    if (r - 1) * (k - 1) == 1:
        chi2_for_v, _, _, _ = chi2_contingency(table, correction=False)
    else:
        chi2_for_v = chi2

    # Bias-corrected Cramér's V: shrink phi^2 and the table dimensions by
    # their expected small-sample bias before normalising.
    phi2 = chi2_for_v / n
    phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corr = r - ((r - 1) ** 2) / (n - 1)
    k_corr = k - ((k - 1) ** 2) / (n - 1)
    denom = min(k_corr - 1, r_corr - 1)
    v = np.sqrt(phi2_corr / denom) if denom > 0 else np.nan
    return chi2, p, v


def calc_mi(cat, y):
    """Mutual information between one categorical feature and the target.

    The categories are integer-encoded with ``pd.factorize`` and passed to
    ``mutual_info_classif`` as a single discrete feature column.
    """
    codes = pd.factorize(cat, sort=False)[0]
    feature_matrix = codes.reshape(-1, 1)
    scores = mutual_info_classif(
        feature_matrix,
        y.values,
        discrete_features=True,
        random_state=42,
    )
    return scores[0]


def calc_kruskal(cat, y):
    """Kruskal-Wallis H-test of the target across the categories of a feature.

    Args:
        cat: Series of category values.
        y: Series of target values, aligned with ``cat``.

    Returns:
        Tuple ``(h_statistic, p_value)``. Both are NaN when fewer than two
        categories have at least two observations, or when the test cannot
        be computed (for example, every target value is identical).
    """
    # One groupby pass instead of a full boolean scan per unique category,
    # which is quadratic-like on high-cardinality columns.
    groups = [
        values.to_numpy()
        for _, values in y.groupby(cat, sort=False)
        if len(values) >= 2
    ]
    if len(groups) < 2:
        return np.nan, np.nan
    try:
        h, p = kruskal(*groups, nan_policy='omit')
    except ValueError:
        # Some SciPy versions raise when all values are identical; report
        # the statistic as undefined rather than aborting the caller.
        return np.nan, np.nan
    return h, p


In [None]:
# One row of association metrics per available categorical column.
y = df['target'].astype('int8')
metrics_rows = []

for col in AVAILABLE_CAT_COLS:
    cat = prep_cat(df[col])
    chi2_stat, chi2_p, c_v = calc_cramers_v(cat, y)
    mi = calc_mi(cat, y)
    kr_h, kr_p = calc_kruskal(cat, y)

    row = dict(
        feature=col,
        n_unique=int(cat.nunique(dropna=False)),
        # Missing values were replaced by MISSING_TOKEN in prep_cat.
        missing_rate=float(cat.eq(MISSING_TOKEN).mean()),
        mutual_info=float(mi),
        cramers_v=np.nan if pd.isna(c_v) else float(c_v),
        chi2_p_value=np.nan if pd.isna(chi2_p) else float(chi2_p),
        kruskal_p_value=np.nan if pd.isna(kr_p) else float(kr_p),
    )
    metrics_rows.append(row)

# Strongest features first: by mutual information, then by Cramer's V.
metrics = pd.DataFrame(metrics_rows)
metrics = metrics.sort_values(['mutual_info', 'cramers_v'], ascending=False)
metrics


In [None]:
# Bar chart of the first 12 rows of `metrics` in its current order.
plt.figure(figsize=(10, 6))
sns.barplot(x='mutual_info', y='feature', data=metrics.iloc[:12], color='#4C78A8')
plt.title('Top categorical features by mutual_info')
plt.tight_layout()
plt.show()

# Same chart after re-ranking the features by Cramer's V.
plt.figure(figsize=(10, 6))
sns.barplot(
    x='cramers_v',
    y='feature',
    data=metrics.sort_values('cramers_v', ascending=False).iloc[:12],
    color='#F58518',
)
plt.title("Top categorical features by Cramer's V")
plt.tight_layout()
plt.show()


In [None]:
def summarize_categorical(df, feature, min_count=30):
    """Per-category target statistics for one categorical feature.

    Args:
        df: DataFrame containing ``feature`` and a ``target`` column.
        feature: Name of the categorical column to summarise.
        min_count: Categories with fewer rows than this are dropped.

    Returns:
        DataFrame indexed by category with columns ``count``,
        ``target_rate`` (mean of target) and ``target_positives`` (sum of
        target), sorted by ``target_rate`` in descending order.
    """
    frame = pd.DataFrame({
        'cat': prep_cat(df[feature]),
        'target': df['target'].astype('int8'),
    })
    per_category = frame.groupby('cat', dropna=False)['target'].agg(
        count='count',
        target_rate='mean',
        target_positives='sum',
    )
    frequent = per_category.loc[per_category['count'] >= min_count]
    return frequent.sort_values('target_rate', ascending=False)

# The five features at the top of the metrics table.
TOP_FEATURES = metrics.head(5)['feature'].tolist()
TOP_FEATURES


In [None]:
# For each top feature: show the 20 categories with the highest target rate
# (among categories with at least 30 rows) and plot them.
for feat in TOP_FEATURES:
    # Blank line plus a separator rule between features.
    print('\n' + '=' * 90)
    print('Feature:', feat)
    summary = summarize_categorical(df, feat, min_count=30).head(20)
    display(summary)

    # Skip the chart when no category passed the min_count filter.
    if not summary.empty:
        plt.figure(figsize=(10, 6))
        sns.barplot(x=summary['target_rate'].values, y=summary.index.astype(str), color='#54A24B')
        plt.title(f'{feat}: top categories by target_rate')
        plt.tight_layout()
        plt.show()


In [None]:
# Persist the feature-level metrics table.
metrics.to_csv(OUT_DIR.joinpath('categorical_feature_metrics.csv'), index=False)

# One CSV per feature with the full per-category summary (no count filter
# beyond min_count=1).
feature_summary_dir = OUT_DIR.joinpath('feature_summaries')
feature_summary_dir.mkdir(parents=True, exist_ok=True)

for feat in AVAILABLE_CAT_COLS:
    s = summarize_categorical(df, feat, min_count=1)
    s.to_csv(feature_summary_dir.joinpath(f'{feat}_summary.csv'))

print('saved:', OUT_DIR / 'categorical_feature_metrics.csv')
print('saved dir:', feature_summary_dir)
