# 02_Evaluate.ipynb

#  Import

In [1]:
import yaml
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import yaml
from pathlib import Path
import os


# Locate the project root, i.e. the directory that contains config/params2.yml.
# The first candidate is the location this notebook has always assumed (the
# parent of the script directory / of the kernel's working directory). If the
# config is not there, the working directory and its ancestors are searched, so
# the notebook also runs when the kernel is started from the project root or
# from a nested folder.
CONFIG_RELATIVE_PATH = Path('config') / 'params2.yml'

_default_root = Path(__file__).parent.parent if "__file__" in locals() else Path.cwd().parent
_root_candidates = [_default_root, Path.cwd(), *Path.cwd().parents]
PROJECT_ROOT = next(
    (root for root in _root_candidates if (root / CONFIG_RELATIVE_PATH).exists()),
    _default_root,  # nothing matched: keep the default so the error below names it
)

# Load the configuration
config_path = PROJECT_ROOT / CONFIG_RELATIVE_PATH

if not config_path.exists():
    raise FileNotFoundError(f"Файл конфигурации не найден: {config_path}")

with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

# yaml.safe_load returns None for an empty document; fail here with a clear
# message instead of a TypeError on the first config[...] lookup in a later cell.
if not isinstance(config, dict):
    raise ValueError(f"Файл конфигурации пуст или имеет неверный формат: {config_path}")

print("Конфигурация успешно загружена!")
print(f"Корень проекта: {PROJECT_ROOT}")
print(f"Путь к конфигу: {config_path}")

Конфигурация успешно загружена!
Корень проекта: /Users/bariatmamaeva/Desktop/uplift-marketing-mlops
Путь к конфигу: /Users/bariatmamaeva/Desktop/uplift-marketing-mlops/config/params2.yml


# Загрузка артефактов: модель, препроцессор, метрики

In [2]:

# Пути из конфигурации 
def get_paths(config):
    """Build the paths of the saved artifacts from the configuration.

    Reads the ``models`` and ``report`` entries of ``config['folders']`` and
    resolves them against ``PROJECT_ROOT``.

    Returns
    -------
    tuple of Path
        ``(model_path, preprocessor_path, metrics_path, best_params_path)``.
    """
    folders = config['folders']
    models_dir = PROJECT_ROOT / folders['models']
    report_dir = PROJECT_ROOT / folders['report']
    return (
        models_dir / 'uplift_model.joblib',
        models_dir / 'preprocessor.joblib',
        report_dir / 'metrics.json',
        report_dir / 'best_params.json',
    )

# Загрузка модели
def load_model(model_path: Path):
    """Load the saved uplift model from ``model_path``.

    Raises
    ------
    FileNotFoundError
        If ``model_path`` does not exist.
    """
    if not model_path.exists():
        raise FileNotFoundError(f"Модель не найдена: {model_path}")

    # joblib.load unpickles the file, so only trusted artifacts should be loaded.
    loaded_model = joblib.load(model_path)
    print(f"Uplift-модель загружена из {model_path}")
    return loaded_model

# Загрузка препроцессора
def load_preprocessor(preprocessor_path: Path):
    """Load the saved preprocessor from ``preprocessor_path``.

    Raises
    ------
    FileNotFoundError
        If ``preprocessor_path`` does not exist.
    """
    if not preprocessor_path.exists():
        raise FileNotFoundError(f"Препроцессор не найден: {preprocessor_path}")

    # joblib.load unpickles the file, so only trusted artifacts should be loaded.
    loaded_preprocessor = joblib.load(preprocessor_path)
    print(f"Препроцессор загружен из {preprocessor_path}")
    return loaded_preprocessor

# Загрузка метрик
def load_metrics(metrics_path: Path):
    """Load previously saved metrics from a JSON file.

    Parameters
    ----------
    metrics_path : Path
        Location of the metrics JSON file.

    Returns
    -------
    The parsed JSON content, or an empty dict when the file does not exist
    (a message is printed in both cases).
    """
    if metrics_path.exists():
        with metrics_path.open('r') as f:
            metrics = json.load(f)
        # The f-prefix was missing here, so the literal "{metrics_path}" was printed.
        print(f"Метрики загружены из {metrics_path}")
        return metrics
    else:
        print(f"Метрики не найдены: {metrics_path}. Используем default.")
        return {}

# Загрузка лучших параметров
def load_best_params(best_params_path: Path):
    """Load the saved best hyper-parameters from a JSON file.

    Returns the parsed JSON content, or an empty dict when the file does not
    exist; a message is printed in both cases.
    """
    if not best_params_path.exists():
        print(f"Параметры не найдены: {best_params_path}. Используем default.")
        return {}

    with best_params_path.open('r') as params_file:
        loaded_params = json.load(params_file)
    print(f"Лучшие параметры загружены из {best_params_path}")
    return loaded_params

# Load all artifacts.
# Paths are resolved from the config first. load_model / load_preprocessor raise
# FileNotFoundError when their file is missing, whereas load_metrics /
# load_best_params fall back to an empty dict, so `metrics` and `best_params`
# may be empty in the cells below.
model_path, preprocessor_path, metrics_path, best_params_path = get_paths(config)
model = load_model(model_path)
preprocessor = load_preprocessor(preprocessor_path)
metrics = load_metrics(metrics_path)
best_params = load_best_params(best_params_path)

Uplift-модель загружена из /Users/bariatmamaeva/Desktop/uplift-marketing-mlops/models/uplift_model.joblib
Препроцессор загружен из /Users/bariatmamaeva/Desktop/uplift-marketing-mlops/models/preprocessor.joblib
Метрики не найдены: /Users/bariatmamaeva/Desktop/uplift-marketing-mlops/report/metrics.json. Используем default.
Лучшие параметры загружены из /Users/bariatmamaeva/Desktop/uplift-marketing-mlops/report/best_params.json


## 4. Загрузка тестовых данных

Загружаем из `data/processed/test.csv` (создано в 01_EDA_and_Train).

In [3]:
# Resolve the test-set location from the config and fail fast when it is absent.
test_file_path = PROJECT_ROOT / config['data']['processed_path'] / config['data']['test_file']

if not test_file_path.exists():
    raise FileNotFoundError(f"Тестовые данные не найдены: {test_file_path}")

# NOTE(review): the CSV is read with default options, so an index column saved
# with the file comes back as a regular 'Unnamed: 0' column.
df_test = pd.read_csv(test_file_path)
print(f"Тестовые данные загружены: {len(df_test)} строк")
display(df_test.head())

Тестовые данные загружены: 12800 строк


Unnamed: 0,recency,history,used_discount,used_bogo,zip_code,is_referral,channel,offer,conversion,history_discount,history_bogo,recency_history,recency_bin_0,recency_bin_1,recency_bin_2,treatment
0,2,29.99,1,0,Urban,0,Phone,No Offer,0,29.99,0.0,59.98,1.0,0.0,0.0,0
1,9,310.79,0,1,Urban,0,Phone,Discount,0,0.0,310.79,2797.11,0.0,0.0,1.0,1
2,2,89.31,1,0,Surburban,1,Web,Discount,0,89.31,0.0,178.62,1.0,0.0,0.0,1
3,2,457.22,0,1,Urban,0,Multichannel,Buy One Get One,0,0.0,457.22,914.44,1.0,0.0,0.0,1
4,2,29.99,0,1,Surburban,0,Phone,No Offer,0,0.0,29.99,59.98,1.0,0.0,0.0,0


## 5. Предсказание uplift_score

Используем загруженную модель и препроцессор для расчёта uplift на тестовых данных.

In [None]:
# Feature columns: every column of df_test except the outcome ('conversion'),
# the treatment flag and the raw 'offer' label.
# NOTE(review): any other column present in the CSV (e.g. a saved index column
# such as 'Unnamed: 0') is passed to the preprocessor as well — confirm the
# preprocessor was fitted with it or ignores it.
feature_cols = [col for col in df_test.columns if col not in ['conversion', 'treatment', 'offer']]
X_test = preprocessor.transform(df_test[feature_cols])

# The loaded model is indexed by key and holds three fitted estimators:
# a classifier ('prop_model') and two regressors ('effect_t', 'effect_c').
prop_score = model['prop_model'].predict_proba(X_test)[:, 1]
effect_t = model['effect_t'].predict(X_test)
effect_c = model['effect_c'].predict(X_test)
# Blend the two effect estimates: effect_t is weighted by prop_score and
# effect_c by (1 - prop_score).
# NOTE(review): presumably an X-learner-style combination — verify the weighting
# direction against the training notebook.
uplift_pred = prop_score * effect_t + (1 - prop_score) * effect_c
# Clamp predictions to [-0.15, 0.30].
# NOTE(review): the bounds are hard-coded; clipping can create tied scores at
# the bounds, which matters for the ranking-based steps below — confirm they
# match what was used in training.
uplift_pred = np.clip(uplift_pred, -0.15, 0.30)

# Keep the original test frame untouched; scores go into a copy.
results = df_test.copy()
results['uplift_score'] = uplift_pred

print("Предсказания завершены")
display(results.head())

## 6. Анализ топ-клиентов

Выделяем топ-клиентов по uplift_score и сохраняем в CSV.

In [None]:
# Share of clients to target: taken from the loaded metrics when available,
# otherwise 5 %.
# NOTE(review): the key 'top_fraction_90' is assumed to hold a fraction in (0, 1] — confirm.
top_fraction = metrics.get('top_fraction_90', 0.05)
top_n = int(len(results) * top_fraction)
top_clients = results.nlargest(top_n, 'uplift_score')

print(f"\nАнализ топ {top_fraction:.1%} клиентов ({top_n} человек):")
print(f"Средний uplift_score: {top_clients['uplift_score'].mean():.4f}")
print(f"Конверсия в топе: {top_clients['conversion'].mean():.4f} (vs общая: {results['conversion'].mean():.4f})")

report_path = PROJECT_ROOT / config['folders']['report']
report_path.mkdir(parents=True, exist_ok=True)
# Round instead of truncating: int(0.29 * 100) is 28 because 0.29 * 100 evaluates
# to 28.999999999999996, which would put the wrong percentage in the file name.
top_pct_label = int(round(top_fraction * 100))
output_csv = report_path / f'top_clients_{top_pct_label}pct.csv'
top_clients.to_csv(output_csv, index=False)
print(f"Список топ-клиентов сохранён: {output_csv}")

## 7. Qini-кривая

In [None]:
# Rank clients from highest to lowest predicted uplift.
order = np.argsort(-uplift_pred)
y_sorted = np.asarray(results['conversion'])[order]
t_sorted = np.asarray(results['treatment'])[order]

# Running counts of treated / control clients along the ranking.
cum_treated = np.cumsum(t_sorted)
cum_control = np.cumsum(1 - t_sorted)
# Running conversion rate inside each group; np.maximum(..., 1) avoids division
# by zero before the first member of a group has been seen.
cum_response_t = np.cumsum(y_sorted * t_sorted) / np.maximum(cum_treated, 1)
cum_response_c = np.cumsum(y_sorted * (1 - t_sorted)) / np.maximum(cum_control, 1)
# NOTE(review): this is the difference of cumulative conversion *rates*; it is
# not scaled by group size, while the title below calls the plot a Qini curve —
# confirm the intended definition.
cum_uplift = cum_response_t - cum_response_c
# X axis: share of all treated clients reached so far (all zeros if nobody was treated).
treated_fraction = cum_treated / cum_treated[-1] if cum_treated[-1] > 0 else np.zeros(len(cum_uplift))

plt.figure(figsize=(11, 7))
plt.plot(treated_fraction, cum_uplift, label='Модель', linewidth=3)
# NOTE(review): the dashed line runs from (0, 0) to (1, final cumulative uplift)
# and is labelled 'Идеальная' (ideal), while the zero line is labelled
# 'Случайная' (random) — confirm these labels match the intended baselines.
plt.plot([0, 1], [0, cum_uplift[-1]], 'k--', label='Идеальная')
plt.plot([0, 1], [0, 0], 'k-', label='Случайная')
# The AUUC in the title comes from the loaded metrics; "N/A" is shown when the key is absent.
plt.title(f'Qini-кривая | AUUC = {metrics.get("AUUC", "N/A")}')
plt.xlabel('Доля клиентов с коммуникацией')
plt.ylabel('Кумулятивный прирост')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

## 8. Симуляция ROI

In [None]:
# Economic assumptions of the simulation.
cost_per_contact = 5          # RUB per contact
profit_per_conversion = 2000  # RUB of profit per conversion

# Baseline: contact everyone.
# NOTE(review): the baseline counts every observed conversion in the test set,
# regardless of the row's treatment flag — confirm this is the intended
# "contact everyone" scenario.
baseline_contacts = len(results)
baseline_conversions = results['conversion'].sum()
baseline_cost = baseline_contacts * cost_per_contact
baseline_profit = baseline_conversions * profit_per_conversion
baseline_net_profit = baseline_profit - baseline_cost

# Uplift strategy: contact only the top N% of clients by uplift_score.
uplift_contacts = top_n
uplift_conversions = top_clients['conversion'].sum()
uplift_cost = uplift_contacts * cost_per_contact
uplift_profit = uplift_conversions * profit_per_conversion
uplift_net_profit = uplift_profit - uplift_cost

# Summary report (values may be numpy scalars).
roi_summary = {
    'baseline_conversions': baseline_conversions,
    'baseline_net_profit': baseline_net_profit,
    'uplift_conversions': uplift_conversions,
    'uplift_net_profit': uplift_net_profit,
    'budget_savings_fraction': 1 - top_fraction,
    # None instead of a division by zero when the baseline breaks even exactly.
    'profit_increase_ratio': uplift_net_profit / baseline_net_profit if baseline_net_profit != 0 else None
}

print("\nСимуляция ROI")
for k, v in roi_summary.items():
    print(f"{k}: {v}")

# Save the report
report_path = PROJECT_ROOT / config['folders']['report']
report_path.mkdir(parents=True, exist_ok=True)

# Round instead of truncating: int(0.29 * 100) is 28 because 0.29 * 100 evaluates
# to 28.999999999999996, which would put the wrong percentage in the file name.
roi_pct_label = int(round(top_fraction * 100))
roi_csv_path = report_path / f'roi_top_{roi_pct_label}pct.csv'
pd.DataFrame([roi_summary]).to_csv(roi_csv_path, index=False)
print(f"\nROI-отчёт сохранён: {roi_csv_path}")

## 9. Сбор и сохранение метрик

In [None]:
import numpy as np

def auuc_score(y_true, uplift, treatment):
    """
    Computes AUUC (Area Under the Uplift Curve).

    Observations are ranked by ``uplift`` in descending order. At every rank the
    curve value is the cumulative conversion rate of the treated observations
    minus the cumulative conversion rate of the control observations seen so
    far. The curve is integrated with the trapezoidal rule using unit spacing,
    so the result is not normalised by the number of observations.

    Parameters
    ----------
    y_true : array-like, {0,1}
        Observed binary outcome.
    uplift : array-like, continuous
        Predicted uplift score used for ranking.
    treatment : array-like, {0,1}
        Treatment flag (1 = treated, 0 = control).

    Returns
    -------
    float

    Raises
    ------
    ValueError
        If the three inputs do not have the same shape.
    AssertionError
        If ``treatment`` or ``y_true`` contains values other than 0 and 1.
    """
    y_true = np.asarray(y_true)
    uplift = np.asarray(uplift)
    treatment = np.asarray(treatment)

    # Without this check a longer y_true / treatment is silently truncated by
    # the fancy indexing below and the score is computed on misaligned data.
    if not (y_true.shape == uplift.shape == treatment.shape):
        raise ValueError(
            "y_true, uplift and treatment must have the same shape, got "
            f"{y_true.shape}, {uplift.shape} and {treatment.shape}"
        )

    assert set(np.unique(treatment)) <= {0, 1}, "treatment must be binary"
    assert set(np.unique(y_true)) <= {0, 1}, "y_true must be binary"

    # A stable sort keeps tied scores in input order, so the result is
    # reproducible when many scores are equal (e.g. after clipping).
    order = np.argsort(-uplift, kind='stable')
    y_true = y_true[order]
    treatment = treatment[order]

    cum_treat = np.cumsum(treatment)
    cum_control = np.cumsum(1 - treatment)
    cum_y_treat = np.cumsum(y_true * treatment)
    cum_y_control = np.cumsum(y_true * (1 - treatment))

    # np.maximum(..., 1) avoids division by zero before the first member of a
    # group has been seen.
    uplift_curve = (
        cum_y_treat / np.maximum(cum_treat, 1)
        - cum_y_control / np.maximum(cum_control, 1)
    )

    # np.trapz is deprecated in NumPy 2.x in favour of np.trapezoid; use
    # whichever the installed version provides.
    trapezoid = getattr(np, "trapezoid", None) or np.trapz
    return trapezoid(uplift_curve)

In [None]:
# Sanity checks on the scored test frame before computing the metric:
# binary treatment flag, binary outcome, and no missing uplift scores.
assert results['treatment'].isin([0,1]).all(), "treatment должен быть 0/1"
assert results['conversion'].isin([0,1]).all(), "conversion должен быть 0/1"
assert results['uplift_score'].notna().all(), "uplift_score содержит NaN"

# Recompute AUUC on the test set from the (clipped) uplift scores.
auuc_test = auuc_score(
    results['conversion'],
    results['uplift_score'],
    results['treatment']
)

print(f"Пересчитанный AUUC на тесте: {auuc_test:.5f}")

In [None]:
# Collect all evaluation metrics into one dictionary.
# 'auuc_from_training' is None when the loaded metrics have no 'AUUC' entry.

evaluation_metrics = {
    'auuc_from_training': metrics.get('AUUC', None),
    'auuc_on_test': round(float(auuc_test), 5),
    'mean_uplift_score': round(float(results['uplift_score'].mean()), 4),
    'top_fraction': float(top_fraction),
    'top_mean_uplift_score': round(float(top_clients['uplift_score'].mean()), 4),
    'top_conversion': round(float(top_clients['conversion'].mean()), 4),
    'overall_conversion': round(float(results['conversion'].mean()), 4),
    'roi_summary': roi_summary,          # may contain numpy types
    'best_params_summary': best_params   # may contain numpy types
}

# Приведение numpy-типов → Python

def to_python_types(obj):
    """Recursively convert numpy values to built-in Python types.

    Dicts, lists and tuples are traversed (keys are left untouched). numpy
    arrays become nested lists, and numpy booleans, integers and floats become
    ``bool``, ``int`` and ``float``. Any other object is returned unchanged.

    Parameters
    ----------
    obj : object
        Value or nested container to convert.

    Returns
    -------
    object
        A structure of the same layout containing only built-in scalars where
        numpy values were found.
    """
    if isinstance(obj, dict):
        return {k: to_python_types(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [to_python_types(v) for v in obj]
    if isinstance(obj, tuple):
        return tuple(to_python_types(v) for v in obj)
    if isinstance(obj, np.ndarray):
        # tolist() yields built-in scalars for numeric arrays; recurse to also
        # cover object arrays that hold numpy scalars.
        return to_python_types(obj.tolist())
    if isinstance(obj, np.bool_):
        # np.bool_ is not a subclass of bool and is rejected by json.dump.
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    return obj


# Convert the collected metrics to built-in Python types before they are written out as JSON.
evaluation_metrics_clean = to_python_types(evaluation_metrics)

In [None]:
# Print the collected metrics

print("\nСбор всех метрик:")
for k, v in evaluation_metrics_clean.items():
    print(f"{k}: {v}")

# Save to JSON.
# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding is
# fixed to UTF-8 instead of relying on the platform default (which can fail to
# encode them or produce a file that is not valid UTF-8 JSON).
evaluation_report_path = report_path / 'evaluation_metrics.json'
with open(evaluation_report_path, 'w', encoding='utf-8') as f:
    json.dump(evaluation_metrics_clean, f, indent=2, ensure_ascii=False)

print(f"\nПолный отчёт метрик сохранён: {evaluation_report_path}")
