# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb
import optuna
from optuna.samplers import TPESampler
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# 1. Load the raw data and normalise column types
print("1. Загрузка и подготовка данных...")
data = pd.read_csv("CupIT_Sber_data.csv", sep=";")

# Coerce the amount to numeric (unparseable values become NaN), parse the
# date column, then drop every row that is missing either field.
data = data.assign(
    service_amount_net=pd.to_numeric(data['service_amount_net'], errors='coerce'),
    service_date=pd.to_datetime(data['service_date']),
).dropna(subset=['service_date', 'service_amount_net'])

# 2. Build monthly aggregates, target lags and calendar features
print("\n2. Создание временных лагов и признаков...")

# Month-end resampling: total amount plus activity counters per month.
# NOTE(review): the counters describe the same month as the target, so they
# would not be known when that month is being forecast — confirm that using
# them as features is intended.
aggregations = {
    'service_amount_net': 'sum',
    'service_document_id': 'count',
    'patient_id': 'nunique',
    'service_code': 'nunique',
    'is_hospital': 'mean',
}
column_names = {
    'service_document_id': 'transactions_count',
    'patient_id': 'unique_patients',
    'service_code': 'unique_services',
}
monthly_data = (
    data.resample('M', on='service_date')
    .agg(aggregations)
    .rename(columns=column_names)
)

# Target shifted back by 1, 2, 3 and 12 months
target = monthly_data['service_amount_net']
lag_features = {f'lag_{k}': target.shift(k) for k in (1, 2, 3, 12)}

# Calendar features derived from the month-end index
period = monthly_data.index
monthly_data = monthly_data.assign(
    **lag_features,
    month=period.month,
    year=period.year,
    quarter=period.quarter,
)

# The earliest rows have no lag history; drop every row containing a NaN
monthly_data = monthly_data.dropna()

# 3. Chronological train/test split (first 80% of months -> train)
print("\n3. Разделение данных на train/test...")
train_size = int(len(monthly_data) * 0.8)

# Take explicit copies: `test` gets new columns assigned further down, and
# writing into an iloc slice of `monthly_data` is chained assignment.
train = monthly_data.iloc[:train_size].copy()
test = monthly_data.iloc[train_size:].copy()

# Features are every column except the target
X_train, y_train = train.drop('service_amount_net', axis=1), train['service_amount_net']
X_test, y_test = test.drop('service_amount_net', axis=1), test['service_amount_net']

# 4. Hyperparameter optimisation with Optuna
print("\n4. Оптимизация гиперпараметров LightGBM...")


def objective(trial):
    """Score one Optuna trial by mean MAE over time-ordered CV folds.

    Reads the module-level ``X_train`` / ``y_train``. For each of the three
    ``TimeSeriesSplit`` folds a LightGBM model is trained with early
    stopping on the fold's validation part and scored with MAE on it.

    The average early-stopped iteration count is stored on the trial as the
    user attribute ``best_iteration`` so the final model can be trained for
    that many rounds without needing a validation set.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        Mean validation MAE across the folds (lower is better).
    """
    params = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        # LightGBM ignores `subsample` unless the bagging frequency is
        # non-zero; without this the tuned value has no effect.
        'subsample_freq': 1,
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 30),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1),
        'verbosity': -1
    }

    # Time-series cross-validation: each fold validates on data that comes
    # after its training part.
    tscv = TimeSeriesSplit(n_splits=3)
    scores = []
    best_iterations = []

    for train_idx, val_idx in tscv.split(X_train):
        X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

        train_data = lgb.Dataset(X_train_fold, label=y_train_fold)
        val_data = lgb.Dataset(X_val_fold, label=y_val_fold)

        model = lgb.train(params,
                          train_data,
                          valid_sets=[val_data],
                          callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)])

        preds = model.predict(X_val_fold)
        scores.append(mean_absolute_error(y_val_fold, preds))
        best_iterations.append(model.best_iteration or model.current_iteration())

    trial.set_user_attr('best_iteration', max(1, int(round(np.mean(best_iterations)))))
    return np.mean(scores)


# Seed the sampler so repeated runs explore the same trials
study = optuna.create_study(direction='minimize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=30, show_progress_bar=True)

# 5. Train the final model
print("\n5. Обучение лучшей модели LightGBM...")
# `study.best_params` only holds the sampled values, so the fixed settings
# used during tuning are restored here to keep the final model consistent.
best_params = {
    'objective': 'regression',
    'metric': 'mae',
    'boosting_type': 'gbdt',
    'subsample_freq': 1,
    **study.best_params,
    'verbosity': -1,
}
# Train for the number of rounds found by cross-validation instead of
# early-stopping on the test set: stopping on test data would leak its
# labels into the model that is subsequently evaluated on that same data.
best_params['n_estimators'] = study.best_trial.user_attrs.get(
    'best_iteration', study.best_params['n_estimators']
)

train_data = lgb.Dataset(X_train, label=y_train)

model = lgb.train(best_params, train_data)

# 6. Forecast the test period
print("\n6. Прогнозирование и оценка модели...")
test['forecast'] = model.predict(X_test)

# 95% band from empirical quantiles of the training residuals.
# Predicting on randomly resampled test rows and taking per-position
# percentiles mixes predictions of unrelated months, so it says nothing
# about the uncertainty of a given row; shifting each forecast by the
# residual quantiles does.
# NOTE(review): in-sample residuals tend to understate out-of-sample error,
# so this band is likely optimistic.
train_residuals = y_train.to_numpy() - model.predict(X_train)
lower_offset, upper_offset = np.percentile(train_residuals, [2.5, 97.5])
test['lower_ci'] = test['forecast'] + lower_offset
test['upper_ci'] = test['forecast'] + upper_offset

# 7. Plot train history, test actuals, forecast and its band
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(train.index, train['service_amount_net'],
        label='Обучающая выборка', linewidth=2)
ax.plot(test.index, test['service_amount_net'],
        label='Фактические значения', color='green', linewidth=2)
ax.plot(test.index, test['forecast'],
        label='Прогноз', color='red', linestyle='--', linewidth=2)
# Shaded area between the lower and upper bounds
ax.fill_between(test.index, test['lower_ci'], test['upper_ci'],
                color='pink', alpha=0.3)
ax.set_title('Сравнение прогноза с реальными значениями (LightGBM)', fontsize=14)
ax.set_xlabel('Дата', fontsize=12)
ax.set_ylabel('Сумма выплат', fontsize=12)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# 8. Error metrics on the test period
actual = test['service_amount_net']
predicted = test['forecast']

mae = mean_absolute_error(actual, predicted)
rmse = np.sqrt(mean_squared_error(actual, predicted))
# Mean absolute percentage error, relative to the actual values
relative_error = (actual - predicted) / actual
mape = np.mean(np.abs(relative_error)) * 100

print("\nМетрики качества на тестовой выборке:")
print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"MAPE: {mape:.2f}%")

# 9. Recursive forecast for future periods
print("\n9. Прогноз на будущие периоды...")
future_steps = 12
last_date = monthly_data.index[-1]
# `last_date` itself is the first generated date, so it is skipped
future_dates = pd.date_range(start=last_date, periods=future_steps+1, freq='M')[1:]

history = monthly_data['service_amount_net']
feature_columns = list(X_test.columns)

# Future values of the activity counters are unknown; the test-period mean
# of each is used as a stand-in.
exog_means = {
    col: X_test[col].mean()
    for col in ('transactions_count', 'unique_patients',
                'unique_services', 'is_hospital')
}

# Forecast one month at a time so that each step's prediction is available
# as a lag feature for the following steps.
future_data = []
for i, date in enumerate(future_dates, start=1):
    row = {
        **exog_means,
        'month': date.month,
        'year': date.year,
        'quarter': (date.month - 1) // 3 + 1,
    }

    for lag in [1, 2, 3, 12]:
        # Position of the lagged month relative to the last observed month:
        # > 0 is an already forecast month, <= 0 lies in the history
        # (0 being the last observed month itself).
        offset = i - lag
        if offset > 0:
            row[f'lag_{lag}'] = future_data[offset - 1]['forecast']
        else:
            row[f'lag_{lag}'] = history.iloc[offset - 1]

    step_features = pd.DataFrame([row], index=[date])[feature_columns]
    row['forecast'] = float(model.predict(step_features)[0])
    future_data.append(row)

future_df = pd.DataFrame(future_data, index=future_dates)
# Simplified +/-10% band around the forecast, not a statistical interval
future_df['lower_ci'] = future_df['forecast'] * 0.9
future_df['upper_ci'] = future_df['forecast'] * 1.1

# Plot the history followed by the future forecast and its band
fig, ax = plt.subplots(figsize=(14, 7))
ax.plot(monthly_data.index, monthly_data['service_amount_net'],
        label='Исторические данные', linewidth=2)
ax.plot(future_df.index, future_df['forecast'],
        label='Прогноз', color='red', linewidth=2)
# Shaded area between the lower and upper bounds
ax.fill_between(future_df.index, future_df['lower_ci'], future_df['upper_ci'],
                color='pink', alpha=0.3)
ax.set_title(f'Прогноз месячных выплат на {future_steps} месяцев вперед', fontsize=14)
ax.set_xlabel('Дата', fontsize=12)
ax.set_ylabel('Сумма выплат', fontsize=12)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

# 10. Save the forecast: date first, then the forecast and its bounds
forecast_df = future_df[['forecast', 'lower_ci', 'upper_ci']].copy()
forecast_df.insert(0, 'date', future_df.index)
# The index duplicates the `date` column, so it is not written
forecast_df.to_csv('lgb_payments_forecast.csv', index=False)
print("\nПрогноз сохранен в lgb_payments_forecast.csv")

# 11. Feature importance (number of splits per feature, top 15)
# `lgb.plot_importance` opens its own figure unless an axes is supplied,
# which would leave a separately created figure empty; pass the axes
# explicitly so the requested size applies to the actual plot.
fig, ax = plt.subplots(figsize=(12, 6))
lgb.plot_importance(model, ax=ax, importance_type='split', max_num_features=15)
ax.set_title('Важность признаков (LightGBM)', fontsize=14)
fig.tight_layout()
plt.show()