In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import optuna
from optuna.samplers import TPESampler
from gensim.models import Word2Vec
from sklearn.base import BaseEstimator, TransformerMixin

from marked_problems import marked_problems

# Load the breakdown log from the Excel export.
breakdowns_df = pd.read_excel(r'Поломки за 5 лет.xlsx')

# Target variable: every 'reason' value is translated through marked_problems.
# NOTE(review): reasons without a match presumably become NaN (true when
# marked_problems is a dict or Series) - confirm.
breakdowns_df['target'] = breakdowns_df['reason'].map(marked_problems)

# Calendar features derived from the breakdown start timestamp.
breakdowns_df['hour'] = breakdowns_df['start'].dt.hour
breakdowns_df['day_of_week'] = breakdowns_df['start'].dt.dayofweek
breakdowns_df['month'] = breakdowns_df['start'].dt.month
# dayofweek is 0 for Monday, so 5 and 6 are Saturday and Sunday.
breakdowns_df['is_weekend'] = (breakdowns_df['day_of_week'] >= 5).astype(int)

# Duration-based features: log1p-compressed duration and the index of one of
# five equal-width buckets spanning the observed duration range.
breakdowns_df['log_duration'] = np.log1p(breakdowns_df['duration'])
breakdowns_df['duration_bin'] = pd.cut(breakdowns_df['duration'], bins=5, labels=False)

# Offset-based rolling windows ('1D', '3D', '7D') need a sorted datetime index.
breakdowns_df = breakdowns_df.sort_values(by='start')
breakdowns_df = breakdowns_df.set_index('start')

# Rolling metrics over the previous 1, 3 and 7 days.
for lag in [1, 3, 7]:
    # closed='left' keeps the current row out of its own window. With the
    # default (right-closed) window the row's own label is part of the sum,
    # so the feature would leak the value the model is asked to predict.
    # Rows with no earlier event inside the window get NaN.
    breakdowns_df[f'breakdowns_last_{lag}_days'] = (
        breakdowns_df['target'].rolling(f'{lag}D', closed='left').sum()
    )
    # The duration window still includes the current row; 'duration' itself
    # is already a model feature, so nothing new is revealed by it.
    breakdowns_df[f'duration_avg_last_{lag}_days'] = breakdowns_df['duration'].rolling(f'{lag}D').mean()

# Lag features.
# NOTE: shift(lag) moves values by `lag` rows (i.e. previous events), not by
# calendar days, despite the '_days' suffix in the column names.
for lag in [1, 3, 7]:
    breakdowns_df[f'target_lag_{lag}_days'] = breakdowns_df['target'].shift(lag)
    breakdowns_df[f'duration_lag_{lag}_days'] = breakdowns_df['duration'].shift(lag)

# Reset the index so that 'start' is a regular column again.
breakdowns_df = breakdowns_df.reset_index()

# Feature columns, declared once so that the preprocessor and the design
# matrix cannot drift apart.
BASE_NUMERIC_FEATURES = ['hour', 'day_of_week', 'month', 'duration', 'log_duration', 'duration_bin']
HISTORY_FEATURES = [
    f'{prefix}_{lag}_days'
    for prefix in ('target_lag', 'duration_lag', 'breakdowns_last', 'duration_avg_last')
    for lag in (1, 3, 7)
]
CATEGORICAL_FEATURES = ['reason_group']

# Preprocessing: standardize the numeric columns, one-hot encode the category.
# handle_unknown='ignore' encodes a category that was not present during fit
# as an all-zero vector; the default ('error') makes transform raise, e.g.
# when a rare reason_group lands only in a hold-out split.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), BASE_NUMERIC_FEATURES + HISTORY_FEATURES),
        ('cat', OneHotEncoder(handle_unknown='ignore'), CATEGORICAL_FEATURES),
    ])

# Design matrix and target (column order kept as before).
X = breakdowns_df[BASE_NUMERIC_FEATURES + CATEGORICAL_FEATURES + HISTORY_FEATURES]
y = breakdowns_df['target']

# Missing values are replaced with 0 (rows are NOT dropped): this covers the
# first rows of the lag/rolling features, which have no history.
X = X.fillna(0)
# NOTE(review): a missing target is turned into class 0 here - confirm that
# this is the intended label for reasons without a mapping.
y = y.fillna(0)

# Optuna objective
def objective(trial):
    """Score one hyperparameter configuration of the random-forest pipeline.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``max_depth``
            and ``min_samples_split``.

    Returns:
        ROC-AUC of the fitted pipeline on the hold-out part of ``X`` / ``y``.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 10, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=42))
    ])

    # The rows are in chronological order (the frame is sorted by 'start'
    # before X is built) and carry lag/rolling history features. A shuffled
    # split would train on events that happen after the test events, which
    # inflates the score; shuffle=False holds out the most recent 20% instead.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    model.fit(X_train, y_train)
    # Column 1 is taken as the positive-class probability.
    # NOTE(review): this assumes a binary target - confirm.
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, y_pred_proba)

# Hyperparameter search: maximize the objective's ROC-AUC over 50 trials.
# The sampler is seeded so the search is reproducible, in line with the
# random_state=42 used for the classifier.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Best configuration found by the study.
best_params = study.best_params
print(f'Best ROC-AUC: {study.best_value}')
print(f'Best parameters: {best_params}')

# Final model: same pipeline with the best hyperparameters, refitted on all
# available rows (no hold-out).
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(**best_params, random_state=42))
])

model.fit(X, y)

# Put the frame back into chronological order with 'start' as the index, which
# the offset-based rolling windows below operate on.
# NOTE(review): this recomputes the same feature columns that were generated
# before model training - looks redundant; confirm before removing.
breakdowns_df = breakdowns_df.sort_values(by='start').set_index('start')

# Rolling metrics: sum of the target and mean duration per time window.
for lag in (1, 3, 7):
    breakdowns_df[f'breakdowns_last_{lag}_days'] = breakdowns_df['target'].rolling(window=f'{lag}D').sum()
    breakdowns_df[f'duration_avg_last_{lag}_days'] = breakdowns_df['duration'].rolling(window=f'{lag}D').mean()

# Lag features: values taken from `lag` rows earlier.
for lag in (1, 3, 7):
    breakdowns_df[f'target_lag_{lag}_days'] = breakdowns_df['target'].shift(periods=lag)
    breakdowns_df[f'duration_lag_{lag}_days'] = breakdowns_df['duration'].shift(periods=lag)

# Turn 'start' back into an ordinary column.
breakdowns_df = breakdowns_df.reset_index()

# Prediction for future dates: 1, 3 and 7 days after the last recorded start.
last_date = breakdowns_df['start'].max()
future_dates = [last_date + pd.Timedelta(days=i) for i in [1, 3, 7]]
future_data = pd.DataFrame({'start': future_dates})

# Calendar features for the future dates.
future_data['hour'] = future_data['start'].dt.hour
future_data['day_of_week'] = future_data['start'].dt.dayofweek
future_data['month'] = future_data['start'].dt.month
future_data['is_weekend'] = (future_data['day_of_week'] >= 5).astype(int)

# The duration of a future breakdown is unknown; 0 is used as a placeholder.
future_data['duration'] = 0
future_data['log_duration'] = np.log1p(future_data['duration'])

# Bucket the placeholder with the edges of the historical durations. Calling
# pd.cut(..., bins=5) on the future column alone would derive fresh edges from
# the all-zero values, unrelated to the buckets the model was trained on.
duration_edges = pd.cut(breakdowns_df['duration'], bins=5, labels=False, retbins=True)[1]
future_data['duration_bin'] = (
    pd.cut(future_data['duration'], bins=duration_edges, labels=False, include_lowest=True)
    .fillna(0)  # a value below the lowest historical edge goes to the lowest bucket
    .astype(int)
)

# Rolling metrics: reuse the most recent values available in the history.
for lag in [1, 3, 7]:
    future_data[f'breakdowns_last_{lag}_days'] = breakdowns_df[f'breakdowns_last_{lag}_days'].iloc[-1]
    future_data[f'duration_avg_last_{lag}_days'] = breakdowns_df[f'duration_avg_last_{lag}_days'].iloc[-1]

# Lag features. A future row comes right after the last historical row, so its
# lag-k value is the k-th row from the end. Reading the last row's own lag-k
# column would reach one row too far back. shift(lag - 1).iloc[-1] yields NaN
# instead of raising when the history is shorter than `lag` rows.
for lag in [1, 3, 7]:
    future_data[f'target_lag_{lag}_days'] = breakdowns_df['target'].shift(lag - 1).iloc[-1]
    future_data[f'duration_lag_{lag}_days'] = breakdowns_df['duration'].shift(lag - 1).iloc[-1]

# Missing history values are replaced with 0, as is done for the training data.
history_columns = [name for name in future_data.columns if name.endswith('_days')]
future_data[history_columns] = future_data[history_columns].fillna(0)

# The reason group is unknown for future events; the most frequent one is used.
future_data['reason_group'] = breakdowns_df['reason_group'].mode()[0]

# Predicted probability (column 1 of predict_proba) for every future date.
future_predictions = model.predict_proba(future_data)[:, 1]

# Store the predictions next to the dates.
future_data['predicted_probability'] = future_predictions

# Show the result.
print(future_data[['start', 'predicted_probability']])