Быстро самые тупые гипотезы накинул, не зашло, буду адекватно делать)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, roc_auc_score

Catbost + датчики

In [2]:
# Загрузка данных
data = pd.read_excel(r'Датчики за июль 2024.xlsx')

# Приведение временных меток к datetime
data['start'] = pd.to_datetime(data['start'])
data['end'] = pd.to_datetime(data['end'])

# Создание целевой переменной
data['target'] = (data['reason'] == 'Аварийная остановка').astype(int)

# Временные признаки
data['duration_diff'] = data['duration'].diff().fillna(0)
data['time_since_last'] = (data['start'] - data['start'].shift(1)).dt.total_seconds().fillna(0)
data['hour'] = data['start'].dt.hour
data['day_of_week'] = data['start'].dt.dayofweek

# Кодирование категориального признака
le = LabelEncoder()
data['reason_encoded'] = le.fit_transform(data['reason'])

# Подготовка признаков и целевой переменной
features = ['duration', 'duration_diff', 'time_since_last', 'hour', 'day_of_week', 'reason_encoded']
X = data[features]
y = data['target']

# Разделение данных на обучающую и тестовую выборки
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Обучение модели CatBoost
model = CatBoostClassifier(iterations=500, depth=6, learning_rate=0.1, verbose=100)
model.fit(X_train, y_train)

# Предсказание и оценка модели
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_test, y_proba):.5f}")

0:	learn: 0.4717941	total: 104ms	remaining: 51.8s
100:	learn: 0.0000596	total: 347ms	remaining: 1.37s
200:	learn: 0.0000281	total: 604ms	remaining: 898ms
300:	learn: 0.0000186	total: 850ms	remaining: 562ms
400:	learn: 0.0000144	total: 1.08s	remaining: 267ms
499:	learn: 0.0000133	total: 1.3s	remaining: 0us
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1327
           1       1.00      1.00      1.00         2

    accuracy                           1.00      1329
   macro avg       1.00      1.00      1.00      1329
weighted avg       1.00      1.00      1.00      1329

ROC-AUC: 1.00000


Catbbost + датчики + поломки со сдвигом

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Загрузка данных
downtime_july_df = pd.read_excel(r'Простои за июль 2024.xlsx')
sensors_july_df = pd.read_excel(r'Датчики за июль 2024.xlsx')

# Приведение времени к минутам
downtime_july_df['start'] = pd.to_datetime(downtime_july_df['start']).dt.floor('min')
downtime_july_df['end'] = pd.to_datetime(downtime_july_df['end']).dt.floor('min')
sensors_july_df['start'] = pd.to_datetime(sensors_july_df['start']).dt.floor('min')
sensors_july_df['end'] = pd.to_datetime(sensors_july_df['end']).dt.floor('min')

# Создание временного ряда по минутам
full_time_range = pd.date_range(start=downtime_july_df['start'].min(), end=downtime_july_df['end'].max(), freq='T')

# Развертывание данных по времени
downtime_expanded = pd.DataFrame({'timestamp': full_time_range})
sensors_expanded = pd.DataFrame({'timestamp': full_time_range})

downtime_july_df['duration'] = downtime_july_df['duration'].fillna(0)
sensors_july_df['duration'] = sensors_july_df['duration'].fillna(0)

downtime_expanded = downtime_expanded.merge(downtime_july_df, left_on='timestamp', right_on='start', how='left')
sensors_expanded = sensors_expanded.merge(sensors_july_df, left_on='timestamp', right_on='start', how='left')

# Объединение данных
merged_df = pd.merge(downtime_expanded, sensors_expanded, on='timestamp', suffixes=('_x', '_y'))

# Преобразование всех столбцов с типом datetime в числовой формат
for col in merged_df.select_dtypes(include=['datetime64[ns]']).columns:
    merged_df[col] = (merged_df[col] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Автоматическое определение категориальных признаков
cat_features = merged_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Заполнение пропусков в категориальных признаках
for col in cat_features:
    merged_df[col].fillna('Нет данных', inplace=True)

# Сдвиг целевой переменной на 3 дня вперед (4320 минут)
merged_df['target'] = merged_df['reason_y'].shift(-4320)  # Сдвиг на 3 дня вперед
merged_df['target'] = (merged_df['target'] == 'Аварийная остановка').astype(int)

# Удаление строк с пропусками (последние 4320 строк будут иметь NaN в целевой переменной)
merged_df.dropna(subset=['target'], inplace=True)

# Подготовка данных для обучения
X = merged_df.drop(columns=['reason_y', 'target'])
y = merged_df['target']

# Разделение данных на обучающую и тестовую выборки
# Используем временное разделение: первые 80% данных - train, последние 20% - test
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Обучение модели CatBoost
cat_features_indices = [X.columns.get_loc(col) for col in cat_features if col in X.columns]
model = CatBoostClassifier(cat_features=cat_features_indices, verbose=0)
model.fit(X_train, y_train)

# Предсказание и оценка
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC: {roc_auc:.4f}')

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from statsmodels.tsa.arima.model import ARIMA

# Загрузка данных
downtime_july_df = pd.read_excel(r'Простои за июль 2024.xlsx')
sensors_july_df = pd.read_excel(r'Датчики за июль 2024.xlsx')

# Приведение времени к минутам
downtime_july_df['start'] = pd.to_datetime(downtime_july_df['start']).dt.floor('T')
downtime_july_df['end'] = pd.to_datetime(downtime_july_df['end']).dt.floor('T')
sensors_july_df['start'] = pd.to_datetime(sensors_july_df['start']).dt.floor('T')
sensors_july_df['end'] = pd.to_datetime(sensors_july_df['end']).dt.floor('T')

# Создание временного ряда по минутам
full_time_range = pd.date_range(start=downtime_july_df['start'].min(), end=downtime_july_df['end'].max(), freq='T')

# Развертывание данных о простоях по временному ряду
downtime_expanded = pd.DataFrame({'timestamp': full_time_range})
for index, row in downtime_july_df.iterrows():
    downtime_expanded.loc[
        (downtime_expanded['timestamp'] >= row['start']) & 
        (downtime_expanded['timestamp'] <= row['end']), 
        ['downtime', 'reason', 'description', 'component']
    ] = row[['downtime', 'reason', 'description', 'component']]

# Развертывание данных о датчиках по временному ряду
sensors_expanded = pd.DataFrame({'timestamp': full_time_range})
for index, row in sensors_july_df.iterrows():
    sensors_expanded.loc[
        (sensors_expanded['timestamp'] >= row['start']) & 
        (sensors_expanded['timestamp'] <= row['end']), 
        'reason'
    ] = row['reason']

# Объединение данных
merged_df = pd.merge(downtime_expanded, sensors_expanded, on='timestamp', how='left', suffixes=('_downtime', '_sensor'))

# Заполнение пропусков
merged_df['reason_sensor'].fillna('Нет данных', inplace=True)
merged_df.fillna({'downtime': 'Нет данных', 'reason_downtime': 'Нет данных', 'description': 'Нет данных', 'component': 'Нет данных'}, inplace=True)

# Обработка категориальных признаков
cat_features = ['downtime', 'reason_downtime', 'description', 'component', 'reason_sensor']
for col in cat_features:
    merged_df[col] = merged_df[col].astype('category')

# Создание целевой переменной (аварийная остановка)
merged_df['target'] = (merged_df['reason_sensor'] == 'Аварийная остановка').astype(int)

# Сдвиг целевой переменной на 3 дня вперед (4320 минут)
merged_df['target'] = merged_df['target'].shift(-4320)

# Удаление строк с пропусками в целевой переменной
merged_df.dropna(subset=['target'], inplace=True)

# Подготовка данных для обучения
X = merged_df.drop(columns=['target'])
y = merged_df['target']

# Разделение данных на обучающую и тестовую выборки
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Обучение модели CatBoost
cat_features_indices = [X.columns.get_loc(col) for col in cat_features]
model = CatBoostClassifier(cat_features=cat_features_indices, verbose=0)
model.fit(X_train, y_train)

# Предсказание и оценка
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC: {roc_auc:.4f}')

  downtime_july_df['start'] = pd.to_datetime(downtime_july_df['start']).dt.floor('T')
  downtime_july_df['end'] = pd.to_datetime(downtime_july_df['end']).dt.floor('T')
  sensors_july_df['start'] = pd.to_datetime(sensors_july_df['start']).dt.floor('T')
  sensors_july_df['end'] = pd.to_datetime(sensors_july_df['end']).dt.floor('T')
  full_time_range = pd.date_range(start=downtime_july_df['start'].min(), end=downtime_july_df['end'].max(), freq='T')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['reason_sensor'].fillna('Нет данных', inplace=True)


ROC-AUC: 0.4527


In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score

# Загрузка данных
downtime_july_df = pd.read_excel(r'Простои за июль 2024.xlsx')
sensors_july_df = pd.read_excel(r'Датчики за июль 2024.xlsx')

# Приведение времени к минутам
downtime_july_df['start'] = pd.to_datetime(downtime_july_df['start']).dt.floor('T')
downtime_july_df['end'] = pd.to_datetime(downtime_july_df['end']).dt.floor('T')
sensors_july_df['start'] = pd.to_datetime(sensors_july_df['start']).dt.floor('T')
sensors_july_df['end'] = pd.to_datetime(sensors_july_df['end']).dt.floor('T')

# Создание временного ряда с интервалом 10 минут
full_time_range = pd.date_range(start=downtime_july_df['start'].min(), end=downtime_july_df['end'].max(), freq='10T')

# Агрегация данных о простоях
downtime_aggregated = pd.DataFrame({'timestamp': full_time_range})
for index, row in downtime_july_df.iterrows():
    mask = (downtime_aggregated['timestamp'] >= row['start']) & (downtime_aggregated['timestamp'] <= row['end'])
    downtime_aggregated.loc[mask, ['downtime', 'reason', 'description', 'component']] = row[['downtime', 'reason', 'description', 'component']]

# Агрегация данных о датчиках
sensors_aggregated = pd.DataFrame({'timestamp': full_time_range})
for index, row in sensors_july_df.iterrows():
    mask = (sensors_aggregated['timestamp'] >= row['start']) & (sensors_aggregated['timestamp'] <= row['end'])
    sensors_aggregated.loc[mask, 'reason'] = row['reason']

# Объединение данных
merged_df = pd.merge(downtime_aggregated, sensors_aggregated, on='timestamp', how='left', suffixes=('_downtime', '_sensor'))

# Заполнение пропусков
merged_df['reason_sensor'].fillna('Нет данных', inplace=True)
merged_df.fillna({'downtime': 'Нет данных', 'reason_downtime': 'Нет данных', 'description': 'Нет данных', 'component': 'Нет данных'}, inplace=True)
# Добавление лаговых признаков
for lag in [1, 2, 3, 6, 12, 24]:  # Лаги за 10, 20, 30, 60, 120 и 240 минут
    merged_df[f'reason_sensor_lag_{lag}'] = merged_df['reason_sensor'].shift(lag)

# Обновление списка категориальных признаков
cat_features = ['downtime', 'reason_downtime', 'description', 'component', 'reason_sensor']
for lag in [1, 2, 3, 6, 12, 24]:
    cat_features.append(f'reason_sensor_lag_{lag}')

# Удаление строк с пропусками после добавления лагов
merged_df.dropna(inplace=True)

# Подготовка данных для обучения
X = merged_df.drop(columns=['target'])
y = merged_df['target']

# Разделение данных на обучающую и тестовую выборки
train_size = int(0.8 * len(X))
X_train, X_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

# Обучение модели CatBoost
cat_features_indices = [X.columns.get_loc(col) for col in cat_features]
model = CatBoostClassifier(cat_features=cat_features_indices, verbose=0)
model.fit(X_train, y_train)

# Предсказание и оценка
y_pred = model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC: {roc_auc:.4f}')

  downtime_july_df['start'] = pd.to_datetime(downtime_july_df['start']).dt.floor('T')
  downtime_july_df['end'] = pd.to_datetime(downtime_july_df['end']).dt.floor('T')
  sensors_july_df['start'] = pd.to_datetime(sensors_july_df['start']).dt.floor('T')
  sensors_july_df['end'] = pd.to_datetime(sensors_july_df['end']).dt.floor('T')
  full_time_range = pd.date_range(start=downtime_july_df['start'].min(), end=downtime_july_df['end'].max(), freq='10T')
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  merged_df['reason_sensor'].fillna('Нет данных', inplace=True)


KeyError: "['target'] not found in axis"