In [1]:
# Load IPython's autoreload extension so imported modules can be re-imported without restarting the kernel.
%load_ext autoreload

In [2]:
# autoreload mode 2: reload all modules before each cell is executed, so edits to helper code take effect immediately.
%autoreload 2

In [3]:
import os
import sys

colab = True
if os.getenv("COLAB_RELEASE_TAG"):
    colab = True
else:
    colab = False

if colab:
    module_path = os.path.abspath(os.path.join('./real-hdd-failure/code/'))
    if module_path not in sys.path:
        sys.path.append(module_path)

    helper_path = os.path.abspath(os.path.join('./real-hdd-failure/code/helper/'))
    if helper_path not in sys.path:
        sys.path.append(helper_path)
    !{sys.executable} -m pip install -r ./real-hdd-failure/requirements.txt
else:
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

    helper_path = os.path.abspath(os.path.join('../helper'))
    if helper_path not in sys.path:
        sys.path.append(helper_path)
# !{sys.executable} -m pip install -r ../../requirements.txt

In [4]:
from helper.dataclass import *
from helper.preprocessing import *
from helper.eda import *
from helper.algorithms.DoubleLayer import DoubleLayer
from helper.metrics import *
from helper.saver import *
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

import warnings


warnings.simplefilter(action='ignore', category=FutureWarning)

## Data Preprocessing

In [5]:
# Load ST14000NM001G.csv through HDDDataset.read_csv; the relative path is resolved against the current working directory.
# NOTE(review): the Colab branch of the setup cell uses ./real-hdd-failure/... paths — confirm the CSV is present in the working directory there.
hdd_dataset = HDDDataset.read_csv('ST14000NM001G.csv')

In [6]:
# Preprocessing pipeline. The calls mutate `preprocessor` and must run in this order;
# each step logs one progress line (see the cell output).
preprocessor = Preprocessing(hdd_dataset)
# Logs "Clearing unused columns...".
preprocessor.clear_unused_data()
# Logs "Adding target column...".
preprocessor.add_target_column()
# Logs "Prepairing train dataset...".
preprocessor.prepare_train_df()
# Time features are added before the split (log: "Adding time features to unsplitted dataset...").
preprocessor.add_time_features()
# 90% train / 10% test; val_size=0 requests no validation portion.
preprocessor.train_test_val_split(train_size=0.9, test_size=0.1, val_size=0)
# Normalisation runs after the split. 'YJ' presumably selects a Yeo-Johnson transform — TODO confirm in helper.preprocessing.
preprocessor.normalize_data(method='YJ')

Clearing unused columns...
Adding target column...
Prepairing train dataset...
Adding time features to unsplitted dataset...
Splitting train dataset...
Normalizing splitted dataset with YJ...


## Training

In [7]:
# Fit the two-layer model on the training split. Per the fit log, a first-layer model is trained on the
# selected features, its predictions are added as a 'layer1_preds' column, and a second model is trained.
# n_splits / n_opts are presumably the number of CV folds and optimisation trials — TODO confirm in DoubleLayer.
# NOTE(review): no random seed is passed here; confirm DoubleLayer seeds itself if reproducible results are required.
# NOTE(review): the fit log lists 'failure' among the selected features — confirm this column cannot leak the target.
# NOTE(review): fit emits a pandas SettingWithCopyWarning from `X_selected['layer1_preds'] = layer1_preds`
# inside the helper; it should be fixed there (explicit copy or .loc assignment).
dl = DoubleLayer(n_splits=5, n_opts=30)
dl.fit(preprocessor.X_train, preprocessor.y_train)

=== Обучаем модель первого слоя ===
=== Выбранные признаки: ['failure', 'smart_5_raw', 'smart_9_raw', 'smart_187_raw', 'smart_188_raw', 'smart_192_raw', 'smart_197_raw', 'smart_199_raw', 'smart_240_raw', 'smart_241_raw', 'smart_242_raw', 'shift_smart_5_raw_1', 'shift_smart_9_raw_1', 'shift_smart_188_raw_1', 'shift_smart_192_raw_1', 'shift_smart_197_raw_1', 'shift_smart_199_raw_1', 'shift_smart_240_raw_1', 'shift_smart_241_raw_1', 'shift_smart_242_raw_1', 'diff_smart_9_raw_1', 'diff_smart_187_raw_1', 'diff_smart_188_raw_1', 'diff_smart_192_raw_1', 'diff_smart_197_raw_1', 'diff_smart_240_raw_1', 'diff_smart_241_raw_1', 'diff_smart_242_raw_1', 'smart_187_raw_normalized', 'smart_188_raw_normalized', 'smart_192_raw_normalized', 'smart_240_raw_normalized', 'smart_241_raw_normalized', 'smart_242_raw_normalized'] ===
=== Получаем предсказания первой модели ===


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected['layer1_preds'] = layer1_preds


=== Обучаем вторую модель ===


In [8]:
# Display the (feature name, importance) pairs of the fitted model; the output below is in decreasing order of importance.
dl.feature2importance

[('smart_187_raw_normalized', 0.40630463),
 ('smart_187_raw', 0.23349284),
 ('smart_5_raw', 0.04271862),
 ('smart_197_raw', 0.038081054),
 ('smart_199_raw', 0.032846365),
 ('shift_smart_199_raw_1', 0.028598666),
 ('smart_240_raw_normalized', 0.022593712),
 ('smart_188_raw_normalized', 0.019030312),
 ('smart_188_raw', 0.01901043),
 ('smart_192_raw_normalized', 0.018221006),
 ('shift_smart_192_raw_1', 0.013556926),
 ('smart_192_raw', 0.012478201),
 ('smart_241_raw_normalized', 0.01160151),
 ('shift_smart_240_raw_1', 0.00933075),
 ('diff_smart_192_raw_1', 0.009108311),
 ('smart_242_raw_normalized', 0.008647421),
 ('smart_240_raw', 0.008137648),
 ('smart_242_raw', 0.007861959),
 ('shift_smart_242_raw_1', 0.007339875),
 ('shift_smart_241_raw_1', 0.006337287),
 ('smart_241_raw', 0.006095802),
 ('shift_smart_188_raw_1', 0.006010483),
 ('shift_smart_9_raw_1', 0.0052551227),
 ('smart_9_raw', 0.005240639),
 ('failure', 0.0033730478),
 ('shift_smart_5_raw_1', 0.0024762333),
 ('diff_smart_241_raw_

In [9]:
# Score the held-out test split with the fitted model.
preds = dl.predict(preprocessor.X_test)
# Metrics.get_metrics is called with predictions first and ground truth second
# (the reverse of scikit-learn's (y_true, y_pred) argument order).
metrics = Metrics.get_metrics(preds, preprocessor.y_test)
# NOTE(review): the test split is heavily imbalanced (514 positives out of 732015 rows per the
# classification report), so accuracy is dominated by the majority class; read precision/recall/F1 instead.
print(metrics)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_selected['layer1_preds'] = layer1_preds


{'accuracy': 0.9996707717738025, 'precision': 0.8081264108352144, 'recall': 0.6964980544747081, 'f1': 0.748171368861024, 'auc_roc': 0.8481909275218376}


In [11]:
# Per-class precision / recall / F1 on the test split.
# Arguments follow scikit-learn's (y_true, y_pred) order; the function returns text, hence print().
# `classification_report` is imported in the notebook's top import cell rather than in this late cell,
# so all dependencies are visible in one place.
print(classification_report(preprocessor.y_test, preds))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00    731501
           1       0.81      0.70      0.75       514

    accuracy                           1.00    732015
   macro avg       0.90      0.85      0.87    732015
weighted avg       1.00      1.00      1.00    732015



In [10]:
# Hand the fitted model, the preprocessor and the metrics to Saver.save
# (presumably persists the run artifacts — confirm in helper.saver).
# save_csv=False is passed explicitly; presumably this skips writing the dataset as CSV — TODO confirm.
Saver.save(dl, preprocessor, metrics, save_csv=False)

