In [1]:
# Load IPython's autoreload extension so the %autoreload magic becomes available.
%load_ext autoreload

In [2]:
autoreload 2

In [3]:
import os
import sys

colab = True
if os.getenv("COLAB_RELEASE_TAG"):
    colab = True
else:
    colab = False

if colab:
    module_path = os.path.abspath(os.path.join('./real-hdd-failure/code/'))
    if module_path not in sys.path:
        sys.path.append(module_path)

    helper_path = os.path.abspath(os.path.join('./real-hdd-failure/code/helper/'))
    if helper_path not in sys.path:
        sys.path.append(helper_path)
    !{sys.executable} -m pip install -r ./real-hdd-failure/requirements.txt
else:
    module_path = os.path.abspath(os.path.join('..'))
    if module_path not in sys.path:
        sys.path.append(module_path)

    helper_path = os.path.abspath(os.path.join('../helper'))
    if helper_path not in sys.path:
        sys.path.append(helper_path)
# !{sys.executable} -m pip install -r ../../requirements.txt

In [4]:
from helper.dataclass import HDDDataset
from helper.algorithms.Blending import Blending
from helper.algorithms.Stacking import Stacking
from helper.preprocessing import *
from helper.metrics import *
from helper.eda import *
from sklearn.model_selection import train_test_split
from helper.metrics import *
from helper.saver import Saver

## Data preprocessing

In [5]:
# Input CSV, given as a relative path (presumably resolved against the
# notebook's working directory — confirm in HDDDataset.read_csv).
dataset_csv_path = 'ST14000NM001G.csv'
hdd_dataset = HDDDataset.read_csv(dataset_csv_path)

In [None]:
# Preprocessing pipeline. Every step is a method call on the same `preprocessor`
# object, so the calls are order-dependent and must run top to bottom.
#
# NOTE(review): the output recorded for this cell stops after the
# "Adding time features..." log line and ends with an AssertionError whose
# (Russian) message reads "Invalid oversampling value". The value
# oversampling='Default' therefore appears to be rejected by
# train_test_val_split — confirm the accepted values in helper.preprocessing.
# In that recorded run the steps after the split did not execute.
preprocessor = Preprocessing(hdd_dataset)
preprocessor.clear_unused_data()
preprocessor.add_target_column()
preprocessor.prepare_train_df()
preprocessor.add_time_features()
# NOTE(review): sampling_strat=0.055 is an unexplained constant — presumably a
# resampling ratio; confirm its meaning against train_test_val_split.
preprocessor.train_test_val_split(sampling_strat=0.055, oversampling='Default')
preprocessor.normalize_data(method='formulae', inplace=True)
# Alternative normalization kept for reference ('YJ' is presumably Yeo-Johnson — confirm):
# preprocessor.normalize_data(method='YJ', inplace=True)
preprocessor.drop_unimportant_features()

Clearing unused columns...
Adding target column...
Prepairing train dataset...
Adding time features to unsplitted dataset...


AssertionError: Некорректное значение oversampling

In [7]:
# Run the preprocessor's type-rescaling step (the recorded output logs
# "Rescaling types...").
# NOTE(review): presumably casts columns to more compact dtypes — confirm in
# helper.preprocessing. Requires the `preprocessor` built by the pipeline cell.
preprocessor.rescale_types()

Rescaling types...


## Training

In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier

In [9]:
# Base estimators handed to the Blending ensemble, in the order they are listed.
# NOTE(review): no estimator is given a random seed, so results may differ
# between runs.
mlp_classifier = MLPClassifier(
    activation='relu',
    alpha=0.43,
    batch_size=350,
    beta_1=0.85,
    beta_2=0.9,
    hidden_layer_sizes=(128, 64),
    learning_rate='adaptive',
    learning_rate_init=0.1,
    solver='adam',
)
forest_classifier = RandomForestClassifier(
    bootstrap=False,
    criterion='gini',
    max_features=0.21,
    min_samples_leaf=9,
    min_samples_split=3,
    n_jobs=-1,
)
catboost_classifier = CatBoostClassifier(thread_count=-1, verbose=0)

model = Blending(models=[mlp_classifier, forest_classifier, catboost_classifier])

# Pass the train and validation splits plus the test features to fit_predict;
# the returned predictions are kept in `preds` for the metrics step.
preds = model.fit_predict(
    preprocessor.X_train,
    preprocessor.y_train,
    preprocessor.X_val,
    preprocessor.y_val,
    preprocessor.X_test,
)

=== Обучаем MLPClassifier...  ===
=== Обучаем RandomForestClassifier...  ===
=== Обучаем CatBoostClassifier...  ===
=== Обучаем мета модель... ===


## Testing

In [10]:
# Compute evaluation metrics from the predictions and the test labels.
# NOTE(review): the arguments are passed as (predictions, labels) — confirm this
# matches the parameter order expected by Metrics.get_metrics.
metrics = Metrics.get_metrics(preds, preprocessor.y_test)
print(metrics)
# Let's look at the metrics

{'accuracy': 0.9996530130488951, 'precision': 0.9779411764705882, 'recall': 0.5175097276264592, 'f1': 0.6768447837150128, 'auc_roc': 0.7587507626624467}


## Logging

In [11]:
# Hand the fitted model, the preprocessor and the metrics dict to Saver.save,
# with save_csv=False.
# NOTE(review): what is written and where is defined in helper.saver, which is
# not visible here — confirm before relying on the saved artifacts.
Saver.save(model, preprocessor, metrics, save_csv=False)