In [1]:
# Load IPython's autoreload extension, which provides the %autoreload magic.
%load_ext autoreload

In [2]:
autoreload 2

In [None]:
# Optional one-time setup, kept disabled: uncomment both lines to install the
# project's requirements with the interpreter that runs this kernel.
# import sys
# !{sys.executable} -m pip install -r ../../requirements.txt

In [4]:
import sys
import os

# Make the parent directory and its `helper` folder importable. Both paths are
# resolved against the kernel's current working directory and are appended
# only when they are not already on sys.path, so re-running the cell is safe.
module_path = os.path.abspath('..')
helper_path = os.path.abspath('../helper')

for extra_path in (module_path, helper_path):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from helper.dataclass import HDDDataset
from helper.models.mymodel import MyModel
from helper.preprocessing import *
from helper.metrics import *
from helper.eda import *
from sklearn.model_selection import train_test_split
from helper.metrics import *
from helper.saver import Saver

## Data preprocessing

In [5]:
# Read the source CSV into the project's HDDDataset wrapper. The path is
# relative, so it is resolved against the kernel's working directory.
DATASET_CSV = 'ST14000NM001G.csv'
hdd_dataset = HDDDataset.read_csv(DATASET_CSV)

In [6]:
# Run the project's preprocessing pipeline on the loaded dataset. Return values
# are discarded, so each call keeps its result on `preprocessor`; every step
# logs its stage (see the cell output). Per those logs, time features are added
# to the unsplit data, and normalization runs on the already split data.
preprocessor = Preprocessing(hdd_dataset)
# Logged as "Clearing unused columns...".
preprocessor.clear_unused_data()
# Logged as "Adding target column...".
preprocessor.add_target_column()
preprocessor.prepare_train_df()
# NOTE(review): runs before the split; confirm these features cannot leak
# information from test rows into training rows.
preprocessor.add_time_features()
# 80% train / 20% test, no validation part.
# NOTE(review): no seed is passed here; confirm the split is reproducible
# inside helper.preprocessing.
preprocessor.train_test_val_split(train_size=0.8, test_size=0.2, val_size=0)
# Normalizes with the 'boxcoxlog' method; inplace=True presumably overwrites
# the stored splits rather than returning copies - verify in the helper.
preprocessor.normalize_data(method='boxcoxlog', inplace=True)

Clearing unused columns...
Adding target column...
Prepairing train dataset...
Adding time features to unsplitted dataset...
Splitting train dataset...
Normalizing splitted dataset with boxcoxlog...


## Training

In [7]:
# Create the project's model wrapper for CatBoost and fit it on the training
# split. With method='grid' the helper logs a hyperparameter optimisation run
# and the best parameters it found (see the cell output).
# NOTE(review): how `epochs=20` is used for a CatBoost grid search is not
# visible from this notebook - confirm in helper.models.mymodel.
model = MyModel('CatBoost')
model.fit(
    preprocessor.X_train,
    preprocessor.y_train,
    method='grid',
    epochs=20,
)

# Print the parameter dict reported by the fitted model.
fitted_params = model.get_hyperparams()
print(fitted_params)

=== Оптимизация гиперпараметров для CatBoost... ===
Лучшие параметры для CatBoost:
{'depth': 8, 'iterations': 200, 'learning_rate': 0.1}
{'nan_mode': 'Min', 'eval_metric': 'Logloss', 'iterations': 200, 'sampling_frequency': 'PerTree', 'leaf_estimation_method': 'Newton', 'random_score_type': 'NormalWithModelSizeDecrease', 'grow_policy': 'SymmetricTree', 'penalties_coefficient': 1, 'boosting_type': 'Plain', 'model_shrink_mode': 'Constant', 'feature_border_type': 'GreedyLogSum', 'bayesian_matrix_reg': 0.10000000149011612, 'eval_fraction': 0, 'force_unit_auto_pair_weights': False, 'l2_leaf_reg': 3, 'random_strength': 1, 'rsm': 1, 'boost_from_average': False, 'model_size_reg': 0.5, 'pool_metainfo_options': {'tags': {}}, 'subsample': 0.800000011920929, 'use_best_model': False, 'class_names': [0, 1], 'random_seed': 0, 'depth': 8, 'posterior_sampling': False, 'border_count': 254, 'classes_count': 0, 'auto_class_weights': 'None', 'sparse_features_conflict_fraction': 0, 'leaf_estimation_backtrac

## Testing

In [8]:
# Predict on the test split and compute the project's metric dict. `preds` and
# `metrics` are reused by later cells (classification report and Saver.save).
preds = model.predict(preprocessor.X_test)
# NOTE(review): the printed auc_roc (0.7772) equals the mean of recall (0.5545)
# and the near-perfect specificity implied by the printed precision and class
# counts, i.e. the balanced accuracy. That suggests AUC is computed from hard
# labels rather than predicted probabilities - confirm in helper.metrics.
metrics = Metrics.get_metrics(preds, preprocessor.y_test)
print(metrics)

{'accuracy': 0.9996735037352402, 'precision': 0.9661016949152542, 'recall': 0.5544747081712063, 'f1': 0.7045735475896168, 'auc_roc': 0.777230518820282}


In [9]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split. The classes are heavily
# imbalanced (see the support column), so the row for class 1 is the
# informative one; overall accuracy is dominated by class 0.
report_text = classification_report(preprocessor.y_test, preds)
print(report_text)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1463001
           1       0.97      0.55      0.70      1028

    accuracy                           1.00   1464029
   macro avg       0.98      0.78      0.85   1464029
weighted avg       1.00      1.00      1.00   1464029



## Logging

In [10]:
# Hand the fitted model, the preprocessor and the metric dict to the project's
# Saver. save_csv=False presumably skips writing the data as CSV - confirm in
# helper.saver.
Saver.save(
    model,
    preprocessor,
    metrics,
    save_csv=False,
)