In [1]:
# import numpy as np
# import pandas as pd
# from sklearn.base import BaseEstimator, TransformerMixin


# class LogTransformer(BaseEstimator, TransformerMixin):
#     """
#     Log transforming
#     """

#     def __init__(self, copy=None):
#         self.copy = copy

#     def fit(self, X, y=None):
#         return self

#     def transform(self, X, y=None, copy=None):
#         return np.log(X + 1)

#     def fit_transform(self, X, y=None):
#         return self.fit(X, y).transform(X, y)

import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class LogTransformer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that applies ``log(1 + x)`` element-wise.

    The transformer has no hyperparameters and learns nothing from the data,
    so ``fit`` is a no-op and ``transform`` can be called on any input that
    NumPy can treat as numeric (arrays, DataFrames, nested sequences).
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (kept for the estimator API)."""
        return self

    def transform(self, X, y=None):
        """
        Return ``log(1 + X)`` computed element-wise.

        ``y`` is accepted for call compatibility and ignored.
        """
        # log1p evaluates log(1 + x) without first rounding the sum 1 + x,
        # so inputs that are tiny relative to 1 keep their precision.
        return np.log1p(X)

    def fit_transform(self, X, y=None):
        """Equivalent to ``fit(X, y)`` followed by ``transform(X, y)``."""
        return self.fit(X, y).transform(X, y)

# Imports

In [2]:
import os
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from yaml import load as yaml_load, FullLoader
import pickle
import sklearn
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report

In [3]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [4]:
# Random seed passed to the train/test split and to the classifiers.
RS = 42

In [5]:
from logger import logger

## Data loading and train / out-of-sample split

In [6]:
# Read the raw dataset and hold out 25% of the rows as an out-of-sample set,
# stratified on the 'condition' target so both parts keep its class balance.
df = pd.read_csv('data/raw/heart_cleveland_upload.csv')
df_train, df_oos = train_test_split(
    df,
    test_size=0.25,
    random_state=RS,
    stratify=df['condition'],
)
# Shapes of the full, training and out-of-sample frames.
df.shape, df_train.shape, df_oos.shape

((297, 14), (222, 14), (75, 14))

In [7]:
# Persist both partitions (without the index column) so the split can be reused.
df_train.to_csv(path_or_buf='data/raw/train.csv', index=False)
df_oos.to_csv(path_or_buf='data/raw/oos.csv', index=False)

In [8]:
# Training target is the 'condition' column; every other column is a feature.
y_train = df_train['condition']
X_train = df_train.drop(columns=['condition'])

In [9]:
# Same feature/target separation for the out-of-sample partition.
y_test = df_oos['condition']
X_test = df_oos.drop(columns=['condition'])

## Hyperparameter search

In [10]:
from sklearn.preprocessing import StandardScaler

In [50]:
def build_pipe(clf_name, r_state):
    """
    Build an unfitted pipeline: log transform followed by the chosen classifier.

    Parameters
    ----------
    clf_name : str
        'logreg' for LogisticRegression or 'svm' for SVC.
    r_state
        Value forwarded as ``random_state`` to the classifier.

    Returns
    -------
    Pipeline
        Two steps named 'log_scaler' (LogTransformer) and 'clf' (the classifier).

    Raises
    ------
    KeyError
        If ``clf_name`` is neither 'logreg' nor 'svm'.
    """
    # Reject unsupported names up front.
    if clf_name not in ('logreg', 'svm'):
        raise KeyError('Unknown classifier: {}'.format(clf_name))

    if clf_name == 'logreg':
        estimator = LogisticRegression(
            class_weight='balanced',
            solver='saga',
            n_jobs=-1,
            random_state=r_state,
        )
    else:
        estimator = svm.SVC(random_state=r_state)

    return Pipeline(steps=[
        ('log_scaler', LogTransformer()),
        ('clf', estimator),
    ])

In [51]:
# Candidate values for the logistic-regression C parameter.
param_range_fl = [1.0, 0.5]

# One grid for the 'clf' pipeline step: both penalties, each C value, liblinear solver.
logreg_params = [
    {
        'clf__penalty': ['l1', 'l2'],
        'clf__C': param_range_fl,
        'clf__solver': ['liblinear'],
    },
]


In [52]:
# Candidate values for the SVC C parameter.
param_range = [9, 10]

# One grid for the 'clf' pipeline step: linear and RBF kernels, each C value.
svm_params = [
    {
        'clf__kernel': ['linear', 'rbf'],
        'clf__C': param_range,
    },
]

In [53]:
# Map each classifier key accepted by build_pipe to its grid-search space.
classifiers = {
    'logreg': logreg_params,
    'svm': svm_params,
}

In [54]:
# Show the registered search spaces.
classifiers

{'logreg': [{'clf__penalty': ['l1', 'l2'],
   'clf__C': [1.0, 0.5],
   'clf__solver': ['liblinear']}],
 'svm': [{'clf__kernel': ['linear', 'rbf'], 'clf__C': [9, 10]}]}

## logreg

In [55]:
# Classifier key for this run; it selects both the pipeline built by build_pipe
# and the search space in `classifiers`. Alternative:
# clf = 'svm'
clf = 'logreg'

In [56]:
# Same seed value as the RS assignment near the top of the notebook, re-stated here.
RS=42

In [57]:
# Unfitted pipeline for the selected classifier, seeded with RS.
pipe = build_pipe(clf_name=clf, r_state=RS)

In [58]:
# Show the pipeline steps and their parameters.
pipe

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='saga'))])

In [59]:
# Parameter grid registered for the selected classifier key.
search_space = classifiers[clf]
search_space

[{'clf__penalty': ['l1', 'l2'],
  'clf__C': [1.0, 0.5],
  'clf__solver': ['liblinear']}]

In [60]:
# Grid search over `search_space` for the pipeline, scored by accuracy with 10-fold CV.
gs = GridSearchCV(
    pipe,
    search_space,
    scoring='accuracy',
    n_jobs=-1,
    cv=10,
    verbose=1,
)

In [61]:
# Show the configured grid search.
gs

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('log_scaler', LogTransformer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           n_jobs=-1,
                                                           random_state=42,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [1.0, 0.5], 'clf__penalty': ['l1', 'l2'],
                          'clf__solver': ['liblinear']}],
             scoring='accuracy', verbose=1)

In [62]:
# Install an "ignore" filter with no category or module restriction, i.e. for all warnings.
# NOTE(review): a blanket filter can hide relevant warnings (e.g. raised while fitting) — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [63]:
# Run the cross-validated grid search for the selected classifier.
logger.info("Train %s model ..." % (clf,))
gs.fit(X_train, y_train)

# Report the winning parameter combination and its score.
# Note: GridSearchCV.best_score_ is the mean cross-validated score of the best candidate.
logger.info("Best params are : %s" % (gs.best_params_,))
logger.info('Best training accuracy: %.3f' % (gs.best_score_,))

# Evaluate the best parameters on the out-of-sample split.
y_pred = gs.predict(X_test)
logger.info('Test set accuracy score for best params: %.3f ' % (accuracy_score(y_test, y_pred),))

Train logreg model ...


INFO:logger:Train logreg model ...


Fitting 10 folds for each of 4 candidates, totalling 40 fits
Best params are : {'clf__C': 0.5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}


INFO:logger:Best params are : {'clf__C': 0.5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}


Best training accuracy: 0.829


INFO:logger:Best training accuracy: 0.829


Test set accuracy score for best params: 0.893 


INFO:logger:Test set accuracy score for best params: 0.893 


In [64]:
# Keep the winning hyperparameters so they can be applied to a pipeline via set_params.
params = gs.best_params_
params

{'clf__C': 0.5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

In [65]:
# `model` is not created by any earlier cell, so build a fresh pipeline for the
# selected classifier here before applying the best hyperparameters to it.
model = build_pipe(clf, r_state=RS)
model.set_params(**params)

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(C=0.5, class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='liblinear'))])

In [66]:
# Fit the pipeline on the training split.
model.fit(X_train, y_train)

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(C=0.5, class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='liblinear'))])

In [67]:
# Predictions of the fitted pipeline on the training and out-of-sample splits.
preds_train = model.predict(X_train)
preds_oos = model.predict(X_test)
# preds_oot = model.predict(X_oot)

In [68]:
# NOTE: despite the `_f1_macro` suffix, both values are plain accuracy
# (accuracy_score), not macro-averaged F1.
train_f1_macro = accuracy_score(y_train, preds_train)
oos_f1_macro = accuracy_score(y_test, preds_oos)
# oot_f1_macro = f1_score(y_oot, preds_oot, average='macro')

In [69]:
# The values printed here are computed with accuracy_score, so label them as accuracy.
print('Train accuracy=', train_f1_macro)
# print('Cross Validation F1 macro=', model.best_score_, 'std=', model.cv_results_['std_test_score'][model.best_index_])
print('Out of sample accuracy=', oos_f1_macro)
# print('Out of time F1 macro=', oot_f1_macro)

Train F1 macro= 0.8378378378378378
Out of sample F1 macro= 0.8933333333333333


In [70]:
# Save model
# NOTE(review): `config` is not defined in any visible cell of this notebook —
# confirm where it is loaded, otherwise this cell fails on a fresh kernel.
# The context manager closes the file handle once the pickle has been written.
with open(config.model_params['MODEL_PATH'], 'wb') as model_file:
    pickle.dump(gs.best_estimator_, model_file)
print('Обученная модель сохранена {}'.format(config.model_params['MODEL_PATH']))

# Out of sample classification report

In [71]:
# Text report for the out-of-sample predictions made by `model`.
print(classification_report(y_test, preds_oos))

              precision    recall  f1-score   support

           0       0.85      0.97      0.91        40
           1       0.97      0.80      0.88        35

    accuracy                           0.89        75
   macro avg       0.91      0.89      0.89        75
weighted avg       0.90      0.89      0.89        75

