In [1]:
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin


class LogTransformer(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that applies ``log(1 + x)`` element-wise.

    Nothing is learned from the data: ``fit`` is a no-op and ``transform``
    depends only on its input, so the transformer can be used as a
    pipeline step without any prior fitting state.
    """

    def __init__(self):
        # No hyperparameters to store.
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (``X`` and ``y`` are ignored)."""
        return self

    def transform(self, X, y=None):
        """
        Return ``log(1 + X)`` computed element-wise.

        ``y`` is accepted for signature compatibility and ignored.
        """
        # log1p is more accurate than log(X + 1) for values close to zero and
        # avoids the intermediate integer addition, which can overflow.
        return np.log1p(X)

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and transform ``X`` in one call."""
        return self.fit(X, y).transform(X, y)

# Imports

In [2]:
import os
import pandas as pd
import numpy as np
from yaml import load as yaml_load, FullLoader
import pickle
import sklearn
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, classification_report

In [3]:
from sklearn import svm
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [4]:
from logger import logger
import config_logreg as config

In [5]:
# Random seed read from the project config; used below as random_state for the train/test split.
RS=config.data_params['random_state']

## Data loading and train/test split

In [6]:
# Read the raw dataset from the path given in the project config.
df = pd.read_csv(config.data_params['raw_data_path'])

# Hold out an out-of-sample set, stratified on the 'condition' target column.
df_train, df_oos = train_test_split(
    df,
    test_size=config.data_params['test_size'],
    random_state=RS,
    stratify=df['condition'],
)

# Shapes of the full, training and out-of-sample frames.
df.shape, df_train.shape, df_oos.shape

((297, 14), (252, 14), (45, 14))

In [7]:
# Persist both splits (without the index column) to the paths given in config.data_params.
df_train.to_csv(config.data_params['train_data_path'], index=False)
df_oos.to_csv(config.data_params['test_data_path'], index=False)

In [8]:
# Target is the 'condition' column; every other column is a feature.
y_train = df_train['condition']
X_train = df_train.drop(columns=['condition'])

In [9]:
# Same feature/target separation for the out-of-sample split.
y_test = df_oos['condition']
X_test = df_oos.drop(columns=['condition'])

## Hyperparameter search

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
def build_pipe(clf_name, r_state):
    """
    Build the modelling pipeline for the requested classifier.

    The pipeline has two steps: ``'log_scaler'`` (a ``LogTransformer``) and
    ``'clf'`` (the classifier). Only the settings hard-coded below are
    applied here; no other hyperparameters are set by this function.

    Parameters
    ----------
    clf_name : str
        ``'logreg'`` for ``LogisticRegression`` or ``'svm'`` for ``svm.SVC``.
    r_state : int
        Passed to the classifier as ``random_state``.

    Returns
    -------
    Pipeline
        The unfitted two-step pipeline.

    Raises
    ------
    KeyError
        If ``clf_name`` is not one of the supported names.
    """
    if clf_name == 'logreg':
        classifier = LogisticRegression(class_weight='balanced',
                                        n_jobs=-1,
                                        random_state=r_state)
    elif clf_name == 'svm':
        classifier = svm.SVC(random_state=r_state)
    else:
        # Pass the name as a logging argument rather than formatting with `%`:
        # a tuple-valued clf_name would otherwise raise TypeError here and
        # mask the intended KeyError.
        logger.error("Unknown classifier: %s ...", clf_name)
        raise KeyError('Unknown classifier: {}'.format(clf_name))

    return Pipeline([
        ('log_scaler', LogTransformer()),
        ('clf', classifier),
    ])

## logreg

In [12]:
# Name of the classifier to train, read from the project config; passed to build_pipe below.
clf = config.model_name
clf

'logreg'

In [13]:
# Reuse the seed from the project config instead of a hard-coded literal, so the
# classifier is seeded consistently with the train/test split above.
RS = config.data_params['random_state']

In [14]:
# Build the unfitted pipeline for the configured classifier, seeded with RS.
pipe = build_pipe(clf, r_state=RS)

In [15]:
# Display the pipeline's repr to check its steps.
pipe

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(class_weight='balanced', n_jobs=-1,
                                    random_state=42))])

In [16]:
# Hyperparameter grid read from the project config; passed to GridSearchCV as param_grid below.
search_space = config.search_space
search_space

[{'clf__penalty': ['l1', 'l2'],
  'clf__C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
  'clf__solver': ['liblinear', 'saga']}]

In [17]:
# Grid search over `search_space` for `pipe`: 10-fold cross-validation,
# candidates ranked by accuracy, all available cores used.
gs = GridSearchCV(
    pipe,
    search_space,
    scoring='accuracy',
    cv=10,
    n_jobs=-1,
    verbose=1,
)

In [18]:
# Display the configured grid search (it is fitted in a later cell).
gs

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('log_scaler', LogTransformer()),
                                       ('clf',
                                        LogisticRegression(class_weight='balanced',
                                                           n_jobs=-1,
                                                           random_state=42))]),
             n_jobs=-1,
             param_grid=[{'clf__C': [0.01, 0.05, 0.1, 0.5, 1, 5, 10],
                          'clf__penalty': ['l1', 'l2'],
                          'clf__solver': ['liblinear', 'saga']}],
             scoring='accuracy', verbose=1)

In [19]:
import warnings
# Suppress every warning category from this point on.
# NOTE(review): this blanket filter also hides any solver warnings raised during the
# grid search below — consider ignoring only the specific categories that are expected.
warnings.filterwarnings("ignore")

In [20]:
logger.info("Train %s model ...", clf)
# Run the grid search on the training split.
gs.fit(X_train, y_train)
logger.info("Best params are : %s", gs.best_params_)
# best_score_ is the mean cross-validated score of the best candidate,
# not a score computed on the training set, so label it accordingly.
logger.info('Best cross-validation accuracy: %.3f', gs.best_score_)
# Predict on the out-of-sample split with the best estimator found.
y_pred = gs.predict(X_test)
# Out-of-sample accuracy of the model with the best params.
logger.info('Test set accuracy score for best params: %.3f ', accuracy_score(y_test, y_pred))

Train logreg model ...


INFO:logger:Train logreg model ...


Fitting 10 folds for each of 28 candidates, totalling 280 fits


INFO:numexpr.utils:NumExpr defaulting to 8 threads.


Best params are : {'clf__C': 5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}


INFO:logger:Best params are : {'clf__C': 5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}


Best training accuracy: 0.829


INFO:logger:Best training accuracy: 0.829


Test set accuracy score for best params: 0.911 


INFO:logger:Test set accuracy score for best params: 0.911 


In [21]:
# Display the hyperparameter combination selected by the grid search.
gs.best_params_

{'clf__C': 5, 'clf__penalty': 'l2', 'clf__solver': 'liblinear'}

In [22]:
# Apply the selected hyperparameters to the standalone pipeline `pipe`.
pipe.set_params(**gs.best_params_)

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(C=5, class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='liblinear'))])

In [23]:
# Fit `pipe` on the full training split with the selected hyperparameters.
# NOTE(review): presumably duplicates gs.best_estimator_, since GridSearchCV refits
# the best candidate by default — confirm whether this second fit is needed.
pipe.fit(X_train, y_train)

Pipeline(steps=[('log_scaler', LogTransformer()),
                ('clf',
                 LogisticRegression(C=5, class_weight='balanced', n_jobs=-1,
                                    random_state=42, solver='liblinear'))])

In [24]:
# Class predictions from the fitted `pipe` on the training and out-of-sample splits.
preds_train = pipe.predict(X_train)
preds_test = pipe.predict(X_test)

In [25]:
# Accuracy of `pipe` on each split; compare the two to spot over- or under-fitting.
train_accuracy_score = accuracy_score(y_train, preds_train)
test_accuracy_score = accuracy_score(y_test, preds_test)

In [26]:
# Report both accuracies at full precision.
print('Train accuracy_score=', train_accuracy_score)
print('test accuracy_score=', test_accuracy_score)

Train accuracy_score= 0.8452380952380952
test accuracy_score= 0.9111111111111111


In [27]:
# Save model
# The context manager guarantees the file handle is flushed and closed,
# even if pickling raises.
# NOTE(review): the pickled pipeline contains LogTransformer, which is defined in this
# notebook — loading the file elsewhere presumably requires the class to be importable
# under the same module path; confirm in the consumer.
with open(config.model_path, 'wb') as model_file:
    pickle.dump(gs.best_estimator_, model_file)
logger.info('Обученная модель сохранена: %s', config.model_path)

Обученная модель сохранена: ../model/clf_pipeline.pickle


INFO:logger:Обученная модель сохранена: ../model/clf_pipeline.pickle


# Out-of-sample classification report

In [28]:
# Per-class precision/recall/F1 for the out-of-sample predictions made by `pipe`.
print(classification_report(y_test, preds_test))

              precision    recall  f1-score   support

           0       0.86      1.00      0.92        24
           1       1.00      0.81      0.89        21

    accuracy                           0.91        45
   macro avg       0.93      0.90      0.91        45
weighted avg       0.92      0.91      0.91        45

