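# Notebook overview

Trains a pipeline selected from `pipelines.PIPELINES` on the application data and prints a classification report for a held-out 20% split.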

# Imports
## Libraries

In [364]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import mlflow
import gc, warnings, os
import importlib
import pipelines
import data_preprocessing as process
import transformers

# Reload the helper modules so that edits to their source files are picked up
# without restarting the kernel. The reloads must run before the
# `from ... import ...` lines below, otherwise those names would still be
# bound to objects from the previously loaded module versions.
# NOTE(review): `transformers` here appears to be a project-local module
# (it exports ApplicationCleaner), not the Hugging Face package — confirm.
importlib.reload(process)
importlib.reload(transformers)
importlib.reload(pipelines)


from transformers import ApplicationCleaner
from pipelines import PIPELINES, DEFAULT_CONFIG

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report

# Hide FutureWarning messages for the rest of the session.
warnings.simplefilter('ignore', FutureWarning)

# Show full cell contents (no column-width truncation) and render floats
# with four decimal places.
pd.options.display.max_colwidth = None
pd.set_option('display.float_format', '{:.4f}'.format)

# Pipeline

In [274]:
def display_scores(y_true, y_pred):
    """Print the scikit-learn classification report for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.
    """
    report = classification_report(y_true, y_pred)
    print(report)

In [367]:
def train(pipeline_name, config, dev_mode=True):
    """Fit the named pipeline on the application data and print hold-out scores.

    Parameters
    ----------
    pipeline_name : str
        Key into ``PIPELINES`` selecting the pipeline factory to build.
    config : dict
        Configuration handed to the factory through its ``config`` keyword.
    dev_mode : bool, default True
        Forwarded unchanged to ``process.load_data``.
        NOTE(review): presumably selects a reduced development sample — confirm
        against ``data_preprocessing.load_data``.

    Returns
    -------
    None
        All results are printed; nothing is returned.
    """
    print('Training the model')

    data = process.load_data(dev_mode)
    application = data['application']

    X = application.drop(process.TARGET_COLUMN, axis=1)
    y = application[process.TARGET_COLUMN]

    # Ordered split: with shuffle=False the first 80% of rows form the training
    # set and the trailing 20% the hold-out set. random_state has no effect
    # while shuffle is disabled; it is kept so the split stays reproducible if
    # shuffling is ever switched on.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
        shuffle=False,
    )

    print('Train shape: {}'.format(X_train.shape))
    # Report the hold-out feature matrix so both lines describe feature sets
    # (train vs. validation) rather than the training labels.
    print('Valid shape: {}'.format(X_test.shape))

    pipeline = PIPELINES[pipeline_name](config=config)

    print('Start pipeline fit and transform')
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    display_scores(y_test, y_pred)

In [366]:
# Options for the 'preprocessing' section of the pipeline configuration:
# imputers for missing values and the scaler instance, each with an on/off flag.
config = dict(
    preprocessing=dict(
        should_fill_na=True,
        num_imputer=SimpleImputer(strategy='median'),
        cat_imputer=SimpleImputer(strategy='most_frequent'),
        should_scale=True,
        scaler=StandardScaler(),
    ),
)


# Run the 'log_regression' pipeline with dev_mode switched off.
train(pipeline_name='log_regression', config=config, dev_mode=False)

Training the model
Loading application_train ...
Loading Done.
FLAG_OWN_CAR
N    162480
Y     83528
Name: count, dtype: int64
Train shape: (246008, 121)
Valid shape: (246008,)
Start pipeline fit and transform
Cleaning data...
Extracting features..
New X shape: (246008, 128)
Removing null values...
Removed 7449679 null values
Encoding values...
New X shape: (246008, 250)
Scaling values...
FLAG_OWN_CAR
0    162480
1     83528
Name: count, dtype: int64
Cleaning data...
Extracting features..
New X shape: (61503, 128)
Removing null values...
Removed 1851234 null values
Encoding values...
New X shape: (61503, 250)
Scaling values...
              precision    recall  f1-score   support

           0       0.92      1.00      0.96     56619
           1       0.00      0.00      0.00      4884

    accuracy                           0.92     61503
   macro avg       0.46      0.50      0.48     61503
weighted avg       0.85      0.92      0.88     61503



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
