# 0 - Imports

In [23]:
import joblib
import pandas as pd
import pandera
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from pandera import Check, Column, DataFrameSchema
from feature_engine.discretisation import EqualFrequencyDiscretiser
from feature_engine.imputation import MeanMedianImputer
from feature_engine.wrappers import SklearnTransformerWrapper


# Utils

In [24]:
# Pre-defining columns to use

columns_to_use = ['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'Idade',
                  'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
                  'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
                  'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
                  'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes']

In [25]:
# Information to yaml configuration file

data_path = 'datasets/train.csv'
data_types = {'target':pd.Int64Dtype(),
              'TaxaDeUtilizacaoDeLinhasNaoGarantidas':pd.Float64Dtype(),
              'Idade':pd.Int64Dtype(),
              'NumeroDeVezes30-59DiasAtrasoNaoPior':pd.Int64Dtype(),
              'TaxaDeEndividamento':pd.Float64Dtype(),
              'RendaMensal':pd.Float64Dtype(),
              'NumeroDeLinhasDeCreditoEEmprestimosAbertos':pd.Int64Dtype(),
              'NumeroDeVezes90DiasAtraso':pd.Int64Dtype(),
              'NumeroDeEmprestimosOuLinhasImobiliarias':pd.Int64Dtype(),
              'NumeroDeVezes60-89DiasAtrasoNaoPior':pd.Int64Dtype(),
              'NumeroDeDependentes':pd.Int64Dtype()}

random_state = 42
model_name = 'Logistic_Regression'

In [26]:
df.head()

NameError: name 'df' is not defined

# 1.0 Data Load

In [27]:
class DataLoad:
    """Class responsible for data loading"""

    import pandas as pd

    def __init__(self) -> None:
        pass

    def load_data(self) -> pd.DataFrame:
        """Method for data loading"""

        loaded_data = pd.read_csv(data_path, index_col=[0], dtype=data_types)

        return loaded_data

In [28]:
# Loading data

data_loader = DataLoad() # instance

df = data_loader.load_data()[columns_to_use] # loading data to dataframe filtering pre-defined columns

df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


# 2.0 Data validation

In [29]:
df['target'].value_counts()

target
0    139974
1     10026
Name: count, dtype: Int64

In [30]:
df['target'].value_counts(normalize=True)

target
0    0.93316
1    0.06684
Name: proportion, dtype: Float64

In [31]:
df.head()

Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


In [32]:
df.columns

Index(['target', 'TaxaDeUtilizacaoDeLinhasNaoGarantidas', 'Idade',
       'NumeroDeVezes30-59DiasAtrasoNaoPior', 'TaxaDeEndividamento',
       'RendaMensal', 'NumeroDeLinhasDeCreditoEEmprestimosAbertos',
       'NumeroDeVezes90DiasAtraso', 'NumeroDeEmprestimosOuLinhasImobiliarias',
       'NumeroDeVezes60-89DiasAtrasoNaoPior', 'NumeroDeDependentes'],
      dtype='object')

In [33]:
class DataValidation:
    """Class responsible for validating data input"""

    def __init__(self, columns_to_use) -> None:
        self.columns_to_use = columns_to_use

    def check_data_shape(self, dataframe: pd.DataFrame) -> bool:
        """Method responsible for data shape checking"""

        try: #tentar o código abaixo: check do shape das colunas à usar (passada no construtor) com as colunas do DataFrame
            print('Data shape validation started!')
            dataframe.columns = self.columns_to_use
            return True

        except Exception as e: # Se o shape não bater vai ter exceção
            print(f'Error: Data shape validation failed:{e}')
            return False

    def check_columns(self, dataframe: pd.DataFrame) -> bool:
        """Method responsible for data columns data type checking.
        Each column must have its own data type format.
        """
        schema = DataFrameSchema(
            {
                "target": Column(int, Check.isin([0,1]), Check(lambda x: x>0), coerce=True),
                "TaxaDeUtilizacaoDeLinhasNaoGarantidas": Column(float, nullable=True),
                "Idade": Column(int, nullable=True),
                "NumeroDeVezes30-59DiasAtrasoNaoPior": Column(int, nullable=True),
                "TaxaDeEndividamento": Column(float, nullable=True),
                "RendaMensal": Column(float, nullable=True),
                "NumeroDeLinhasDeCreditoEEmprestimosAbertos": Column(int, nullable=True),
                "NumeroDeVezes90DiasAtraso": Column(int, nullable=True),
                "NumeroDeEmprestimosOuLinhasImobiliarias": Column(int, nullable=True),
                "NumeroDeVezes60-89DiasAtrasoNaoPior": Column(int, nullable=True),
                "NumeroDeDependentes": Column(int, nullable=True)
            }
        )
    
        try:
            schema.validate(dataframe)
            print('Column validation passed!')        
            return True

        except pandera.errors.SchemaErrors as exc:
            print('Error: Column validation failed!')
            pandera.display(exc.failure_cases)        
        return False

    def run(self, dataframe: pd.DataFrame) -> bool:
        if self.check_data_shape(dataframe) and self.check_columns(dataframe):
            print('Validation step concluded!')
            return True
        
        else:
            print('Validation failed!')
            return False

In [34]:
data_validation = DataValidation(columns_to_use)

data_validation.run(df)

Data shape validation started!
Column validation passed!
Validation step concluded!


True

# 3.0 Data transformation

In [35]:
# Nessa etapa podem ser realizadas quaisquer transformações nos dados (PCA, feature engineering, etc)
# para simplificação a classe abaixo apenas fará o split de treino e teste

In [36]:
class DataTransformation:
    """
    Class responsible for data transformations.
    
    """

    def __init__(self, dataframe: pd.DataFrame, target_variable: str):
        
        self.dataframe = dataframe
        self.target_variable = target_variable

    def train_test_data_split(self):

        X = self.dataframe.drop(columns=[self.target_variable])
        y = self.dataframe[self.target_variable]

        X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, random_state=random_state)

        return X_train, X_val, y_train, y_val    


In [37]:
# Instanciando o objeto data_transformer
data_transformer = DataTransformation(df, 'target')

# Transformando os dados
X_train, X_val, y_train, y_val = data_transformer.train_test_data_split()

In [38]:
X_train.shape[0] / df.shape[0]

0.75

In [39]:
X_val.shape[0] / df.shape[0]

0.25

# 4.0 Data preprocessing

In [40]:
class DataPreprocessing:

    def __init__(self, dataframe: pd.DataFrame, pipeline: Pipeline):
        """
        This constructor method is responsible for storing X_train and scikit-learn Pipeline object
        """
        
        self.dataframe = dataframe
        self.pipeline = pipeline

    def pipeline_(self):
        """
        This method is responsible for fiting the pipeline
        """

        train_pipeline = self.pipeline
        train_pipeline.fit(self.dataframe)

        return train_pipeline
    
    def run(self):
        """
        This method is responsible for transforming the X by using fited Pipeline object
        """
        print('Preprocessor started...')
        trained_pipeline = self.pipeline_()
        df_proprocessed = trained_pipeline.transform(self.dataframe)
        print('Finished!')

        return df_proprocessed


In [41]:
# Definindo o objeto Pipeline para passar como argumento no construtor da classe DataPreprocessing

pipe =  Pipeline([
    ('imputer', MeanMedianImputer(variables=['RendaMensal','NumeroDeDependentes'])),
    ('discretiser', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                         'TaxaDeEndividamento',
                                                         'RendaMensal'])),
    ('scaler', SklearnTransformerWrapper(StandardScaler()))
])

In [42]:

# Instanciando o objeto processor da classe DataPreprocessing
data_preprocessor = DataPreprocessing(X_train, pipe)

# Processando o X_train
X_train_proprocessed = data_preprocessor.run()

Preprocessor started...
Finished!


In [43]:
# Processando o X_train utilizando o método 'pipeline.transform' do objeto 'data_preprocessor'

X_val_proprocessed = data_preprocessor.pipeline.transform(X_val)

X_val_proprocessed

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
39795,1.566697,0.455865,-0.100407,0.522233,0.409217,0.298442,-0.063926,0.866364,-0.057872,0.236879
86079,-0.174074,1.268727,-0.100407,1.566699,0.001363,3.016764,-0.063926,1.748816,-0.057872,-0.666658
134541,-1.566691,-0.695689,0.137891,1.566699,0.001363,0.492608,-0.063926,1.748816,-0.057872,-0.666658
72738,-1.218537,2.623496,-0.100407,-1.566699,0.409217,-1.449050,-0.063926,-0.898540,-0.057872,-0.666658
135945,-0.522228,0.591342,-0.100407,-0.522233,1.632778,1.269271,-0.063926,-0.016088,-0.057872,0.236879
...,...,...,...,...,...,...,...,...,...,...
62560,-1.218537,-0.966642,0.137891,0.522233,0.409217,0.492608,0.175709,0.866364,-0.057872,1.140416
66278,-1.218537,-1.644027,-0.100407,-1.218544,0.409217,-0.478221,-0.063926,-0.898540,-0.057872,-0.666658
100793,0.870389,-0.627950,-0.100407,-0.522233,-1.630052,-0.866552,-0.063926,-0.898540,-0.057872,-0.666658
135630,-0.174074,-0.221519,-0.100407,-0.174078,1.224924,-0.672387,-0.063926,0.866364,-0.057872,2.043954


In [44]:
# Exportando o processor

joblib.dump(data_preprocessor.pipeline_(), 'preprocessor.joblib')

['preprocessor.joblib']

In [45]:
data_preprocessor.pipeline_()

In [46]:
X_train.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
140300,0.034949,60,0,0.004933,8311.0,5,0,0,0,0
92417,0.155308,46,0,881.0,,6,0,1,0,0
48193,0.165166,63,1,0.020327,3000.0,8,0,0,0,0
66644,0.010886,60,0,0.642979,1167.0,6,0,1,0,0
144734,0.000717,48,0,3603.0,,15,0,3,0,0


In [47]:
X_train_proprocessed.head()

Unnamed: 0,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
140300,-0.870382,0.523604,-0.100407,-1.566699,1.224924,-0.672387,-0.063926,-0.89854,-0.057872,-0.666658
92417,0.17408,-0.424735,-0.100407,1.218544,0.001363,-0.478221,-0.063926,-0.016088,-0.057872,-0.666658
48193,0.17408,0.726819,0.137891,-1.566699,-1.222198,-0.089889,-0.063926,-0.89854,-0.057872,-0.666658
66644,-1.218537,0.523604,-0.100407,0.522233,-1.630052,-0.478221,-0.063926,-0.016088,-0.057872,-0.666658
144734,-1.566691,-0.289258,-0.100407,1.566699,0.001363,1.269271,-0.063926,1.748816,-0.057872,-0.666658


# 5.0 Model training

In [48]:
class ModelTraining:
    """
    """

    def __init__(self, X: pd.DataFrame, y: pd.Series):
        """
        """
        self.X = X
        self.y = y

    def fit(self, model_instance):
        """
        """
        model_instance.fit(self.X, self.y)
        joblib.dump(model_instance, model_name+'.joblib')

    def predict(self, X: pd.DataFrame):
        """
        """
        model_fitted = self._load_model()
        array_probabilities = model_fitted.predict_proba(X)
        return array_probabilities[:,1]

    def _load_model(self):
        """
        """
        loaded_model = joblib.load(model_name+'.joblib')
        return loaded_model        

In [49]:
# Instanciando o modelo a ser passado na classe:
lr_object = LogisticRegression()

In [50]:
# Instanciando o objeto trainer da classe criada acima
model_trainer = ModelTraining(X_train_proprocessed, y_train)


# Invocando o método de treinamento da classe 
model_instance = model_trainer.fit(model_instance=lr_object)

# Invocando o método de predição de probabilidades da classe
model_trainer.predict(X_train_proprocessed)

array([0.00843568, 0.04345571, 0.05017865, ..., 0.05864854, 0.01817563,
       0.10013475])

In [51]:
# Predições das probabilidades no dado de validação

y_val_pred_probabilities = model_trainer.predict(X_val_proprocessed)

y_val_pred_probabilities

array([0.14862859, 0.03414856, 0.01385968, ..., 0.09725534, 0.02930445,
       0.11755718])

# 6.0 Model evaluation

In [52]:
data_preprocessor.pipeline_()

In [53]:
class ModelEvaluating:
    """
    """

    def __init__(self):
        pass

    def eval_metrics(self, y_true, y_pred_probabilities):

        return roc_auc_score(y_true, y_pred_probabilities)

In [54]:
model_evaluator = ModelEvaluating()

model_evaluator.eval_metrics(y_val, y_val_pred_probabilities)

0.8003929165191288

# 7.0 Experiments

## 7.1 Experiment 1

In [55]:
# 1a Etapa

# Loading data

data_loader = DataLoad() # instance

df = data_loader.load_data()[columns_to_use] # loading data to dataframe filtering pre-defined columns

# ----------------------------#

# 2a Etapa

data_validation = DataValidation(columns_to_use)

data_validation.run(df)

# ----------------------------#

# 3a Etapa

# Instanciando o objeto data_transformer
data_transformer = DataTransformation(df, 'target')

# Transformando os dados
X_train, X_val, y_train, y_val = data_transformer.train_test_data_split()

# ----------------------------#

# 4a Etapa

# Definindo o objeto Pipeline para passar como argumento no construtor da classe DataPreprocessing

pipe =  Pipeline([
    ('imputer', MeanMedianImputer(variables=['RendaMensal','NumeroDeDependentes'])),
    ('discretiser', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                         'TaxaDeEndividamento',
                                                         'RendaMensal'])),
    ('scaler', SklearnTransformerWrapper(StandardScaler()))
])


# Instanciando o objeto processor da classe DataPreprocessing
data_preprocessor = DataPreprocessing(X_train, pipe)

# Processando o X_train
X_train_proprocessed = data_preprocessor.run()


# Processando o X_train utilizando o método 'pipeline.transform' do objeto 'data_preprocessor'
X_val_proprocessed = data_preprocessor.pipeline.transform(X_val)

# ----------------------------#

# 5a Etapa

# Instanciando o modelo a ser passado na classe:
lr_object = LogisticRegression()


# Instanciando o objeto trainer da classe criada acima
model_trainer = ModelTraining(X_train_proprocessed, y_train)


# Invocando o método de treinamento da classe 
model_instance = model_trainer.fit(model_instance=lr_object)


# Predições das probabilidades no dado de validação
y_val_pred_probabilities = model_trainer.predict(X_val_proprocessed)

# ----------------------------#

# 6a Etapa

model_evaluator = ModelEvaluating()

model_evaluator.eval_metrics(y_val, y_val_pred_probabilities)

Data shape validation started!
Column validation passed!
Validation step concluded!
Preprocessor started...
Finished!


0.8003929165191288

## 7.2 Experiment 2

In [56]:
# 1a Etapa

# Loading data

data_loader = DataLoad() # instance

df = data_loader.load_data()[columns_to_use] # loading data to dataframe filtering pre-defined columns

# ----------------------------#

# 2a Etapa

data_validation = DataValidation(columns_to_use)

data_validation.run(df)

# ----------------------------#

# 3a Etapa

# Instanciando o objeto data_transformer
data_transformer = DataTransformation(df, 'target')

# Transformando os dados
X_train, X_val, y_train, y_val = data_transformer.train_test_data_split()

# ----------------------------#

# 4a Etapa

# Definindo o objeto Pipeline para passar como argumento no construtor da classe DataPreprocessing

pipe =  Pipeline([
    ('imputer', MeanMedianImputer(variables=['RendaMensal','NumeroDeDependentes'])),
    ('discretiser', EqualFrequencyDiscretiser(variables=['TaxaDeUtilizacaoDeLinhasNaoGarantidas',
                                                         'TaxaDeEndividamento',
                                                         'RendaMensal'])),
    ('scaler', SklearnTransformerWrapper(StandardScaler()))
])


# Instanciando o objeto processor da classe DataPreprocessing
data_preprocessor = DataPreprocessing(X_train, pipe)

# Processando o X_train
X_train_proprocessed = data_preprocessor.run()


# Processando o X_train utilizando o método 'pipeline.transform' do objeto 'data_preprocessor'
X_val_proprocessed = data_preprocessor.pipeline.transform(X_val)

# ----------------------------#

# 5a Etapa

# Instanciando o modelo a ser passado na classe:
lr_object = DecisionTreeClassifier()


# Instanciando o objeto trainer da classe criada acima
model_trainer = ModelTraining(X_train_proprocessed, y_train)


# Invocando o método de treinamento da classe 
model_instance = model_trainer.fit(model_instance=lr_object)


# Predições das probabilidades no dado de validação
y_val_pred_probabilities = model_trainer.predict(X_val_proprocessed)

# ----------------------------#

# 6a Etapa

model_evaluator = ModelEvaluating()

model_evaluator.eval_metrics(y_val, y_val_pred_probabilities)

Data shape validation started!
Column validation passed!
Validation step concluded!
Preprocessor started...
Finished!


0.618275771229007

In [77]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:           12Gi       1.7Gi       9.7Gi       0.0Ki       993Mi        10Gi
Swap:         4.0Gi          0B       4.0Gi
