In [1]:
import sys

# Make the project's 'src' directory importable so the modules created under
# 'data/', 'utils/', etc. can be imported from this notebook.
# The path is relative, so it is resolved against the kernel's working directory.
SRC_DIR = '../src/'

# Guard the insert so that re-running this cell does not keep stacking
# duplicate entries at the front of sys.path.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

In [2]:
# Data load step

from data.data_load import DataLoad

In [3]:
# Instantiate the data loader (constructed with no arguments)

data_loader = DataLoad()

# Load the CSV file as a dataframe; 'train_dataset_name' is the name passed to
# 'load_data()', which resolves the file through the YAML config file

df = data_loader.load_data('train_dataset_name')

# Preview the first rows of the loaded dataframe.
# NOTE(review): the recorded preview includes an 'Unnamed: 0' column, which
# looks like a row index that was saved into the CSV -- confirm whether it
# should be dropped before modelling.
df.head()

[2m2024-03-12 22:10:13[0m [[32m[1minfo     [0m] [1mStaring data loading with: train_dataset_name[0m
[2m2024-03-12 22:10:14[0m [[32m[1minfo     [0m] [1mData loaded successfully![0m


Unnamed: 0,target,TaxaDeUtilizacaoDeLinhasNaoGarantidas,Idade,NumeroDeVezes30-59DiasAtrasoNaoPior,TaxaDeEndividamento,RendaMensal,NumeroDeLinhasDeCreditoEEmprestimosAbertos,NumeroDeVezes90DiasAtraso,NumeroDeEmprestimosOuLinhasImobiliarias,NumeroDeVezes60-89DiasAtrasoNaoPior,NumeroDeDependentes
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0


In [4]:
# Data validation step

from data.data_validation import DataValidation


# Instantiate the validator (constructed with no arguments)

data_validator = DataValidation()


# Run the validation on the loaded dataframe. Per the recorded log output this
# covers a data shape check and a column types check. The call's return value
# is what the cell displays (True in the recorded run).
data_validator.run(df)

[2m2024-03-12 22:10:16[0m [[32m[1minfo     [0m] [1mStarting data shape checking[0m
[2m2024-03-12 22:10:16[0m [[32m[1minfo     [0m] [1mData shape validation started![0m
[2m2024-03-12 22:10:16[0m [[32m[1minfo     [0m] [1mStarting column types validation[0m
[2m2024-03-12 22:10:16[0m [[32m[1minfo     [0m] [1mColumn validation passed![0m
[2m2024-03-12 22:10:16[0m [[32m[1minfo     [0m] [1mValidation step concluded![0m


True

In [5]:
# Data transformation step
from data.data_transformation import DataTransformation

# Hand the loaded dataframe to the transformation helper.
data_transformer = DataTransformation(df)

# Unpack the four objects returned by the split, in the order the helper
# returns them: train/validation features, then train/validation targets.
(
    X_train,
    X_val,
    y_train,
    y_val,
) = data_transformer.train_test_data_split()

[2m2024-03-12 22:10:17[0m [[32m[1minfo     [0m] [1mTarget 'target' variable found in the dataframe. Performing X, y split and train/val split[0m
[2m2024-03-12 22:10:17[0m [[32m[1minfo     [0m] [1mData split performed successfully![0m


In [6]:
# Data preprocessing step

from data.data_preprocessing import DataPreprocessing

In [7]:
# Importing preprocessors and sklearn pipeline

# pipeline
from sklearn.pipeline import Pipeline

# imputers and discretizers
from feature_engine.imputation import MeanMedianImputer
from feature_engine.discretisation import EqualFrequencyDiscretiser, EqualWidthDiscretiser

# scaler
from sklearn.preprocessing import StandardScaler

# sklearn wrapper
from feature_engine.wrappers import SklearnTransformerWrapper


# helper that loads the YAML config file
from utils.utils import load_config_file


In [8]:
# Loading yaml file
yaml_file = load_config_file()

In [9]:
# Preprocessing pipeline: three named steps that run in the order listed.
# The variable lists for the imputer and the discretiser are read from the
# loaded YAML config.
pipeline = Pipeline(
    steps=[
        (
            'imputer',
            MeanMedianImputer(variables=yaml_file['variables_imputer']),
        ),
        (
            'discretizer',
            EqualFrequencyDiscretiser(variables=yaml_file['variables_discretiser']),
        ),
        # The wrapper is given only the scaler; no 'variables' list is passed.
        ('scaler', SklearnTransformerWrapper(transformer=StandardScaler())),
    ]
)

In [10]:
# Display the assembled pipeline so its steps can be inspected before fitting.
pipeline

In [11]:
# Wrap the sklearn pipeline in the project's DataPreprocessing class; the
# resulting 'data_preprocessor' object is used below to fit and transform.

data_preprocessor = DataPreprocessing(pipeline)

[2m2024-03-12 22:10:25[0m [[32m[1minfo     [0m] [1mPipeline started...[0m


In [12]:
# Fit the pipeline on the training features only (X_val is not passed here),
# so the validation split plays no part in the fitting step.

data_preprocessor.fit_pipeline(X_train)

[2m2024-03-12 22:10:29[0m [[32m[1minfo     [0m] [1mPipeline fitting started...[0m
[2m2024-03-12 22:10:29[0m [[32m[1minfo     [0m] [1mPipeline fitting finished![0m


In [13]:
# Run both splits through the already-fitted pipeline, training split first
# and validation split second, and bind each result to its own name.
X_train_processed, X_val_processed = (
    data_preprocessor.transform_pipeline(split) for split in (X_train, X_val)
)

[2m2024-03-12 22:10:30[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline started...[0m
[2m2024-03-12 22:10:30[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline finished![0m
[2m2024-03-12 22:10:30[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline started...[0m
[2m2024-03-12 22:10:30[0m [[32m[1minfo     [0m] [1mData transformation with fitted pipeline finished![0m
