In [44]:
import pandas as pd
import numpy as np
import joblib

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from feature_engine.encoding import CountFrequencyEncoder
from feature_engine.transformation import LogTransformer 
from feature_engine.selection import DropFeatures

#cargamos operadores definidos por desarrollador
import operators

# 1. Cargamos data de entrenamiento 

In [45]:
# Load the raw training set; "../" steps up one folder from the notebook's location.
# MSSubClass, GarageCars and BsmtFullBath are cast to object dtype in a single pass.
data_train = pd.read_csv("../data/raw/train.csv").astype(
    {'MSSubClass': 'O', 'GarageCars': 'O', 'BsmtFullBath': 'O'}
)
data_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# 2. Train, Test Split

In [46]:
# Hold out 30% of the rows; 'Id' and the target 'SalePrice' are removed from the predictors.
X_train, X_test, y_train, y_test = train_test_split(
    data_train.drop(columns=['Id', 'SalePrice']),
    data_train['SalePrice'],
    test_size=0.30,
    random_state=2025,
)

# 3. Configuración del Pipeline

Luego de realizar el análisis exploratorio de datos y la investigación de características, definimos qué operaciones se aplicarán y sobre qué features.

In [47]:
# Categorical variables whose missing values are imputed with the most frequent category
CATEGORICAL_VARS_WITH_NA_FREQUENT = ['BsmtQual', 'BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'Functional',
                                     'MSZoning', 'Exterior1st', 'KitchenQual']
# NOTE(review): 'BsmtFullBath' and 'GarageCars' were previously commented out of the list above.

# Numerical variables whose missing values are imputed with the mean
NUMERICAL_VARS_WITH_NA = ['LotFrontage', 'GarageArea']

# Categorical variables whose missing values are imputed with an explicit "Missing" label
CATEGORICAL_VARS_WITH_NA_MISSING = ['FireplaceQu']

# Temporal variables (not referenced by the pipeline built in this notebook)
TEMPORAL_VARS = ['YearRemodAdd']

# Reference year column (not referenced by the pipeline built in this notebook)
REF_VAR = "YrSold"

# Skewed variables intended for binarization (not referenced by the pipeline built
# in this notebook). The column name was misspelled as 'ScreenProch'.
BINARIZE_VARS = ['ScreenPorch']

# Variables to drop (not referenced by the pipeline built in this notebook)
DROP_FEATURES = ["YrSold"]

# Variables that receive a logarithmic transformation
NUMERICAL_LOG_VARS = ["LotFrontage", "1stFlrSF", "GrLivArea"]

# Quality variables encoded with QUAL_MAPPINGS
QUAL_VARS = ['ExterQual', 'BsmtQual', 'HeatingQC', 'KitchenQual', 'FireplaceQu']

# Variables that each need their own mapping
EXPOSURE_VARS = ['BsmtExposure']
FINISH_VARS = ['BsmtFinType1']
GARAGE_VARS = ['GarageFinish']
FENCE_VARS = ['Fence']  # not referenced by the pipeline built in this notebook

# Non-ordinal categorical variables encoded by category count
CATEGORICAL_VARS = ['MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood', 'RoofStyle',
                    'Exterior1st', 'Foundation', 'CentralAir', 'Functional', 'PavedDrive', 'SaleCondition']

# Label -> number mapping for the quality variables
QUAL_MAPPINGS = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing': 0, 'NA': 0}

# Label -> number mapping for EXPOSURE_VARS (no 'Missing'/'NA' entry; BsmtExposure
# is listed in CATEGORICAL_VARS_WITH_NA_FREQUENT)
EXPOSURE_MAPPINGS = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4}

# Label -> number mapping for FINISH_VARS
FINISH_MAPPINGS = {'Missing': 0, 'NA': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}

# Label -> number mapping for GARAGE_VARS
GARAGE_MAPPINGS = {'Missing': 0, 'NA': 0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

# Features used for training
FEATURES = ['MSSubClass', 'MSZoning', 'LotFrontage', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
            'RoofStyle', 'Exterior1st', 'ExterQual', 'Foundation', 'BsmtQual', 'BsmtExposure', 'BsmtFinType1',
            'HeatingQC', 'CentralAir', '1stFlrSF', 'GrLivArea', 'BsmtFullBath', 'KitchenQual', 'Functional',
            'FireplaceQu', 'GarageFinish', 'GarageCars', 'GarageArea', 'PavedDrive', 'WoodDeckSF', 'SaleCondition']


# 4. Selección de features de interés

In [48]:
# Manual column subsetting is left disabled in this cell.
# NOTE(review): the pipeline's 'drop_features' step appears to take over this selection;
# confirm before deleting these lines.
#X_train = X_train[FEATURES]
#X_test = X_test[FEATURES]

In [49]:
# Every predictor column available after the train/test split.
ALL_FEATURES = set(X_train.columns)

# Fail fast when a selected feature is absent from the data, instead of failing
# later inside the pipeline.
missing_features = [col for col in FEATURES if col not in ALL_FEATURES]
assert not missing_features, f"FEATURES not found in X_train: {missing_features}"

# Columns outside FEATURES, to be removed by the pipeline. The list is built in
# the DataFrame's column order so it is identical on every run; converting a set
# of strings to a list does not guarantee a stable order across sessions.
FEATURES_TO_DROP = [col for col in X_train.columns if col not in FEATURES]

# 5. Construcción del Pipeline

In [50]:
# Preprocessing pipeline; the steps run in the order listed.
house_prices_data_pre_proc = Pipeline(steps=[
    # 0. Feature selection: drop every column that is not a model feature
    ('drop_features', DropFeatures(features_to_drop=FEATURES_TO_DROP)),

    # 1. Categorical imputation with an explicit missing label
    (
        'cat_missing_imputation',
        CategoricalImputer(
            imputation_method='missing',
            variables=CATEGORICAL_VARS_WITH_NA_MISSING,
        ),
    ),

    # 2. Categorical imputation with the most frequent category
    (
        'cat_missing_freq_imputation',
        CategoricalImputer(
            imputation_method='frequent',
            variables=CATEGORICAL_VARS_WITH_NA_FREQUENT,
        ),
    ),

    # 3. Numerical imputation with the mean
    (
        'mean_imputation',
        MeanMedianImputer(
            imputation_method='mean',
            variables=NUMERICAL_VARS_WITH_NA,
        ),
    ),

    # 4. Encoding of categorical variables through explicit mappings
    ('quality_mapper', operators.Mapper(variables=QUAL_VARS, mappins=QUAL_MAPPINGS)),
    ('exposure_mapper', operators.Mapper(variables=EXPOSURE_VARS, mappins=EXPOSURE_MAPPINGS)),
    ('garage_mapper', operators.Mapper(variables=GARAGE_VARS, mappins=GARAGE_MAPPINGS)),
    ('Finish_mapper', operators.Mapper(variables=FINISH_VARS, mappins=FINISH_MAPPINGS)),

    # 5. Count encoding of the remaining categorical variables
    (
        'cat_freq_encode',
        CountFrequencyEncoder(
            encoding_method='count',
            variables=CATEGORICAL_VARS,
        ),
    ),

    # 6. Log transformation of continuous variables
    ('continues_log_transform', LogTransformer(variables=NUMERICAL_LOG_VARS)),

    # 7. Scaling of all variables
    ('Variable_scaler', MinMaxScaler()),
])


# 6. Aplicamos Pipeline para Construcción de Features

In [51]:
# Fit every pipeline step on the training split only.
house_prices_data_pre_proc.fit(X_train, y_train)

0,1,2
,steps,"[('drop_features', ...), ('cat_missing_imputation', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,features_to_drop,"['SaleType', 'LotArea', ...]"

0,1,2
,imputation_method,'missing'
,fill_value,'Missing'
,variables,['FireplaceQu']
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'frequent'
,fill_value,'Missing'
,variables,"['BsmtQual', 'BsmtExposure', ...]"
,return_object,False
,ignore_format,False

0,1,2
,imputation_method,'mean'
,variables,"['LotFrontage', 'GarageArea']"

0,1,2
,variables,"['ExterQual', 'BsmtQual', ...]"
,mappins,"{'Ex': 5, 'Fa': 2, 'Gd': 4, 'Missing': 0, ...}"

0,1,2
,variables,['BsmtExposure']
,mappins,"{'Av': 3, 'Gd': 4, 'Mn': 2, 'No': 1}"

0,1,2
,variables,['GarageFinish']
,mappins,"{'Fin': 3, 'Missing': 0, 'NA': 0, 'RFn': 2, ...}"

0,1,2
,variables,['BsmtFinType1']
,mappins,"{'ALQ': 5, 'BLQ': 4, 'GLQ': 6, 'LwQ': 2, ...}"

0,1,2
,encoding_method,'count'
,variables,"['MSZoning', 'LotShape', ...]"
,missing_values,'raise'
,ignore_format,False
,unseen,'ignore'

0,1,2
,variables,"['LotFrontage', '1stFlrSF', ...]"
,base,'e'

0,1,2
,feature_range,"(0, ...)"
,copy,True
,clip,False


In [52]:
def save_procesed_data(X, y, str_df_name):
    """Transform a data split with the fitted pipeline and write it to CSV.

    Relies on the notebook-level names ``house_prices_data_pre_proc`` (already
    fitted) and ``FEATURES_TO_DROP``.

    Parameters
    ----------
    X : pandas.DataFrame
        Predictors with the same columns the pipeline was fitted on.
    y : pandas.Series
        Target named 'SalePrice', with one value per row of ``X``.
    str_df_name : str
        Suffix of the output file, written to
        ``../data/interim/proc_<str_df_name>.csv``.
    """
    X_transformed = house_prices_data_pre_proc.transform(X)

    # Label the transformed array with the columns that survive the
    # 'drop_features' step, in the input's own column order. Labelling with
    # FEATURES directly is only right if that list matches the data's order.
    # NOTE(review): assumes the later steps keep column order; operators.Mapper
    # is defined outside this notebook, so confirm.
    dropped = set(FEATURES_TO_DROP)
    feature_names = [col for col in X.columns if col not in dropped]
    df_X_transformed = pd.DataFrame(data=X_transformed, columns=feature_names)

    # Reset the target's index so it lines up with the fresh 0..n-1 index of the
    # transformed frame.
    target = y.reset_index()['SalePrice']

    df_transformed = pd.concat([df_X_transformed, target], axis=1)
    df_transformed.to_csv(f"../data/interim/proc_{str_df_name}.csv", index=False)

## Procesamos data de entrenamiento

In [53]:
# Transform the training split and write it to ../data/interim/proc_data_train.csv.
save_procesed_data(X_train, y_train, str_df_name="data_train")

## Procesamos data de validación

In [54]:
# Transform the held-out split and write it to ../data/interim/proc_data_test.csv.
save_procesed_data(X_test, y_test, str_df_name="data_test")



# 7. Exportamos el Pipeline de preprocesamiento ya configurado

In [55]:
# Serialize the fitted preprocessing pipeline to disk with joblib.
joblib.dump(house_prices_data_pre_proc, '../models/house_prices_data_pre_proc_pipeline.pkl')

['../models/house_prices_data_pre_proc_pipeline.pkl']