In [1]:
import os

In [2]:
# Cambiamos el directorio al root del proyecto
os.chdir('../../../')

In [3]:
print("Directorio: {0}".format(os.getcwd()))

Directorio: /Users/mohamed.rios/Projects/desafio_peya


In [9]:
import pandas as pd
import numpy as np
import pickle
import yaml
from yaml.loader import SafeLoader
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
import warnings
from utils.preprocess import Pipeline 
import utils.experiment_1_features as e1
warnings.filterwarnings('ignore')

# Leemos los parámetros del preprocess

In [5]:
with open('models/stg/experiment_1/params.yaml') as f:
    params = yaml.load(f, Loader=SafeLoader)

# Lectura del raw dataset

In [6]:
df_raw = pd.read_csv(params.get('raw_data_path'))

In [7]:
df_raw.head(3)

Unnamed: 0,ano_de_construccion,banos,disposicion,dormitorios,estado,garajes,gastos_comunes,m2_de_la_terraza,m2_del_terreno,m2_edificados,price,tipo_propiedad,vivienda_social,zona
0,1991.0,1.0,contrafrente,1.0,excelente estado,0.0,5000.0,0.0,45.0,45.0,150000.0,apartamentos,0.0,punta carretas
1,2007.0,3.0,,3.0,excelente estado,1.0,0.0,0.0,155.0,125.0,329000.0,casas,0.0,pocitos nuevo
2,2012.0,3.0,al frente,4.0,excelente estado,2.0,16000.0,0.0,150.0,150.0,650000.0,apartamentos,0.0,punta carretas


# Preprocess dataset

In [10]:
df_processed = e1.delete_missing_values(df_raw, params)
x = df_processed.drop(axis=1, columns=params.get('target'))
y = df_processed[params.get('target')]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

In [11]:
preprocess_f_list = [e1.validate_inputs, e1.create_disposicion_category,
                    e1.create_zona_feature, e1.drop_metadata_columns]
pipe = Pipeline()
pipe.set_f_list(preprocess_f_list)
x_train = pipe.apply(df=x_train, params=params)
x_test = pipe.apply(df=x_test, params=params)

# Scikit pipeline

In [12]:
ohe = OneHotEncoder()
oe = OrdinalEncoder()
scaler = StandardScaler()
normalizer = Normalizer()
column_transformer = make_column_transformer(
                (ohe, params.get('non_ordinal_categorical_features')),
                (oe, params.get('ordinal_categorical_features')),
                (scaler, params.get('numerical_features')),
                (normalizer, params.get('numerical_features'))
)

In [13]:
pipeline = make_pipeline(column_transformer)

In [14]:
pipeline.fit(x_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['vivienda_social',
                                                   'disposicion_categoria',
                                                   'disposicion_is_missing',
                                                   'tipo_propiedad', 'estado',
                                                   'zona_categoria']),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(), ['banos']),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['m2_edificados',
                                                   'gastos_comunes']),
                                

In [15]:
x_train_t = pipeline.transform(x_train)
x_test_t = pipeline.transform(x_test)

# Guardo preprocess pipelines

In [16]:
with open(params.get('pipeline_path') + '/scikit_pipeline','wb') as outfile:
        pickle.dump(pipeline, outfile)
        
        
with open(params.get('pipeline_path') + '/pipeline','wb') as outfile:
    pickle.dump(pipe, outfile)

# Escribo datasets procesados

In [17]:

pd.DataFrame(x_train_t).to_csv(params['x_train_path'], index=False)
pd.DataFrame(x_test_t).to_csv(params['x_test_path'], index=False)
pd.DataFrame(y_train).to_csv(params['y_train_path'], index=False)
pd.DataFrame(y_test).to_csv(params['y_test_path'], index=False)