## Selección de atributos con wrappers

In [1]:
# https://www.analyticsvidhya.com/blog/2020/10/a-comprehensive-guide-to-feature-selection-using-wrapper-methods-in-python/

Existen tres tipos de métodos para realizar la reducción (selección) de características:
- Filtros
- Wrappers o envolturas
- Métodos integrados (embedded)


!pip install mlxtend

In [2]:
# Importamos las librerías necesarias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Record the TensorFlow version used for this run (reproducibility aid).
print(tf.__version__)

2.8.0


In [4]:
# Load the final dataset from the Excel workbook (path is relative to the
# notebook's working directory).
df = pd.read_excel(io="../Archivos Generados/DatasetFinal.xlsx")

In [5]:
# Preview the first rows (head() defaults to five).
df.head()

Unnamed: 0,ID_LOTE,TIPO_SIEMBRA,SEM_TRATADAS,MATERIAL_GENETICO,CULT_ANT,DRENAJE,METODO_COSECHA,ALMACENAMIENTO_FINCA,DIAS_EN_EMERGER,DIAS_EN_EMERGER_A_FLORECER,...,Temp_Max_Avg_Mad,Temp_Min_Avg_Mad,Temp_Avg_Mad,Diurnal_Range_Avg_Mad,Sol_Ener_Accu_Mad,Temp_Max_34_Freq_Mad,Rain_Accu_Mad,Rain_10_Freq_Mad,Rhum_Avg_Mad,RDT_AJUSTADO
0,40,Mecanizado,NO,PIONEER 30F32,Algodon,SI,Manual,NO,5,63,...,32.05,23.6,27.83,8.45,13197.57,0.05,279.3,0.23,82.41,4767.44
1,43,Mecanizado,SI,DK 234,Maiz,SI,Manual,NO,5,64,...,32.37,23.49,27.93,8.89,12436.49,0.03,221.2,0.26,81.86,4651.16
2,44,Mecanizado,NO,PIONEER 30F32,Algodon,SI,Manual,NO,5,59,...,32.17,23.53,27.85,8.63,11267.17,0.03,226.0,0.27,82.61,5180.23
3,45,Mecanizado,NO,Otro,Algodon,SI,Manual,NO,5,64,...,32.19,23.54,27.86,8.65,11066.68,0.03,223.2,0.29,81.84,4897.67
4,46,Mecanizado,NO,Otro,Algodon,SI,Manual,NO,5,63,...,32.19,23.54,27.86,8.65,11066.68,0.03,223.2,0.29,81.84,5302.33


In [6]:
# Print the frame summary: index range, column count, dtype counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 799 entries, 0 to 798
Columns: 116 entries, ID_LOTE to RDT_AJUSTADO
dtypes: float64(55), int64(31), object(30)
memory usage: 724.2+ KB


In [7]:
# Count missing values per column of the dataset.
df.isna().sum(axis=0)

ID_LOTE                 0
TIPO_SIEMBRA            0
SEM_TRATADAS            0
MATERIAL_GENETICO       0
CULT_ANT                0
                       ..
Temp_Max_34_Freq_Mad    0
Rain_Accu_Mad           0
Rain_10_Freq_Mad        0
Rhum_Avg_Mad            0
RDT_AJUSTADO            0
Length: 116, dtype: int64

In [8]:
# Collect the names of the categorical columns (object or category dtype).
cat_features = df.select_dtypes(
    include=["object", "category"]
).columns
cat_features

Index(['TIPO_SIEMBRA', 'SEM_TRATADAS', 'MATERIAL_GENETICO', 'CULT_ANT',
       'DRENAJE', 'METODO_COSECHA', 'ALMACENAMIENTO_FINCA',
       'TERRENO_CIRCUN_RASTA', 'POSICION_PERFIL_RASTA', 'PEDREG_PERFIL_ROCAS',
       'CAP_ENDURE_RASTA', 'MOTEADOS_RASTA', 'MOTEADOS_MAS70cm._RASTA',
       'ESTRUCTURA_RASTA', 'OBSERVA_EROSION_RASTA', 'OBSERVA_MOHO_RASTA',
       'OBSERVA_COSTRAS_DURAS_RASTA', 'SITIO_EXPUESTO_SOL_RASTA',
       'OBSERVA_COSTRAS_BLANCAS_RASTA', 'OBSERVA_COSTAS_NEGRAS_RASTA',
       'REGION_SECA_ARIDA_RASTA', 'OBSERVA_RAICES_VIVAS_RASTA',
       'OBSERVA_PLANTAS_PEQUENAS_RASTA', 'OBSERVA_HOJARASCA_MO_RASTA',
       'SUELO_NEGRO_BLANDO_RASTA', 'CUCHILLO_PRIMER_HTE_RASTA',
       'CERCA_RIOS_QUEBRADAS_RASTA', 'RECUBRIMIENTO_VEGETAL__SUELO_RASTA',
       'd.interno', 'drenaje_externo'],
      dtype='object')

In [9]:
# Dummy-encode the selected categorical columns. drop_first removes the first
# level of each variable; dummy_na adds a "<column>_nan" indicator per variable.
# NOTE(review): the null counts displayed earlier are 0 for every visible
# column, so the "_nan" indicators may be constant zeros -- confirm whether
# dummy_na is really needed.
df_cat = pd.get_dummies(
    data=df.loc[:, cat_features],
    dummy_na=True,
    drop_first=True,
)
df_cat.head(5)

Unnamed: 0,TIPO_SIEMBRA_Mecanizado,TIPO_SIEMBRA_nan,SEM_TRATADAS_SI,SEM_TRATADAS_nan,MATERIAL_GENETICO_ADV 9339 (Syngenta),MATERIAL_GENETICO_CORPOICA V 114,MATERIAL_GENETICO_Cerato (Syngenta),MATERIAL_GENETICO_DK 1040,MATERIAL_GENETICO_DK 1596,MATERIAL_GENETICO_DK 234,...,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_ESPACIADO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_MUY BUENO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_REGULAR,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_SIN COBERTURA,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_nan,d.interno_EXCESIVO,d.interno_LENTO A MUY LENTO,d.interno_nan,drenaje_externo_NINGUNO,drenaje_externo_nan
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
1,1,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,1,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0


In [10]:
# Remove the original categorical columns from the main frame; their
# dummy-encoded versions are held in df_cat.
# Rebinding with errors="ignore" (instead of inplace=True) keeps this cell
# idempotent: running it a second time is a no-op rather than a KeyError.
df = df.drop(columns=cat_features, errors="ignore")

In [11]:
# Append the dummy-encoded columns to the remaining columns, side by side,
# to build the frame used for the wrapper-based feature selection.
df_final = pd.concat(objs=[df, df_cat], axis="columns")

In [12]:
# Display the combined frame (remaining original columns plus dummy columns) as a sanity check.
df_final

Unnamed: 0,ID_LOTE,DIAS_EN_EMERGER,DIAS_EN_EMERGER_A_FLORECER,DIAS_EN_FLORECER_A_COSECHAR,POBLACION_20DIAS_AJT,ALTURA_LOT,ContEnfQui_Emer_Flor,ContEnfQui_Flor_Cose,ContMalMec_Siem_Emer,ContMalMec_Emer_Flor,...,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_ESPACIADO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_MUY BUENO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_REGULAR,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_SIN COBERTURA,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_nan,d.interno_EXCESIVO,d.interno_LENTO A MUY LENTO,d.interno_nan,drenaje_externo_NINGUNO,drenaje_externo_nan
0,40,5,63,68,60000,13,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
1,43,5,64,63,60000,15,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,44,5,59,66,60000,12,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,45,5,64,59,60000,12,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,46,5,63,60,60000,16,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,4378,7,47,84,70000,18,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
795,4379,5,47,81,62000,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,4380,6,49,79,61000,17,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
797,4382,7,48,95,65000,17,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [13]:
# Split the combined frame into predictors (X) and target (Y, the
# RDT_AJUSTADO column).
# NOTE(review): ID_LOTE stays in X; it looks like a lot identifier -- confirm
# that it should be offered to the selector as a candidate feature.
X = df_final.drop(columns=["RDT_AJUSTADO"])
Y = df_final["RDT_AJUSTADO"]

In [14]:
# Display the predictor matrix to confirm the target column was removed.
X

Unnamed: 0,ID_LOTE,DIAS_EN_EMERGER,DIAS_EN_EMERGER_A_FLORECER,DIAS_EN_FLORECER_A_COSECHAR,POBLACION_20DIAS_AJT,ALTURA_LOT,ContEnfQui_Emer_Flor,ContEnfQui_Flor_Cose,ContMalMec_Siem_Emer,ContMalMec_Emer_Flor,...,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_ESPACIADO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_MUY BUENO,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_REGULAR,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_SIN COBERTURA,RECUBRIMIENTO_VEGETAL__SUELO_RASTA_nan,d.interno_EXCESIVO,d.interno_LENTO A MUY LENTO,d.interno_nan,drenaje_externo_NINGUNO,drenaje_externo_nan
0,40,5,63,68,60000,13,0,0,0,0,...,0,1,0,0,0,0,1,0,1,0
1,43,5,64,63,60000,15,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,44,5,59,66,60000,12,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
3,45,5,64,59,60000,12,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
4,46,5,63,60,60000,16,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
794,4378,7,47,84,70000,18,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
795,4379,5,47,81,62000,16,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
796,4380,6,49,79,61000,17,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
797,4382,7,48,95,65000,17,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


## Wrappers - Selección de características

In [15]:
# Wrapper-based feature selection: sequential forward selection (SFS) with a
# linear-regression estimator, scored by 10-fold cross-validated R^2.
# The features chosen in every run are written to one sheet of an Excel
# workbook for later analysis.
from pathlib import Path

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

excel = "resultados_fs.xlsx"  # workbook that collects the selected features

k_features = [10]        # subset sizes to evaluate
contador_features = 0    # running count of completed SFS fits
iter_number = 3          # how many times the whole sweep is repeated

# NOTE(review): LinearRegression with an integer `cv` looks deterministic, so
# the repetitions presumably select the same features every time -- confirm
# whether the repeats are intended (e.g. with shuffled folds).
for item in range(iter_number):
    for i, n_features in enumerate(k_features):
        # Sheet name encodes the repetition and the position in k_features.
        # The spelling is kept as-is so previously written sheets still match.
        nomenclaturaHoja = f"k_featutes_{item}_{i}"

        sfs = SFS(
            LinearRegression(),
            k_features=n_features,
            forward=True,
            floating=False,
            scoring="r2",
            cv=10,
        )

        # Fit the selector on the full predictor matrix and target.
        sfs.fit(X, Y)

        # Names of the selected features, stored as a one-column frame.
        tw_best_atri = sfs.k_feature_names_
        df_features = pd.DataFrame({"Features": list(tw_best_atri)})

        # Append mode requires the workbook to exist, so the first write
        # creates it instead. `if_sheet_exists` is only accepted in append
        # mode; "replace" rewrites the sheet from scratch so rows from an
        # earlier, longer selection cannot linger below the new ones.
        if Path(excel).is_file():
            writer_options = {"mode": "a", "if_sheet_exists": "replace"}
        else:
            writer_options = {"mode": "w"}

        with pd.ExcelWriter(excel, engine="openpyxl", **writer_options) as writer:
            df_features.to_excel(writer, index=False, sheet_name=nomenclaturaHoja)

        contador_features += 1
        print("Ejecucion_Numero_: ", contador_features)
        

Ejecucion_Numero_:  1
Ejecucion_Numero_:  2
Ejecucion_Numero_:  3
